From d0eb2d867cf3dbba79ef4c678797e6b58638392c Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 27 Nov 2020 08:05:13 -0800 Subject: [PATCH 0001/1379] eCryptfs: add a semicolon Function like macros should have a semicolon. Signed-off-by: Tom Rix [tyhicks: Remove the trailing semicolin from the macro's definition, as suggested by Joe Perches] Signed-off-by: Tyler Hicks --- fs/ecryptfs/ecryptfs_kernel.h | 2 +- fs/ecryptfs/keystore.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index e6ac78c62ca4..4a62a7dcc4f5 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -528,7 +528,7 @@ ecryptfs_dentry_to_lower_path(struct dentry *dentry) } #define ecryptfs_printk(type, fmt, arg...) \ - __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); + __ecryptfs_printk(type "%s: " fmt, __func__, ## arg) __printf(1, 2) void __ecryptfs_printk(const char *fmt, ...); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index f6a17d259db7..2abd219cfeec 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1172,7 +1172,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code); if (rc) { ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n", - cipher_code) + cipher_code); goto out; } crypt_stat->flags |= ECRYPTFS_KEY_VALID; From 902af369942f8d0a6bdaa8466ff0d84d3d9b03a8 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Thu, 24 Dec 2020 21:22:33 +0800 Subject: [PATCH 0002/1379] ecryptfs: use DEFINE_MUTEX() for mutex lock mutex lock can be initialized automatically with DEFINE_MUTEX() rather than explicitly calling mutex_init(). Signed-off-by: Zheng Yongjun Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 0681540c48d9..be906b9bbb11 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1590,11 +1590,10 @@ out: struct kmem_cache *ecryptfs_key_tfm_cache; static struct list_head key_tfm_list; -struct mutex key_tfm_list_mutex; +DEFINE_MUTEX(key_tfm_list_mutex); int __init ecryptfs_init_crypto(void) { - mutex_init(&key_tfm_list_mutex); INIT_LIST_HEAD(&key_tfm_list); return 0; } From a63d0120a2dd89eabf24b415b27208e190e989b0 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Sun, 7 Mar 2021 21:10:57 -0800 Subject: [PATCH 0003/1379] Input: exc3000 - split MT event handling from IRQ handler Split out the multitouch event handling into its own function to allow other events to be handled in the IRQ handler without disturbing the MT handling. Now that things are separated a bit more, stop treating vendor data requests special by cehcking for a locked mutex, but just look at the event ID to figure out if the message is a MT report or a vendor data query reply. Signed-off-by: Lucas Stach Link: https://lore.kernel.org/r/20210125182527.1225245-2-l.stach@pengutronix.de Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/exc3000.c | 128 +++++++++++++++------------- 1 file changed, 68 insertions(+), 60 deletions(-) diff --git a/drivers/input/touchscreen/exc3000.c b/drivers/input/touchscreen/exc3000.c index a6597f026980..398b56a3900a 100644 --- a/drivers/input/touchscreen/exc3000.c +++ b/drivers/input/touchscreen/exc3000.c @@ -30,6 +30,7 @@ #define EXC3000_LEN_MODEL_NAME 16 #define EXC3000_LEN_FW_VERSION 16 +#define EXC3000_VENDOR_EVENT 0x03 #define EXC3000_MT1_EVENT 0x06 #define EXC3000_MT2_EVENT 0x18 @@ -105,15 +106,16 @@ static void exc3000_timer(struct timer_list *t) input_sync(data->input); } +static inline void exc3000_schedule_timer(struct exc3000_data *data) +{ + mod_timer(&data->timer, jiffies + msecs_to_jiffies(EXC3000_TIMEOUT_MS)); +} + static int exc3000_read_frame(struct exc3000_data *data, u8 *buf) { struct i2c_client *client = data->client; - u8 expected_event = EXC3000_MT1_EVENT; int ret; - if (data->info->max_xy == SZ_16K - 1) - expected_event = EXC3000_MT2_EVENT; - ret = i2c_master_send(client, "'", 2); if (ret < 0) return ret; @@ -131,47 +133,62 @@ static int exc3000_read_frame(struct exc3000_data *data, u8 *buf) if (get_unaligned_le16(buf) != EXC3000_LEN_FRAME) return -EINVAL; - if (buf[2] != expected_event) - return -EINVAL; - return 0; } -static int exc3000_read_data(struct exc3000_data *data, - u8 *buf, int *n_slots) +static int exc3000_handle_mt_event(struct exc3000_data *data) { - int error; + struct input_dev *input = data->input; + int ret, total_slots; + u8 *buf = data->buf; - error = exc3000_read_frame(data, buf); - if (error) - return error; - - *n_slots = buf[3]; - if (!*n_slots || *n_slots > EXC3000_NUM_SLOTS) - return -EINVAL; - - if (*n_slots > EXC3000_SLOTS_PER_FRAME) { - /* Read 2nd frame to get the rest of the contacts. */ - error = exc3000_read_frame(data, buf + EXC3000_LEN_FRAME); - if (error) - return error; - - /* 2nd chunk must have number of contacts set to 0. */ - if (buf[EXC3000_LEN_FRAME + 3] != 0) - return -EINVAL; + total_slots = buf[3]; + if (!total_slots || total_slots > EXC3000_NUM_SLOTS) { + ret = -EINVAL; + goto out_fail; } + if (total_slots > EXC3000_SLOTS_PER_FRAME) { + /* Read 2nd frame to get the rest of the contacts. */ + ret = exc3000_read_frame(data, buf + EXC3000_LEN_FRAME); + if (ret) + goto out_fail; + + /* 2nd chunk must have number of contacts set to 0. */ + if (buf[EXC3000_LEN_FRAME + 3] != 0) { + ret = -EINVAL; + goto out_fail; + } + } + + /* + * We read full state successfully, no contacts will be "stuck". + */ + del_timer_sync(&data->timer); + + while (total_slots > 0) { + int slots = min(total_slots, EXC3000_SLOTS_PER_FRAME); + + exc3000_report_slots(input, &data->prop, buf + 4, slots); + total_slots -= slots; + buf += EXC3000_LEN_FRAME; + } + + input_mt_sync_frame(input); + input_sync(input); + return 0; + +out_fail: + /* Schedule a timer to release "stuck" contacts */ + exc3000_schedule_timer(data); + + return ret; } static int exc3000_query_interrupt(struct exc3000_data *data) { u8 *buf = data->buf; - int error; - - error = i2c_master_recv(data->client, buf, EXC3000_LEN_FRAME); - if (error < 0) - return error; if (buf[0] != 'B') return -EPROTO; @@ -189,40 +206,31 @@ static int exc3000_query_interrupt(struct exc3000_data *data) static irqreturn_t exc3000_interrupt(int irq, void *dev_id) { struct exc3000_data *data = dev_id; - struct input_dev *input = data->input; u8 *buf = data->buf; - int slots, total_slots; - int error; + int ret; - if (mutex_is_locked(&data->query_lock)) { + ret = exc3000_read_frame(data, buf); + if (ret) { + /* Schedule a timer to release "stuck" contacts */ + exc3000_schedule_timer(data); + goto out; + } + + switch (buf[2]) { + case EXC3000_VENDOR_EVENT: data->query_result = exc3000_query_interrupt(data); complete(&data->wait_event); - goto out; + break; + + case EXC3000_MT1_EVENT: + case EXC3000_MT2_EVENT: + exc3000_handle_mt_event(data); + break; + + default: + break; } - error = exc3000_read_data(data, buf, &total_slots); - if (error) { - /* Schedule a timer to release "stuck" contacts */ - mod_timer(&data->timer, - jiffies + msecs_to_jiffies(EXC3000_TIMEOUT_MS)); - goto out; - } - - /* - * We read full state successfully, no contacts will be "stuck". - */ - del_timer_sync(&data->timer); - - while (total_slots > 0) { - slots = min(total_slots, EXC3000_SLOTS_PER_FRAME); - exc3000_report_slots(input, &data->prop, buf + 4, slots); - total_slots -= slots; - buf += EXC3000_LEN_FRAME; - } - - input_mt_sync_frame(input); - input_sync(input); - out: return IRQ_HANDLED; } From 102feb1ddfd00d0c6115ae3372058012dc16418c Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Sun, 7 Mar 2021 21:35:54 -0800 Subject: [PATCH 0004/1379] Input: exc3000 - factor out vendor data request Factor out the vendor data i2c request handling to make it reusable for other functions. Also don't cache the model and firmware version string in the device private data as we never use the cached version, but always read from the device. Signed-off-by: Lucas Stach Link: https://lore.kernel.org/r/20210125182527.1225245-3-l.stach@pengutronix.de Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/exc3000.c | 136 +++++++++++++--------------- 1 file changed, 61 insertions(+), 75 deletions(-) diff --git a/drivers/input/touchscreen/exc3000.c b/drivers/input/touchscreen/exc3000.c index 398b56a3900a..e55fb79c7fde 100644 --- a/drivers/input/touchscreen/exc3000.c +++ b/drivers/input/touchscreen/exc3000.c @@ -25,6 +25,7 @@ #define EXC3000_NUM_SLOTS 10 #define EXC3000_SLOTS_PER_FRAME 5 #define EXC3000_LEN_FRAME 66 +#define EXC3000_LEN_VENDOR_REQUEST 68 #define EXC3000_LEN_POINT 10 #define EXC3000_LEN_MODEL_NAME 16 @@ -77,9 +78,6 @@ struct exc3000_data { u8 buf[2 * EXC3000_LEN_FRAME]; struct completion wait_event; struct mutex query_lock; - int query_result; - char model[EXC3000_LEN_MODEL_NAME]; - char fw_version[EXC3000_LEN_FW_VERSION]; }; static void exc3000_report_slots(struct input_dev *input, @@ -186,23 +184,6 @@ out_fail: return ret; } -static int exc3000_query_interrupt(struct exc3000_data *data) -{ - u8 *buf = data->buf; - - if (buf[0] != 'B') - return -EPROTO; - - if (buf[4] == 'E') - strlcpy(data->model, buf + 5, sizeof(data->model)); - else if (buf[4] == 'D') - strlcpy(data->fw_version, buf + 5, sizeof(data->fw_version)); - else - return -EPROTO; - - return 0; -} - static irqreturn_t exc3000_interrupt(int irq, void *dev_id) { struct exc3000_data *data = dev_id; @@ -218,7 +199,6 @@ static irqreturn_t exc3000_interrupt(int irq, void *dev_id) switch (buf[2]) { case EXC3000_VENDOR_EVENT: - data->query_result = exc3000_query_interrupt(data); complete(&data->wait_event); break; @@ -235,73 +215,75 @@ out: return IRQ_HANDLED; } +static int exc3000_vendor_data_request(struct exc3000_data *data, u8 *request, + u8 request_len, u8 *response, int timeout) +{ + u8 buf[EXC3000_LEN_VENDOR_REQUEST] = { 0x67, 0x00, 0x42, 0x00, 0x03 }; + int ret; + + mutex_lock(&data->query_lock); + + reinit_completion(&data->wait_event); + + buf[5] = request_len; + memcpy(&buf[6], request, request_len); + + ret = i2c_master_send(data->client, buf, EXC3000_LEN_VENDOR_REQUEST); + if (ret < 0) + goto out_unlock; + + if (response) { + ret = wait_for_completion_timeout(&data->wait_event, + timeout * HZ); + if (ret <= 0) { + ret = -ETIMEDOUT; + goto out_unlock; + } + + if (data->buf[3] >= EXC3000_LEN_FRAME) { + ret = -ENOSPC; + goto out_unlock; + } + + memcpy(response, &data->buf[4], data->buf[3]); + ret = data->buf[3]; + } + +out_unlock: + mutex_unlock(&data->query_lock); + + return ret; +} + static ssize_t fw_version_show(struct device *dev, struct device_attribute *attr, char *buf) { struct i2c_client *client = to_i2c_client(dev); struct exc3000_data *data = i2c_get_clientdata(client); - static const u8 request[68] = { - 0x67, 0x00, 0x42, 0x00, 0x03, 0x01, 'D', 0x00 - }; - int error; + u8 response[EXC3000_LEN_FRAME]; + int ret; - mutex_lock(&data->query_lock); + ret = exc3000_vendor_data_request(data, (u8[]){'D'}, 1, response, 1); + if (ret < 0) + return ret; - data->query_result = -ETIMEDOUT; - reinit_completion(&data->wait_event); - - error = i2c_master_send(client, request, sizeof(request)); - if (error < 0) { - mutex_unlock(&data->query_lock); - return error; - } - - wait_for_completion_interruptible_timeout(&data->wait_event, 1 * HZ); - mutex_unlock(&data->query_lock); - - if (data->query_result < 0) - return data->query_result; - - return sprintf(buf, "%s\n", data->fw_version); + return sprintf(buf, "%s\n", &response[1]); } static DEVICE_ATTR_RO(fw_version); -static ssize_t exc3000_get_model(struct exc3000_data *data) -{ - static const u8 request[68] = { - 0x67, 0x00, 0x42, 0x00, 0x03, 0x01, 'E', 0x00 - }; - struct i2c_client *client = data->client; - int error; - - mutex_lock(&data->query_lock); - data->query_result = -ETIMEDOUT; - reinit_completion(&data->wait_event); - - error = i2c_master_send(client, request, sizeof(request)); - if (error < 0) { - mutex_unlock(&data->query_lock); - return error; - } - - wait_for_completion_interruptible_timeout(&data->wait_event, 1 * HZ); - mutex_unlock(&data->query_lock); - - return data->query_result; -} - static ssize_t model_show(struct device *dev, struct device_attribute *attr, char *buf) { struct i2c_client *client = to_i2c_client(dev); struct exc3000_data *data = i2c_get_clientdata(client); - int error; + u8 response[EXC3000_LEN_FRAME]; + int ret; - error = exc3000_get_model(data); - if (error < 0) - return error; + ret = exc3000_vendor_data_request(data, (u8[]){'E'}, 1, response, 1); + if (ret < 0) + return ret; - return sprintf(buf, "%s\n", data->model); + return sprintf(buf, "%s\n", &response[1]); } static DEVICE_ATTR_RO(model); @@ -387,9 +369,15 @@ static int exc3000_probe(struct i2c_client *client) * or two touch events anyways). */ for (retry = 0; retry < 3; retry++) { - error = exc3000_get_model(data); - if (!error) + u8 response[EXC3000_LEN_FRAME]; + + error = exc3000_vendor_data_request(data, (u8[]){'E'}, 1, + response, 1); + if (error > 0) { + dev_dbg(&client->dev, "TS Model: %s", &response[1]); + error = 0; break; + } dev_warn(&client->dev, "Retry %d get EETI EXC3000 model: %d\n", retry + 1, error); } @@ -397,8 +385,6 @@ static int exc3000_probe(struct i2c_client *client) if (error) return error; - dev_dbg(&client->dev, "TS Model: %s", data->model); - i2c_set_clientdata(client, data); error = devm_device_add_group(&client->dev, &exc3000_attribute_group); From c929ac9eb85acf9217eb812369bbd4cf65a772e0 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Sun, 7 Mar 2021 21:40:01 -0800 Subject: [PATCH 0005/1379] Input: exc3000 - fix firmware version query for device in bootloader If the device is stuck in bootloader (maybe due to blank or corrupted application firmware) it won't answer a query for the firmware version. Fall back to returning the bootloader version in that case. Signed-off-by: Lucas Stach Link: https://lore.kernel.org/r/20210125182527.1225245-4-l.stach@pengutronix.de Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/exc3000.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/input/touchscreen/exc3000.c b/drivers/input/touchscreen/exc3000.c index e55fb79c7fde..66682a0c37d7 100644 --- a/drivers/input/touchscreen/exc3000.c +++ b/drivers/input/touchscreen/exc3000.c @@ -263,6 +263,20 @@ static ssize_t fw_version_show(struct device *dev, u8 response[EXC3000_LEN_FRAME]; int ret; + /* query bootloader info */ + ret = exc3000_vendor_data_request(data, + (u8[]){0x39, 0x02}, 2, response, 1); + if (ret < 0) + return ret; + + /* + * If the bootloader version is non-zero then the device is in + * bootloader mode and won't answer a query for the application FW + * version, so we just use the bootloader version info. + */ + if (response[2] || response[3]) + return sprintf(buf, "%d.%d\n", response[2], response[3]); + ret = exc3000_vendor_data_request(data, (u8[]){'D'}, 1, response, 1); if (ret < 0) return ret; From ad117c558e838f9fa93af265d8f9dd54e87e15b1 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Sun, 7 Mar 2021 21:40:37 -0800 Subject: [PATCH 0006/1379] Input: exc3000 - add type sysfs attribute Add a sysfs attribute to query the type of the touchscreen device. Signed-off-by: Lucas Stach Link: https://lore.kernel.org/r/20210125182527.1225245-5-l.stach@pengutronix.de Signed-off-by: Dmitry Torokhov --- .../ABI/testing/sysfs-driver-input-exc3000 | 9 +++++++++ drivers/input/touchscreen/exc3000.c | 17 +++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-driver-input-exc3000 b/Documentation/ABI/testing/sysfs-driver-input-exc3000 index cd7c578aef2c..704434b277b0 100644 --- a/Documentation/ABI/testing/sysfs-driver-input-exc3000 +++ b/Documentation/ABI/testing/sysfs-driver-input-exc3000 @@ -15,3 +15,12 @@ Description: Reports the model identification provided by the touchscreen, fo Access: Read Valid values: Represented as string + +What: /sys/bus/i2c/devices/xxx/type +Date: Jan 2021 +Contact: linux-input@vger.kernel.org +Description: Reports the type identification provided by the touchscreen, for example "PCAP82H80 Series" + + Access: Read + + Valid values: Represented as string diff --git a/drivers/input/touchscreen/exc3000.c b/drivers/input/touchscreen/exc3000.c index 66682a0c37d7..cbe0dd412912 100644 --- a/drivers/input/touchscreen/exc3000.c +++ b/drivers/input/touchscreen/exc3000.c @@ -301,9 +301,26 @@ static ssize_t model_show(struct device *dev, } static DEVICE_ATTR_RO(model); +static ssize_t type_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + struct exc3000_data *data = i2c_get_clientdata(client); + u8 response[EXC3000_LEN_FRAME]; + int ret; + + ret = exc3000_vendor_data_request(data, (u8[]){'F'}, 1, response, 1); + if (ret < 0) + return ret; + + return sprintf(buf, "%s\n", &response[1]); +} +static DEVICE_ATTR_RO(type); + static struct attribute *sysfs_attrs[] = { &dev_attr_fw_version.attr, &dev_attr_model.attr, + &dev_attr_type.attr, NULL }; From e41d237818598c0b17458b4d0416b091a7959e55 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Dec 2020 16:59:56 -0500 Subject: [PATCH 0007/1379] qib_fs: switch to simple_recursive_removal() Signed-off-by: Al Viro --- drivers/infiniband/hw/qib/qib_fs.c | 68 +++--------------------------- 1 file changed, 5 insertions(+), 63 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index e336d778e076..a0c5f3bdc324 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -427,79 +427,21 @@ bail: return ret; } -static int remove_file(struct dentry *parent, char *name) -{ - struct dentry *tmp; - int ret; - - tmp = lookup_one_len(name, parent, strlen(name)); - - if (IS_ERR(tmp)) { - ret = PTR_ERR(tmp); - goto bail; - } - - spin_lock(&tmp->d_lock); - if (simple_positive(tmp)) { - __d_drop(tmp); - spin_unlock(&tmp->d_lock); - simple_unlink(d_inode(parent), tmp); - } else { - spin_unlock(&tmp->d_lock); - } - dput(tmp); - - ret = 0; -bail: - /* - * We don't expect clients to care about the return value, but - * it's there if they need it. - */ - return ret; -} - static int remove_device_files(struct super_block *sb, struct qib_devdata *dd) { - struct dentry *dir, *root; + struct dentry *dir; char unit[10]; - int ret, i; - root = dget(sb->s_root); - inode_lock(d_inode(root)); snprintf(unit, sizeof(unit), "%u", dd->unit); - dir = lookup_one_len(unit, root, strlen(unit)); + dir = lookup_one_len_unlocked(unit, sb->s_root, strlen(unit)); if (IS_ERR(dir)) { - ret = PTR_ERR(dir); pr_err("Lookup of %s failed\n", unit); - goto bail; + return PTR_ERR(dir); } - - inode_lock(d_inode(dir)); - remove_file(dir, "counters"); - remove_file(dir, "counter_names"); - remove_file(dir, "portcounter_names"); - for (i = 0; i < dd->num_pports; i++) { - char fname[24]; - - sprintf(fname, "port%dcounters", i + 1); - remove_file(dir, fname); - if (dd->flags & QIB_HAS_QSFP) { - sprintf(fname, "qsfp%d", i + 1); - remove_file(dir, fname); - } - } - remove_file(dir, "flash"); - inode_unlock(d_inode(dir)); - ret = simple_rmdir(d_inode(root), dir); - d_drop(dir); - dput(dir); - -bail: - inode_unlock(d_inode(root)); - dput(root); - return ret; + simple_recursive_removal(dir, NULL); + return 0; } /* From 145e1da374bcba14c9ca069646f68b76c422612a Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Wed, 3 Mar 2021 13:47:02 -0800 Subject: [PATCH 0008/1379] remoteproc: sysfs: Use sysfs_emit instead of sprintf For security reasons sysfs_emit() is preferred over sprintf(). Hence, convert the remoteproc's sysfs show functions accordingly. Signed-off-by: Raghavendra Rao Ananta Signed-off-by: Siddharth Gupta Link: https://lore.kernel.org/r/1614808022-26062-1-git-send-email-sidgup@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index 1dbef895e65e..6840dad931d5 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -15,7 +15,7 @@ static ssize_t recovery_show(struct device *dev, { struct rproc *rproc = to_rproc(dev); - return sprintf(buf, "%s", rproc->recovery_disabled ? "disabled\n" : "enabled\n"); + return sysfs_emit(buf, "%s", rproc->recovery_disabled ? "disabled\n" : "enabled\n"); } /* @@ -82,7 +82,7 @@ static ssize_t coredump_show(struct device *dev, { struct rproc *rproc = to_rproc(dev); - return sprintf(buf, "%s\n", rproc_coredump_str[rproc->dump_conf]); + return sysfs_emit(buf, "%s\n", rproc_coredump_str[rproc->dump_conf]); } /* From c4e792d1acce31c2eb7b9193ab06ab94de05bf42 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 5 Feb 2021 19:23:00 +0100 Subject: [PATCH 0009/1379] ARM: 9056/1: decompressor: fix BSS size calculation for LLVM ld.lld The LLVM ld.lld linker uses a different symbol type for __bss_start, resulting in the calculation of KBSS_SZ to be thrown off. Up until now, this has gone unnoticed as it only affects the appended DTB case, but pending changes for ARM in the way the decompressed kernel is cleaned from the caches has uncovered this problem. On a ld.lld build: $ nm vmlinux |grep bss_ c1c22034 D __bss_start c1c86e98 B __bss_stop resulting in $ readelf -s arch/arm/boot/compressed/vmlinux | grep bss_size 433: c1c86e98 0 NOTYPE GLOBAL DEFAULT ABS _kernel_bss_size which is obviously incorrect, and may cause the cache clean to access unmapped memory, or cause the size calculation to wrap, resulting in no cache clean to be performed at all. Fix this by updating the sed regex to take D type symbols into account. Link: https://lore.kernel.org/linux-arm-kernel/6c65bcef-d4e7-25fa-43cf-2c435bb61bb9@collabora.com/ Link: https://lore.kernel.org/linux-arm-kernel/20210205085220.31232-1-ardb@kernel.org/ Cc: # v4.19+ Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Reported-by: Guillaume Tucker Reported-by: "kernelci.org bot" Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King --- arch/arm/boot/compressed/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index fd94e27ba4fa..c1f804768621 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -118,8 +118,8 @@ asflags-y := -DZIMAGE # Supply kernel BSS size to the decompressor via a linker symbol. KBSS_SZ = $(shell echo $$(($$($(NM) $(obj)/../../../../vmlinux | \ - sed -n -e 's/^\([^ ]*\) [AB] __bss_start$$/-0x\1/p' \ - -e 's/^\([^ ]*\) [AB] __bss_stop$$/+0x\1/p') )) ) + sed -n -e 's/^\([^ ]*\) [ABD] __bss_start$$/-0x\1/p' \ + -e 's/^\([^ ]*\) [ABD] __bss_stop$$/+0x\1/p') )) ) LDFLAGS_vmlinux = --defsym _kernel_bss_size=$(KBSS_SZ) # Supply ZRELADDR to the decompressor via a linker symbol. ifneq ($(CONFIG_AUTO_ZRELADDR),y) From c0e50736e826b51ddc437e6cf0dc68f07e4ad16b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 11 Feb 2021 09:19:46 +0100 Subject: [PATCH 0010/1379] ARM: 9057/1: cache-v7: add missing ISB after cache level selection A write to CSSELR needs to complete before its results can be observed via CCSIDR. So add a ISB to ensure that this is the case. Acked-by: Nicolas Pitre Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King --- arch/arm/mm/cache-v7.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index dc8f152f3556..307f381eee71 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -38,9 +38,10 @@ icache_size: * procedures. */ ENTRY(v7_invalidate_l1) - mov r0, #0 - mcr p15, 2, r0, c0, c0, 0 - mrc p15, 1, r0, c0, c0, 0 + mov r0, #0 + mcr p15, 2, r0, c0, c0, 0 @ select L1 data cache in CSSELR + isb + mrc p15, 1, r0, c0, c0, 0 @ read cache geometry from CCSIDR movw r1, #0x7fff and r2, r1, r0, lsr #13 From f9e7a99fb6b86aa6a00e53b34ee6973840e005aa Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 11 Feb 2021 09:23:09 +0100 Subject: [PATCH 0011/1379] ARM: 9058/1: cache-v7: refactor v7_invalidate_l1 to avoid clobbering r5/r6 The cache invalidation code in v7_invalidate_l1 can be tweaked to re-read the associativity from CCSIDR, and keep the way identifier component in a single register that is assigned in the outer loop. This way, we need 2 registers less. Given that the number of sets is typically much larger than the associativity, rearrange the code so that the outer loop has the fewer number of iterations, ensuring that the re-read of CCSIDR only occurs a handful of times in practice. Fix the whitespace while at it, and update the comment to indicate that this code is no longer a clone of anything else. Acked-by: Nicolas Pitre Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King --- arch/arm/mm/cache-v7.S | 51 +++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 307f381eee71..76201ee9ee59 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -33,9 +33,8 @@ icache_size: * processor. We fix this by performing an invalidate, rather than a * clean + invalidate, before jumping into the kernel. * - * This function is cloned from arch/arm/mach-tegra/headsmp.S, and needs - * to be called for both secondary cores startup and primary core resume - * procedures. + * This function needs to be called for both secondary cores startup and + * primary core resume procedures. */ ENTRY(v7_invalidate_l1) mov r0, #0 @@ -43,32 +42,32 @@ ENTRY(v7_invalidate_l1) isb mrc p15, 1, r0, c0, c0, 0 @ read cache geometry from CCSIDR - movw r1, #0x7fff - and r2, r1, r0, lsr #13 + movw r3, #0x3ff + and r3, r3, r0, lsr #3 @ 'Associativity' in CCSIDR[12:3] + clz r1, r3 @ WayShift + mov r2, #1 + mov r3, r3, lsl r1 @ NumWays-1 shifted into bits [31:...] + movs r1, r2, lsl r1 @ #1 shifted left by same amount + moveq r1, #1 @ r1 needs value > 0 even if only 1 way - movw r1, #0x3ff + and r2, r0, #0x7 + add r2, r2, #4 @ SetShift - and r3, r1, r0, lsr #3 @ NumWays - 1 - add r2, r2, #1 @ NumSets +1: movw r4, #0x7fff + and r0, r4, r0, lsr #13 @ 'NumSets' in CCSIDR[27:13] - and r0, r0, #0x7 - add r0, r0, #4 @ SetShift - - clz r1, r3 @ WayShift - add r4, r3, #1 @ NumWays -1: sub r2, r2, #1 @ NumSets-- - mov r3, r4 @ Temp = NumWays -2: subs r3, r3, #1 @ Temp-- - mov r5, r3, lsl r1 - mov r6, r2, lsl r0 - orr r5, r5, r6 @ Reg = (Temp< Date: Thu, 11 Feb 2021 09:25:34 +0100 Subject: [PATCH 0012/1379] ARM: 9059/1: cache-v7: get rid of mini-stack Now that we have reduced the number of registers that we need to preserve when calling v7_invalidate_l1 from the boot code, we can use scratch registers to preserve the remaining ones, and get rid of the mini stack entirely. This works around any issues regarding cache behavior in relation to the uncached accesses to this memory, which is hard to get right in the general case (i.e., both bare metal and under virtualization) While at it, switch v7_invalidate_l1 to using ip as a scratch register instead of r4. This makes the function AAPCS compliant, and removes the need to stash r4 in ip across the call. Acked-by: Nicolas Pitre Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King --- arch/arm/include/asm/memory.h | 15 -------------- arch/arm/mm/cache-v7.S | 10 ++++----- arch/arm/mm/proc-v7.S | 39 ++++++++++++++++------------------- 3 files changed, 23 insertions(+), 41 deletions(-) diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 2f841cb65c30..a711322d9f40 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -150,21 +150,6 @@ extern unsigned long vectors_base; */ #define PLAT_PHYS_OFFSET UL(CONFIG_PHYS_OFFSET) -#ifdef CONFIG_XIP_KERNEL -/* - * When referencing data in RAM from the XIP region in a relative manner - * with the MMU off, we need the relative offset between the two physical - * addresses. The macro below achieves this, which is: - * __pa(v_data) - __xip_pa(v_text) - */ -#define PHYS_RELATIVE(v_data, v_text) \ - (((v_data) - PAGE_OFFSET + PLAT_PHYS_OFFSET) - \ - ((v_text) - XIP_VIRT_ADDR(CONFIG_XIP_PHYS_ADDR) + \ - CONFIG_XIP_PHYS_ADDR)) -#else -#define PHYS_RELATIVE(v_data, v_text) ((v_data) - (v_text)) -#endif - #ifndef __ASSEMBLY__ /* diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 76201ee9ee59..830bbfb26ca5 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -53,12 +53,12 @@ ENTRY(v7_invalidate_l1) and r2, r0, #0x7 add r2, r2, #4 @ SetShift -1: movw r4, #0x7fff - and r0, r4, r0, lsr #13 @ 'NumSets' in CCSIDR[27:13] +1: movw ip, #0x7fff + and r0, ip, r0, lsr #13 @ 'NumSets' in CCSIDR[27:13] -2: mov r4, r0, lsl r2 @ NumSet << SetShift - orr r4, r4, r3 @ Reg = (Temp< Date: Thu, 11 Feb 2021 10:35:30 +0100 Subject: [PATCH 0013/1379] ARM: 9060/1: kexec: Remove unused kexec_reinit callback The last (only?) user of this was removed in commit ba364fc752da ("ARM: Kirkwood: Remove mach-kirkwood"), back in v3.17. Link: https://lore.kernel.org/r/20210210235243.398810-1-joel@jms.id.au Signed-off-by: Joel Stanley Reviewed-by: Arnd Bergmann Signed-off-by: Russell King --- arch/arm/include/asm/kexec.h | 3 --- arch/arm/kernel/machine_kexec.c | 8 -------- 2 files changed, 11 deletions(-) diff --git a/arch/arm/include/asm/kexec.h b/arch/arm/include/asm/kexec.h index 22751b5b5735..e62832dcba76 100644 --- a/arch/arm/include/asm/kexec.h +++ b/arch/arm/include/asm/kexec.h @@ -56,9 +56,6 @@ static inline void crash_setup_regs(struct pt_regs *newregs, } } -/* Function pointer to optional machine-specific reinitialization */ -extern void (*kexec_reinit)(void); - static inline unsigned long phys_to_boot_phys(phys_addr_t phys) { return phys_to_idmap(phys); diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c index 2b09dad7935e..f567032a09c0 100644 --- a/arch/arm/kernel/machine_kexec.c +++ b/arch/arm/kernel/machine_kexec.c @@ -147,11 +147,6 @@ void machine_crash_shutdown(struct pt_regs *regs) pr_info("Loading crashdump kernel...\n"); } -/* - * Function pointer to optional machine-specific reinitialization - */ -void (*kexec_reinit)(void); - void machine_kexec(struct kimage *image) { unsigned long page_list, reboot_entry_phys; @@ -187,9 +182,6 @@ void machine_kexec(struct kimage *image) pr_info("Bye!\n"); - if (kexec_reinit) - kexec_reinit(); - soft_restart(reboot_entry_phys); } From 34731ed13e8a8ca95fa0dca466537396b5f2d1af Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Mar 2021 16:30:46 +0100 Subject: [PATCH 0014/1379] leds: lgm: fix gpiolib dependency Without gpiolib, the driver fails to build: drivers/leds/blink/leds-lgm-sso.c:123:19: error: field has incomplete type 'struct gpio_chip' struct gpio_chip chip; ^ include/linux/gpio.h:107:8: note: forward declaration of 'struct gpio_chip' struct gpio_chip; ^ drivers/leds/blink/leds-lgm-sso.c:263:3: error: implicit declaration of function 'gpiod_set_value' [-Werror,-Wimplicit-function-declaration] gpiod_set_value(led->gpiod, val); ^ drivers/leds/blink/leds-lgm-sso.c:263:3: note: did you mean 'gpio_set_value'? include/linux/gpio.h:168:20: note: 'gpio_set_value' declared here static inline void gpio_set_value(unsigned gpio, int value) ^ drivers/leds/blink/leds-lgm-sso.c:345:3: error: implicit declaration of function 'gpiod_set_value' [-Werror,-Wimplicit-function-declaration] gpiod_set_value(led->gpiod, 1); ^ Add the dependency in Kconfig. Fixes: c3987cd2bca3 ("leds: lgm: Add LED controller driver for LGM SoC") Signed-off-by: Arnd Bergmann Signed-off-by: Pavel Machek --- drivers/leds/blink/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/leds/blink/Kconfig b/drivers/leds/blink/Kconfig index 265b53476a80..6dedc58c47b3 100644 --- a/drivers/leds/blink/Kconfig +++ b/drivers/leds/blink/Kconfig @@ -9,6 +9,7 @@ if LEDS_BLINK config LEDS_BLINK_LGM tristate "LED support for Intel LGM SoC series" + depends on GPIOLIB depends on LEDS_CLASS depends on MFD_SYSCON depends on OF From 436cb709f8a9fd1a52e00e830e715c63191c2e63 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 24 Feb 2021 15:13:49 +0000 Subject: [PATCH 0015/1379] i3c: master: svc: remove redundant assignment to cmd->read_len The assignment of xfer_len to cmd->read_len appears to be redundant as the next statement re-assigns the value 0 to it. Clean up the code by removing the redundant first assignment. Addresses-Coverity: ("Unused value") Fixes: dd3c52846d59 ("i3c: master: svc: Add Silvaco I3C master driver") Signed-off-by: Colin Ian King Reviewed-by: Miquel Raynal Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210224151349.202332-1-colin.king@canonical.com --- drivers/i3c/master/svc-i3c-master.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index 8d990696676e..1f6ba4221817 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -1124,7 +1124,6 @@ static int svc_i3c_master_send_direct_ccc_cmd(struct svc_i3c_master *master, cmd->in = NULL; cmd->out = &ccc->id; cmd->len = 1; - cmd->read_len = xfer_len; cmd->read_len = 0; cmd->continued = true; From f6e5aedf470bd4522732595a85688052e3cbc03f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 25 Feb 2021 14:54:17 +0800 Subject: [PATCH 0016/1379] riscv: Add support for memtest The riscv [rv32_]defconfig enabled CONFIG_MEMTEST, but memtest feature is not supported in RISCV. Add early_memtest() to support for memtest. Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- Documentation/admin-guide/kernel-parameters.txt | 2 +- arch/riscv/mm/init.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 04545725f187..20447b531630 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2794,7 +2794,7 @@ seconds. Use this parameter to check at some other rate. 0 disables periodic checking. - memtest= [KNL,X86,ARM,PPC] Enable memtest + memtest= [KNL,X86,ARM,PPC,RISCV] Enable memtest Format: default : 0 Specifies the number of memtest passes to be diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 067583ab1bd7..7f5036fbee8c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -128,8 +128,9 @@ void __init setup_bootmem(void) if (max_mapped_addr == (dram_end - 1)) memblock_set_current_limit(max_mapped_addr - 4096); - max_pfn = PFN_DOWN(dram_end); - max_low_pfn = max_pfn; + min_low_pfn = PFN_UP(memblock_start_of_DRAM()); + max_low_pfn = max_pfn = PFN_DOWN(dram_end); + dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); @@ -593,6 +594,7 @@ void __init paging_init(void) void __init misc_mem_init(void) { + early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); arch_numa_init(); sparse_init(); zone_sizes_init(); From 9530141455c968938a913d602a236c2a7b0322e1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 25 Feb 2021 15:03:03 +0800 Subject: [PATCH 0017/1379] riscv: Add ARCH_HAS_FORTIFY_SOURCE FORTIFY_SOURCE could detect various overflows at compile and run time. ARCH_HAS_FORTIFY_SOURCE means that the architecture can be built and run with CONFIG_FORTIFY_SOURCE. Select it in RISCV. See more about this feature from commit 6974f0c4555e ("include/linux/string.h: add the option of fortified string.h functions"). Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/string.h | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 85d626b8ce5e..6978b0739718 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -20,6 +20,7 @@ config RISCV select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEBUG_WX + select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h index 5477e7ecb6e1..909049366555 100644 --- a/arch/riscv/include/asm/string.h +++ b/arch/riscv/include/asm/string.h @@ -23,5 +23,10 @@ extern asmlinkage void *__memmove(void *, const void *, size_t); #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memset(s, c, n) __memset(s, c, n) #define memmove(dst, src, len) __memmove(dst, src, len) + +#ifndef __NO_FORTIFY +#define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ +#endif + #endif #endif /* _ASM_RISCV_STRING_H */ From ea16ef967ec88bd67466d564d461c3fdf7f85bd9 Mon Sep 17 00:00:00 2001 From: Jingle Wu Date: Sun, 7 Mar 2021 18:23:25 -0800 Subject: [PATCH 0018/1379] Input: elan_i2c - reduce the resume time for new devices Newer controllers, such as Voxel, Delbin, Magple, Bobba and others, do not need to be reset after issuing power-on command, and skipping reset saves at least 100ms from resume time. Note that if first attempt of re-initializing device fails we will not be skipping reset on the subsequent ones. Signed-off-by: Jingle Wu Link: https://lore.kernel.org/r/20210226073537.4926-1-jingle.wu@emc.com.tw Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c.h | 5 +++ drivers/input/mouse/elan_i2c_core.c | 58 ++++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/drivers/input/mouse/elan_i2c.h b/drivers/input/mouse/elan_i2c.h index e12da5b024b0..838b3b346316 100644 --- a/drivers/input/mouse/elan_i2c.h +++ b/drivers/input/mouse/elan_i2c.h @@ -55,6 +55,11 @@ #define ETP_FW_PAGE_SIZE_512 512 #define ETP_FW_SIGNATURE_SIZE 6 +#define ETP_PRODUCT_ID_DELBIN 0x00C2 +#define ETP_PRODUCT_ID_VOXEL 0x00BF +#define ETP_PRODUCT_ID_MAGPIE 0x0120 +#define ETP_PRODUCT_ID_BOBBA 0x0121 + struct i2c_client; struct completion; diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index bef73822315d..dad22c1ea6a0 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -46,6 +46,9 @@ #define ETP_FINGER_WIDTH 15 #define ETP_RETRY_COUNT 3 +/* quirks to control the device */ +#define ETP_QUIRK_QUICK_WAKEUP BIT(0) + /* The main device structure */ struct elan_tp_data { struct i2c_client *client; @@ -90,8 +93,38 @@ struct elan_tp_data { bool baseline_ready; u8 clickpad; bool middle_button; + + u32 quirks; /* Various quirks */ }; +static u32 elan_i2c_lookup_quirks(u16 ic_type, u16 product_id) +{ + static const struct { + u16 ic_type; + u16 product_id; + u32 quirks; + } elan_i2c_quirks[] = { + { 0x0D, ETP_PRODUCT_ID_DELBIN, ETP_QUIRK_QUICK_WAKEUP }, + { 0x10, ETP_PRODUCT_ID_VOXEL, ETP_QUIRK_QUICK_WAKEUP }, + { 0x14, ETP_PRODUCT_ID_MAGPIE, ETP_QUIRK_QUICK_WAKEUP }, + { 0x14, ETP_PRODUCT_ID_BOBBA, ETP_QUIRK_QUICK_WAKEUP }, + }; + u32 quirks = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(elan_i2c_quirks); i++) { + if (elan_i2c_quirks[i].ic_type == ic_type && + elan_i2c_quirks[i].product_id == product_id) { + quirks = elan_i2c_quirks[i].quirks; + } + } + + if (ic_type >= 0x0D && product_id >= 0x123) + quirks |= ETP_QUIRK_QUICK_WAKEUP; + + return quirks; +} + static int elan_get_fwinfo(u16 ic_type, u8 iap_version, u16 *validpage_count, u32 *signature_address, u16 *page_size) { @@ -258,16 +291,18 @@ static int elan_check_ASUS_special_fw(struct elan_tp_data *data) return false; } -static int __elan_initialize(struct elan_tp_data *data) +static int __elan_initialize(struct elan_tp_data *data, bool skip_reset) { struct i2c_client *client = data->client; bool woken_up = false; int error; - error = data->ops->initialize(client); - if (error) { - dev_err(&client->dev, "device initialize failed: %d\n", error); - return error; + if (!skip_reset) { + error = data->ops->initialize(client); + if (error) { + dev_err(&client->dev, "device initialize failed: %d\n", error); + return error; + } } error = elan_query_product(data); @@ -311,16 +346,17 @@ static int __elan_initialize(struct elan_tp_data *data) return 0; } -static int elan_initialize(struct elan_tp_data *data) +static int elan_initialize(struct elan_tp_data *data, bool skip_reset) { int repeat = ETP_RETRY_COUNT; int error; do { - error = __elan_initialize(data); + error = __elan_initialize(data, skip_reset); if (!error) return 0; + skip_reset = false; msleep(30); } while (--repeat > 0); @@ -357,6 +393,8 @@ static int elan_query_device_info(struct elan_tp_data *data) if (error) return error; + data->quirks = elan_i2c_lookup_quirks(data->ic_type, data->product_id); + error = elan_get_fwinfo(data->ic_type, data->iap_version, &data->fw_validpage_count, &data->fw_signature_address, @@ -546,7 +584,7 @@ static int elan_update_firmware(struct elan_tp_data *data, data->ops->iap_reset(client); } else { /* Reinitialize TP after fw is updated */ - elan_initialize(data); + elan_initialize(data, false); elan_query_device_info(data); } @@ -1247,7 +1285,7 @@ static int elan_probe(struct i2c_client *client, } /* Initialize the touchpad. */ - error = elan_initialize(data); + error = elan_initialize(data, false); if (error) return error; @@ -1384,7 +1422,7 @@ static int __maybe_unused elan_resume(struct device *dev) goto err; } - error = elan_initialize(data); + error = elan_initialize(data, data->quirks & ETP_QUIRK_QUICK_WAKEUP); if (error) dev_err(dev, "initialize when resuming failed: %d\n", error); From e042e95bcab34b2265b0aaeb497030ea13c6c251 Mon Sep 17 00:00:00 2001 From: Tang Bin Date: Mon, 22 Feb 2021 14:11:05 +0800 Subject: [PATCH 0019/1379] thermal: amlogic: Omit superfluous error message in amlogic_thermal_probe() The function devm_platform_ioremap_resource has already contains error message, so remove the redundant dev_err here. Signed-off-by: Tang Bin Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210222061105.6008-1-tangbin@cmss.chinamobile.com --- drivers/thermal/amlogic_thermal.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/thermal/amlogic_thermal.c b/drivers/thermal/amlogic_thermal.c index dffe3ba8c7c4..e61b91d14ad1 100644 --- a/drivers/thermal/amlogic_thermal.c +++ b/drivers/thermal/amlogic_thermal.c @@ -254,10 +254,8 @@ static int amlogic_thermal_probe(struct platform_device *pdev) platform_set_drvdata(pdev, pdata); base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(base)) { - dev_err(dev, "failed to get io address\n"); + if (IS_ERR(base)) return PTR_ERR(base); - } pdata->regmap = devm_regmap_init_mmio(dev, base, pdata->data->regmap_config); From 6606800192008bd2929c55614697645f53e07427 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 5 Mar 2021 07:23:20 +0530 Subject: [PATCH 0020/1379] thermal: Fix a typo in the file soctherm.c s/calibaration/calibration/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210305015320.7614-1-unixbhaskar@gmail.com --- drivers/thermal/tegra/soctherm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c index 66e0639da4bf..8b8fbd49679b 100644 --- a/drivers/thermal/tegra/soctherm.c +++ b/drivers/thermal/tegra/soctherm.c @@ -2195,7 +2195,7 @@ static int tegra_soctherm_probe(struct platform_device *pdev) if (err) return err; - /* calculate tsensor calibaration data */ + /* calculate tsensor calibration data */ for (i = 0; i < soc->num_tsensors; ++i) { err = tegra_calc_tsensor_calib(&soc->tsensors[i], &shared_calib, From 76d6329534ae3b2f344aa72cc978ef4cfd69c0c8 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 5 Mar 2021 07:13:48 +0530 Subject: [PATCH 0021/1379] thermal: Fix couple of spellos in the file sun8i_thermal.c s/calibartion/calibration/ s/undocummented/undocumented/ Signed-off-by: Bhaskar Chowdhury Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210305014348.17412-1-unixbhaskar@gmail.com --- drivers/thermal/sun8i_thermal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c index 8c80bd06dd9f..d9cd23cbb671 100644 --- a/drivers/thermal/sun8i_thermal.c +++ b/drivers/thermal/sun8i_thermal.c @@ -300,7 +300,7 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev) * or 0x8xx, so they won't be away from the default value * for a lot. * - * So here we do not return error if the calibartion data is + * So here we do not return error if the calibration data is * not available, except the probe needs deferring. */ goto out; @@ -418,7 +418,7 @@ static int sun8i_h3_thermal_init(struct ths_device *tmdev) } /* - * Without this undocummented value, the returned temperatures would + * Without this undocumented value, the returned temperatures would * be higher than real ones by about 20C. */ #define SUN50I_H6_CTRL0_UNK 0x0000002f From 7fd49ca05be35a85c424a3ca8df931bd70c34535 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Tue, 9 Mar 2021 17:24:19 +0100 Subject: [PATCH 0022/1379] thermal: rcar_gen3_thermal: Add support for up to five TSC nodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for up to five TSC nodes. The new THCODE values are taken from the example in the datasheet. Signed-off-by: Niklas Söderlund Acked-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210309162419.2621359-1-niklas.soderlund+renesas@ragnatech.se --- drivers/thermal/rcar_gen3_thermal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/rcar_gen3_thermal.c b/drivers/thermal/rcar_gen3_thermal.c index 75c69fe6e955..e1e412348076 100644 --- a/drivers/thermal/rcar_gen3_thermal.c +++ b/drivers/thermal/rcar_gen3_thermal.c @@ -60,7 +60,7 @@ #define MCELSIUS(temp) ((temp) * 1000) #define GEN3_FUSE_MASK 0xFFF -#define TSC_MAX_NUM 4 +#define TSC_MAX_NUM 5 /* default THCODE values if FUSEs are missing */ static const int thcodes[TSC_MAX_NUM][3] = { @@ -68,6 +68,7 @@ static const int thcodes[TSC_MAX_NUM][3] = { { 3393, 2795, 2216 }, { 3389, 2805, 2237 }, { 3415, 2694, 2195 }, + { 3356, 2724, 2244 }, }; /* Structure for thermal temperature calculation */ From 7440e912b0fe755d80b958a65859ebabb5338cf8 Mon Sep 17 00:00:00 2001 From: Zhang Yunkai Date: Sat, 6 Mar 2021 04:34:15 -0800 Subject: [PATCH 0023/1379] thermal:ti-soc-thermal: Remove duplicate include in ti-bandgap 'of_device.h' included in 'ti-bandgap.c' is duplicated. It is also included in the 25th line. Signed-off-by: Zhang Yunkai Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210306123415.219441-1-zhang.yunkai@zte.com.cn --- drivers/thermal/ti-soc-thermal/ti-bandgap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/thermal/ti-soc-thermal/ti-bandgap.c b/drivers/thermal/ti-soc-thermal/ti-bandgap.c index 8a3646e26ddd..d81af89166d2 100644 --- a/drivers/thermal/ti-soc-thermal/ti-bandgap.c +++ b/drivers/thermal/ti-soc-thermal/ti-bandgap.c @@ -32,7 +32,6 @@ #include #include #include -#include #include "ti-bandgap.h" From 45c7eaeb29d67224db4ba935deb575586a1fda09 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 10 Mar 2021 04:24:23 -0800 Subject: [PATCH 0024/1379] thermal: thermal_of: Fix error return code of thermal_of_populate_bind_params() When kcalloc() returns NULL to __tcbp or of_count_phandle_with_args() returns zero or -ENOENT to count, no error return code of thermal_of_populate_bind_params() is assigned. To fix these bugs, ret is assigned with -ENOMEM and -ENOENT in these cases, respectively. Fixes: a92bab8919e3 ("of: thermal: Allow multiple devices to share cooling map") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210310122423.3266-1-baijiaju1990@gmail.com --- drivers/thermal/thermal_of.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index 69ef12f852b7..5b76f9a1280d 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -704,14 +704,17 @@ static int thermal_of_populate_bind_params(struct device_node *np, count = of_count_phandle_with_args(np, "cooling-device", "#cooling-cells"); - if (!count) { + if (count <= 0) { pr_err("Add a cooling_device property with at least one device\n"); + ret = -ENOENT; goto end; } __tcbp = kcalloc(count, sizeof(*__tcbp), GFP_KERNEL); - if (!__tcbp) + if (!__tcbp) { + ret = -ENOMEM; goto end; + } for (i = 0; i < count; i++) { ret = of_parse_phandle_with_args(np, "cooling-device", From 9468e7b031876935230182628f8d5f216c071784 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Wed, 10 Mar 2021 12:07:16 +0100 Subject: [PATCH 0025/1379] dt-bindings: thermal: rcar-gen3-thermal: Support five TSC nodes on r8a779a0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When adding support for V3U (r8a779a0) it was incorrectly recorded it supports four nodes, while in fact it supports five. The fifth node is named TSC0 and breaks the existing naming schema starting at 1. Work around this by separately defining the reg property for V3U and others. Restore the maximum number of nodes to three for other compatibles as it was before erroneously increasing it for V3U. Fixes: d7fdfb6541f3be88 ("dt-bindings: thermal: rcar-gen3-thermal: Add r8a779a0 support") Signed-off-by: Niklas Söderlund Reviewed-by: Geert Uytterhoeven Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210310110716.3297544-1-niklas.soderlund+renesas@ragnatech.se --- .../bindings/thermal/rcar-gen3-thermal.yaml | 43 +++++++++++++++---- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml index b33a76eeac4e..f963204e0b16 100644 --- a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml @@ -28,14 +28,7 @@ properties: - renesas,r8a77980-thermal # R-Car V3H - renesas,r8a779a0-thermal # R-Car V3U - reg: - minItems: 2 - maxItems: 4 - items: - - description: TSC1 registers - - description: TSC2 registers - - description: TSC3 registers - - description: TSC4 registers + reg: true interrupts: items: @@ -71,8 +64,25 @@ if: enum: - renesas,r8a779a0-thermal then: + properties: + reg: + minItems: 2 + maxItems: 3 + items: + - description: TSC1 registers + - description: TSC2 registers + - description: TSC3 registers required: - interrupts +else: + properties: + reg: + items: + - description: TSC0 registers + - description: TSC1 registers + - description: TSC2 registers + - description: TSC3 registers + - description: TSC4 registers additionalProperties: false @@ -111,3 +121,20 @@ examples: }; }; }; + - | + #include + #include + #include + + tsc_r8a779a0: thermal@e6190000 { + compatible = "renesas,r8a779a0-thermal"; + reg = <0xe6190000 0x200>, + <0xe6198000 0x200>, + <0xe61a0000 0x200>, + <0xe61a8000 0x200>, + <0xe61b0000 0x200>; + clocks = <&cpg CPG_MOD 919>; + power-domains = <&sysc R8A779A0_PD_ALWAYS_ON>; + resets = <&cpg 919>; + #thermal-sensor-cells = <1>; + }; From d9b7eae8e3424c3480fe9f40ebafbb0c96426e4c Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 22 Feb 2021 09:17:17 +0800 Subject: [PATCH 0026/1379] PCI/RCEC: Fix RCiEP device to RCEC association rcec_assoc_rciep() used "rciep->devfn" (a single byte encoding both the device and function number) as the device number to check whether the corresponding bit was set in the RCEC's Association Bitmap for RCiEPs. But per PCIe r5.0, sec 7.9.10.2, "Association Bitmap for RCiEPs", the 32-bit bitmap contains one bit per device. That bit applies to all functions of the device. Fix rcec_assoc_rciep() to convert the value of "rciep->devfn" to a device number to ensure that RCiEP devices are correctly associated with the RCEC. Reported-and-tested-by: Wen Jin Fixes: 507b460f8144 ("PCI/ERR: Add pcie_link_rcec() to associate RCiEPs") Link: https://lore.kernel.org/r/20210222011717.43266-1-qiuxu.zhuo@intel.com Signed-off-by: Qiuxu Zhuo Signed-off-by: Bjorn Helgaas Reviewed-by: Sean V Kelley --- drivers/pci/pcie/rcec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pcie/rcec.c b/drivers/pci/pcie/rcec.c index 2c5c552994e4..d0bcd141ac9c 100644 --- a/drivers/pci/pcie/rcec.c +++ b/drivers/pci/pcie/rcec.c @@ -32,7 +32,7 @@ static bool rcec_assoc_rciep(struct pci_dev *rcec, struct pci_dev *rciep) /* Same bus, so check bitmap */ for_each_set_bit(devn, &bitmap, 32) - if (devn == rciep->devfn) + if (devn == PCI_SLOT(rciep->devfn)) return true; return false; From 55cc33fab5ac9f7e2a97aa7c564e8b35355886d5 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 11 Mar 2021 09:48:09 +0100 Subject: [PATCH 0027/1379] rtc: m48t59: use platform_get_irq_optional The IRQ is optional, avoid the error message by using platform_get_irq_optional. Signed-off-by: Alexandre Belloni --- drivers/rtc/rtc-m48t59.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 1d2e99a70fce..f0f6b9b6daec 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -421,7 +421,7 @@ static int m48t59_rtc_probe(struct platform_device *pdev) /* Try to get irq number. We also can work in * the mode without IRQ. */ - m48t59->irq = platform_get_irq(pdev, 0); + m48t59->irq = platform_get_irq_optional(pdev, 0); if (m48t59->irq <= 0) m48t59->irq = NO_IRQ; From 312e3f8aefb5dc9c2f052ba0ee35a2fd6baa5bcd Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 11 Mar 2021 09:30:54 +0000 Subject: [PATCH 0028/1379] thermal: Fix spelling mistake "disabed" -> "disabled" There is a spelling mistake in a comment, fix it. Signed-off-by: Colin Ian King Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210311093054.5338-1-colin.king@canonical.com --- include/uapi/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/thermal.h b/include/uapi/linux/thermal.h index c105054cbb57..9aa2fedfa309 100644 --- a/include/uapi/linux/thermal.h +++ b/include/uapi/linux/thermal.h @@ -60,7 +60,7 @@ enum thermal_genl_event { THERMAL_GENL_EVENT_UNSPEC, THERMAL_GENL_EVENT_TZ_CREATE, /* Thermal zone creation */ THERMAL_GENL_EVENT_TZ_DELETE, /* Thermal zone deletion */ - THERMAL_GENL_EVENT_TZ_DISABLE, /* Thermal zone disabed */ + THERMAL_GENL_EVENT_TZ_DISABLE, /* Thermal zone disabled */ THERMAL_GENL_EVENT_TZ_ENABLE, /* Thermal zone enabled */ THERMAL_GENL_EVENT_TZ_TRIP_UP, /* Trip point crossed the way up */ THERMAL_GENL_EVENT_TZ_TRIP_DOWN, /* Trip point crossed the way down */ From 780a980e2b047768130ddb68d39fbde84b049630 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 24 Feb 2021 16:20:29 +0800 Subject: [PATCH 0029/1379] remoteproc: pru: Replace DEFINE_SIMPLE_ATTRIBUTE with DEFINE_DEBUGFS_ATTRIBUTE Fix the following coccicheck warning: ./drivers/remoteproc/pru_rproc.c:247:0-23: WARNING: pru_rproc_debug_ss_fops should be defined with DEFINE_DEBUGFS_ATTRIBUTE Reviewed-by: Mathieu Poirier Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/1614154829-42461-1-git-send-email-yang.lee@linux.alibaba.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/pru_rproc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/pru_rproc.c b/drivers/remoteproc/pru_rproc.c index 2667919d76b3..16b204ed657b 100644 --- a/drivers/remoteproc/pru_rproc.c +++ b/drivers/remoteproc/pru_rproc.c @@ -244,8 +244,8 @@ static int pru_rproc_debug_ss_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(pru_rproc_debug_ss_fops, pru_rproc_debug_ss_get, - pru_rproc_debug_ss_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(pru_rproc_debug_ss_fops, pru_rproc_debug_ss_get, + pru_rproc_debug_ss_set, "%llu\n"); /* * Create PRU-specific debugfs entries From 2bf2346159bc99cf0679e25be20f4daca60f3f5c Mon Sep 17 00:00:00 2001 From: Jindong Yue Date: Wed, 24 Feb 2021 13:58:25 +0800 Subject: [PATCH 0030/1379] remoteproc: core: Remove casting to rproc_handle_resource_t There are four different callback functions that are used for the rproc_handle_resource_t callback that all have different second parameter types. rproc_handle_vdev -> struct fw_rsc_vdev rproc_handle_trace -> struct fw_rsc_trace rproc_handle_devmem -> struct fw_rsc_devmem rproc_handle_carveout -> struct fw_rsc_carveout These callbacks are cast to rproc_handle_resource_t so that there is no error about incompatible pointer types. Unfortunately, this is a Clang's Control-Flow Integrity checking violation, which verifies that the callback function's types match the prototypes exactly before jumping. [ 7.275750] Kernel panic - not syncing: CFI failure (target: rproc_handle_vdev+0x0/0x4) [ 7.283763] CPU: 2 PID: 1 Comm: init Tainted: G C O 5.4.70-03301-g527af2c96672 #17 [ 7.292463] Hardware name: NXP i.MX8MPlus EVK board (DT) [ 7.297779] Call trace: [ 7.300232] dump_backtrace.cfi_jt+0x0/0x4 [ 7.304337] show_stack+0x18/0x24 [ 7.307660] dump_stack+0xb8/0x114 [ 7.311069] panic+0x164/0x3d4 [ 7.314130] __ubsan_handle_cfi_check_fail_abort+0x0/0x14 [ 7.319533] perf_proc_update_handler+0x0/0xcc [ 7.323983] __cfi_check+0x63278/0x6a290 [ 7.327913] rproc_boot+0x3f8/0x738 [ 7.331404] rproc_add+0x68/0x110 [ 7.334738] imx_rproc_probe+0x5e4/0x708 [imx_rproc] [ 7.339711] platform_drv_probe+0xac/0xf0 [ 7.343726] really_probe+0x260/0x65c [ 7.347393] driver_probe_device+0x64/0x100 [ 7.351580] device_driver_attach+0x6c/0xac [ 7.355766] __driver_attach+0xdc/0x184 [ 7.359609] bus_for_each_dev+0x98/0x104 [ 7.363537] driver_attach+0x24/0x30 [ 7.367117] bus_add_driver+0x100/0x1e0 [ 7.370958] driver_register+0x78/0x114 [ 7.374800] __platform_driver_register+0x44/0x50 [ 7.379514] init_module+0x20/0xfe8 [imx_rproc] [ 7.384049] do_one_initcall+0x190/0x348 [ 7.387979] do_init_module+0x5c/0x210 [ 7.391731] load_module+0x2fbc/0x3590 [ 7.395485] __arm64_sys_finit_module+0xb8/0xec [ 7.400025] el0_svc_common+0xb4/0x19c [ 7.403777] el0_svc_handler+0x74/0x98 [ 7.407531] el0_svc+0x8/0xc [ 7.410419] SMP: stopping secondary CPUs [ 7.414648] Kernel Offset: disabled [ 7.418142] CPU features: 0x00010002,2000200c [ 7.422501] Memory Limit: none To fix this, change the second parameter of all functions to void * and use a local variable with the correct type so that everything works properly. With this, we can remove casting to rproc_handle_resource_t for these functions. Signed-off-by: Jindong Yue Reviewed-by: Peng Fan Reviewed-by: Sami Tolvanen Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210224055825.7417-1-jindong.yue@nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 29 +++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index ab150765d124..553e42a4d2a0 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -482,7 +482,7 @@ static int copy_dma_range_map(struct device *to, struct device *from) /** * rproc_handle_vdev() - handle a vdev fw resource * @rproc: the remote processor - * @rsc: the vring resource descriptor + * @ptr: the vring resource descriptor * @offset: offset of the resource entry * @avail: size of available data (for sanity checking the image) * @@ -507,9 +507,10 @@ static int copy_dma_range_map(struct device *to, struct device *from) * * Returns 0 on success, or an appropriate error code otherwise */ -static int rproc_handle_vdev(struct rproc *rproc, struct fw_rsc_vdev *rsc, +static int rproc_handle_vdev(struct rproc *rproc, void *ptr, int offset, int avail) { + struct fw_rsc_vdev *rsc = ptr; struct device *dev = &rproc->dev; struct rproc_vdev *rvdev; int i, ret; @@ -627,7 +628,7 @@ void rproc_vdev_release(struct kref *ref) /** * rproc_handle_trace() - handle a shared trace buffer resource * @rproc: the remote processor - * @rsc: the trace resource descriptor + * @ptr: the trace resource descriptor * @offset: offset of the resource entry * @avail: size of available data (for sanity checking the image) * @@ -641,9 +642,10 @@ void rproc_vdev_release(struct kref *ref) * * Returns 0 on success, or an appropriate error code otherwise */ -static int rproc_handle_trace(struct rproc *rproc, struct fw_rsc_trace *rsc, +static int rproc_handle_trace(struct rproc *rproc, void *ptr, int offset, int avail) { + struct fw_rsc_trace *rsc = ptr; struct rproc_debug_trace *trace; struct device *dev = &rproc->dev; char name[15]; @@ -693,7 +695,7 @@ static int rproc_handle_trace(struct rproc *rproc, struct fw_rsc_trace *rsc, /** * rproc_handle_devmem() - handle devmem resource entry * @rproc: remote processor handle - * @rsc: the devmem resource entry + * @ptr: the devmem resource entry * @offset: offset of the resource entry * @avail: size of available data (for sanity checking the image) * @@ -716,9 +718,10 @@ static int rproc_handle_trace(struct rproc *rproc, struct fw_rsc_trace *rsc, * and not allow firmwares to request access to physical addresses that * are outside those ranges. */ -static int rproc_handle_devmem(struct rproc *rproc, struct fw_rsc_devmem *rsc, +static int rproc_handle_devmem(struct rproc *rproc, void *ptr, int offset, int avail) { + struct fw_rsc_devmem *rsc = ptr; struct rproc_mem_entry *mapping; struct device *dev = &rproc->dev; int ret; @@ -896,7 +899,7 @@ static int rproc_release_carveout(struct rproc *rproc, /** * rproc_handle_carveout() - handle phys contig memory allocation requests * @rproc: rproc handle - * @rsc: the resource entry + * @ptr: the resource entry * @offset: offset of the resource entry * @avail: size of available data (for image validation) * @@ -913,9 +916,9 @@ static int rproc_release_carveout(struct rproc *rproc, * pressure is important; it may have a substantial impact on performance. */ static int rproc_handle_carveout(struct rproc *rproc, - struct fw_rsc_carveout *rsc, - int offset, int avail) + void *ptr, int offset, int avail) { + struct fw_rsc_carveout *rsc = ptr; struct rproc_mem_entry *carveout; struct device *dev = &rproc->dev; @@ -1097,10 +1100,10 @@ EXPORT_SYMBOL(rproc_of_parse_firmware); * enum fw_resource_type. */ static rproc_handle_resource_t rproc_loading_handlers[RSC_LAST] = { - [RSC_CARVEOUT] = (rproc_handle_resource_t)rproc_handle_carveout, - [RSC_DEVMEM] = (rproc_handle_resource_t)rproc_handle_devmem, - [RSC_TRACE] = (rproc_handle_resource_t)rproc_handle_trace, - [RSC_VDEV] = (rproc_handle_resource_t)rproc_handle_vdev, + [RSC_CARVEOUT] = rproc_handle_carveout, + [RSC_DEVMEM] = rproc_handle_devmem, + [RSC_TRACE] = rproc_handle_trace, + [RSC_VDEV] = rproc_handle_vdev, }; /* handle firmware resource entries before booting the remote processor */ From 9e4c31799cbdcf271b81e1ed169cd2c131c4e079 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:16 +0800 Subject: [PATCH 0031/1379] dt-bindings: remoteproc: convert imx rproc bindings to json-schema Convert the imx rproc binding to DT schema format using json-schema. Reviewed-by: Rob Herring Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-2-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- .../bindings/remoteproc/fsl,imx-rproc.yaml | 61 +++++++++++++++++++ .../bindings/remoteproc/imx-rproc.txt | 33 ---------- 2 files changed, 61 insertions(+), 33 deletions(-) create mode 100644 Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml delete mode 100644 Documentation/devicetree/bindings/remoteproc/imx-rproc.txt diff --git a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml new file mode 100644 index 000000000000..54d2456530a6 --- /dev/null +++ b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/remoteproc/fsl,imx-rproc.yaml#" +$schema: "http://devicetree.org/meta-schemas/core.yaml#" + +title: NXP iMX6SX/iMX7D Co-Processor Bindings + +description: + This binding provides support for ARM Cortex M4 Co-processor found on some NXP iMX SoCs. + +maintainers: + - Peng Fan + +properties: + compatible: + enum: + - fsl,imx7d-cm4 + - fsl,imx6sx-cm4 + + clocks: + maxItems: 1 + + syscon: + $ref: /schemas/types.yaml#/definitions/phandle + description: + Phandle to syscon block which provide access to System Reset Controller + + memory-region: + description: + If present, a phandle for a reserved memory area that used for vdev buffer, + resource table, vring region and others used by remote processor. + minItems: 1 + maxItems: 32 + +required: + - compatible + - clocks + - syscon + +additionalProperties: false + +examples: + - | + #include + m4_reserved_sysmem1: cm4@80000000 { + reg = <0x80000000 0x80000>; + }; + + m4_reserved_sysmem2: cm4@81000000 { + reg = <0x81000000 0x80000>; + }; + + imx7d-cm4 { + compatible = "fsl,imx7d-cm4"; + memory-region = <&m4_reserved_sysmem1>, <&m4_reserved_sysmem2>; + syscon = <&src>; + clocks = <&clks IMX7D_ARM_M4_ROOT_CLK>; + }; + +... diff --git a/Documentation/devicetree/bindings/remoteproc/imx-rproc.txt b/Documentation/devicetree/bindings/remoteproc/imx-rproc.txt deleted file mode 100644 index fbcefd965dc4..000000000000 --- a/Documentation/devicetree/bindings/remoteproc/imx-rproc.txt +++ /dev/null @@ -1,33 +0,0 @@ -NXP iMX6SX/iMX7D Co-Processor Bindings ----------------------------------------- - -This binding provides support for ARM Cortex M4 Co-processor found on some -NXP iMX SoCs. - -Required properties: -- compatible Should be one of: - "fsl,imx7d-cm4" - "fsl,imx6sx-cm4" -- clocks Clock for co-processor (See: ../clock/clock-bindings.txt) -- syscon Phandle to syscon block which provide access to - System Reset Controller - -Optional properties: -- memory-region list of phandels to the reserved memory regions. - (See: ../reserved-memory/reserved-memory.txt) - -Example: - m4_reserved_sysmem1: cm4@80000000 { - reg = <0x80000000 0x80000>; - }; - - m4_reserved_sysmem2: cm4@81000000 { - reg = <0x81000000 0x80000>; - }; - - imx7d-cm4 { - compatible = "fsl,imx7d-cm4"; - memory-region = <&m4_reserved_sysmem1>, <&m4_reserved_sysmem2>; - syscon = <&src>; - clocks = <&clks IMX7D_ARM_M4_ROOT_CLK>; - }; From bc403b4dfcbcefe489c94df9e568b8f57c2aaae7 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:17 +0800 Subject: [PATCH 0032/1379] dt-bindings: remoteproc: imx_rproc: add i.MX8MQ/M support Add i.MX8MQ/M support, also include mailbox for rpmsg/virtio usage. Reviewed-by: Rob Herring Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-3-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- .../bindings/remoteproc/fsl,imx-rproc.yaml | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml index 54d2456530a6..208a628f8d6c 100644 --- a/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml +++ b/Documentation/devicetree/bindings/remoteproc/fsl,imx-rproc.yaml @@ -4,7 +4,7 @@ $id: "http://devicetree.org/schemas/remoteproc/fsl,imx-rproc.yaml#" $schema: "http://devicetree.org/meta-schemas/core.yaml#" -title: NXP iMX6SX/iMX7D Co-Processor Bindings +title: NXP i.MX Co-Processor Bindings description: This binding provides support for ARM Cortex M4 Co-processor found on some NXP iMX SoCs. @@ -15,6 +15,8 @@ maintainers: properties: compatible: enum: + - fsl,imx8mq-cm4 + - fsl,imx8mm-cm4 - fsl,imx7d-cm4 - fsl,imx6sx-cm4 @@ -26,6 +28,20 @@ properties: description: Phandle to syscon block which provide access to System Reset Controller + mbox-names: + items: + - const: tx + - const: rx + - const: rxdb + + mboxes: + description: + This property is required only if the rpmsg/virtio functionality is used. + List of <&phandle type channel> - 1 channel for TX, 1 channel for RX, 1 channel for RXDB. + (see mailbox/fsl,mu.yaml) + minItems: 1 + maxItems: 3 + memory-region: description: If present, a phandle for a reserved memory area that used for vdev buffer, @@ -58,4 +74,17 @@ examples: clocks = <&clks IMX7D_ARM_M4_ROOT_CLK>; }; + - | + #include + + imx8mm-cm4 { + compatible = "fsl,imx8mm-cm4"; + clocks = <&clk IMX8MM_CLK_M4_DIV>; + mbox-names = "tx", "rx", "rxdb"; + mboxes = <&mu 0 1 + &mu 1 1 + &mu 3 1>; + memory-region = <&vdev0buffer>, <&vdev0vring0>, <&vdev0vring1>, <&rsc_table>; + syscon = <&src>; + }; ... From 2cfc056ef2c28b4961bff5e2f6deed94afb14024 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:18 +0800 Subject: [PATCH 0033/1379] remoteproc: introduce is_iomem to rproc_mem_entry Introduce is_iomem to indicate this piece memory is iomem or not. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-4-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index f28ee75d1005..a5f6d2d9cde2 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -315,6 +315,7 @@ struct rproc; /** * struct rproc_mem_entry - memory entry descriptor * @va: virtual address + * @is_iomem: io memory * @dma: dma address * @len: length, in bytes * @da: device address @@ -329,6 +330,7 @@ struct rproc; */ struct rproc_mem_entry { void *va; + bool is_iomem; dma_addr_t dma; size_t len; u32 da; From 40df0a91b2a5228ded8e5f75b80d28c96c6831cd Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:19 +0800 Subject: [PATCH 0034/1379] remoteproc: add is_iomem to da_to_va Introduce an extra parameter is_iomem to da_to_va, then the caller could take the memory as normal memory or io mapped memory. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Reported-by: kernel test robot Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-5-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 2 +- drivers/remoteproc/ingenic_rproc.c | 2 +- drivers/remoteproc/keystone_remoteproc.c | 2 +- drivers/remoteproc/mtk_scp.c | 6 +++--- drivers/remoteproc/omap_remoteproc.c | 2 +- drivers/remoteproc/pru_rproc.c | 2 +- drivers/remoteproc/qcom_q6v5_adsp.c | 2 +- drivers/remoteproc/qcom_q6v5_pas.c | 2 +- drivers/remoteproc/qcom_q6v5_wcss.c | 2 +- drivers/remoteproc/qcom_wcnss.c | 2 +- drivers/remoteproc/remoteproc_core.c | 7 +++++-- drivers/remoteproc/remoteproc_coredump.c | 8 ++++++-- drivers/remoteproc/remoteproc_debugfs.c | 2 +- drivers/remoteproc/remoteproc_elf_loader.c | 21 +++++++++++++++------ drivers/remoteproc/remoteproc_internal.h | 2 +- drivers/remoteproc/st_slim_rproc.c | 2 +- drivers/remoteproc/ti_k3_dsp_remoteproc.c | 2 +- drivers/remoteproc/ti_k3_r5_remoteproc.c | 2 +- drivers/remoteproc/wkup_m3_rproc.c | 2 +- include/linux/remoteproc.h | 2 +- 20 files changed, 45 insertions(+), 29 deletions(-) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 8957ed271d20..6603e00bb6f4 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -208,7 +208,7 @@ static int imx_rproc_da_to_sys(struct imx_rproc *priv, u64 da, return -ENOENT; } -static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct imx_rproc *priv = rproc->priv; void *va = NULL; diff --git a/drivers/remoteproc/ingenic_rproc.c b/drivers/remoteproc/ingenic_rproc.c index e2618c36eaab..a356738160a4 100644 --- a/drivers/remoteproc/ingenic_rproc.c +++ b/drivers/remoteproc/ingenic_rproc.c @@ -121,7 +121,7 @@ static void ingenic_rproc_kick(struct rproc *rproc, int vqid) writel(vqid, vpu->aux_base + REG_CORE_MSG); } -static void *ingenic_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *ingenic_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct vpu *vpu = rproc->priv; void __iomem *va = NULL; diff --git a/drivers/remoteproc/keystone_remoteproc.c b/drivers/remoteproc/keystone_remoteproc.c index cd266163a65f..54781f553f4e 100644 --- a/drivers/remoteproc/keystone_remoteproc.c +++ b/drivers/remoteproc/keystone_remoteproc.c @@ -246,7 +246,7 @@ static void keystone_rproc_kick(struct rproc *rproc, int vqid) * can be used either by the remoteproc core for loading (when using kernel * remoteproc loader), or by any rpmsg bus drivers. */ -static void *keystone_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *keystone_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct keystone_rproc *ksproc = rproc->priv; void __iomem *va = NULL; diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c index ce727598c41c..9679cc26895e 100644 --- a/drivers/remoteproc/mtk_scp.c +++ b/drivers/remoteproc/mtk_scp.c @@ -272,7 +272,7 @@ static int scp_elf_load_segments(struct rproc *rproc, const struct firmware *fw) } /* grab the kernel address for this device address */ - ptr = (void __iomem *)rproc_da_to_va(rproc, da, memsz); + ptr = (void __iomem *)rproc_da_to_va(rproc, da, memsz, NULL); if (!ptr) { dev_err(dev, "bad phdr da 0x%x mem 0x%x\n", da, memsz); ret = -EINVAL; @@ -509,7 +509,7 @@ static void *mt8192_scp_da_to_va(struct mtk_scp *scp, u64 da, size_t len) return NULL; } -static void *scp_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *scp_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct mtk_scp *scp = (struct mtk_scp *)rproc->priv; @@ -627,7 +627,7 @@ void *scp_mapping_dm_addr(struct mtk_scp *scp, u32 mem_addr) { void *ptr; - ptr = scp_da_to_va(scp->rproc, mem_addr, 0); + ptr = scp_da_to_va(scp->rproc, mem_addr, 0, NULL); if (!ptr) return ERR_PTR(-EINVAL); diff --git a/drivers/remoteproc/omap_remoteproc.c b/drivers/remoteproc/omap_remoteproc.c index d94b7391bf9d..43531caa1959 100644 --- a/drivers/remoteproc/omap_remoteproc.c +++ b/drivers/remoteproc/omap_remoteproc.c @@ -728,7 +728,7 @@ out: * Return: translated virtual address in kernel memory space on success, * or NULL on failure. */ -static void *omap_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *omap_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct omap_rproc *oproc = rproc->priv; int i; diff --git a/drivers/remoteproc/pru_rproc.c b/drivers/remoteproc/pru_rproc.c index 16b204ed657b..d6086f90e809 100644 --- a/drivers/remoteproc/pru_rproc.c +++ b/drivers/remoteproc/pru_rproc.c @@ -465,7 +465,7 @@ static void *pru_i_da_to_va(struct pru_rproc *pru, u32 da, size_t len) * core for any PRU client drivers. The PRU Instruction RAM access is restricted * only to the PRU loader code. */ -static void *pru_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *pru_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct pru_rproc *pru = rproc->priv; diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c b/drivers/remoteproc/qcom_q6v5_adsp.c index e02450225e4a..8b0d8bbacd2e 100644 --- a/drivers/remoteproc/qcom_q6v5_adsp.c +++ b/drivers/remoteproc/qcom_q6v5_adsp.c @@ -281,7 +281,7 @@ static int adsp_stop(struct rproc *rproc) return ret; } -static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct qcom_adsp *adsp = (struct qcom_adsp *)rproc->priv; int offset; diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index e635454d6170..ef85b5511dc9 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -242,7 +242,7 @@ static int adsp_stop(struct rproc *rproc) return ret; } -static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *adsp_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct qcom_adsp *adsp = (struct qcom_adsp *)rproc->priv; int offset; diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index 78ebe1168b33..704cd63c9af4 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -410,7 +410,7 @@ static int q6v5_wcss_stop(struct rproc *rproc) return 0; } -static void *q6v5_wcss_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *q6v5_wcss_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct q6v5_wcss *wcss = rproc->priv; int offset; diff --git a/drivers/remoteproc/qcom_wcnss.c b/drivers/remoteproc/qcom_wcnss.c index 2a6a23cb14ca..3a131163064c 100644 --- a/drivers/remoteproc/qcom_wcnss.c +++ b/drivers/remoteproc/qcom_wcnss.c @@ -320,7 +320,7 @@ static int wcnss_stop(struct rproc *rproc) return ret; } -static void *wcnss_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *wcnss_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct qcom_wcnss *wcnss = (struct qcom_wcnss *)rproc->priv; int offset; diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 553e42a4d2a0..5071cdbfc926 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -189,13 +189,13 @@ EXPORT_SYMBOL(rproc_va_to_pa); * here the output of the DMA API for the carveouts, which should be more * correct. */ -void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct rproc_mem_entry *carveout; void *ptr = NULL; if (rproc->ops->da_to_va) { - ptr = rproc->ops->da_to_va(rproc, da, len); + ptr = rproc->ops->da_to_va(rproc, da, len, is_iomem); if (ptr) goto out; } @@ -217,6 +217,9 @@ void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) ptr = carveout->va + offset; + if (is_iomem) + *is_iomem = carveout->is_iomem; + break; } diff --git a/drivers/remoteproc/remoteproc_coredump.c b/drivers/remoteproc/remoteproc_coredump.c index 81ec154a6a5e..aee657cc08c6 100644 --- a/drivers/remoteproc/remoteproc_coredump.c +++ b/drivers/remoteproc/remoteproc_coredump.c @@ -153,18 +153,22 @@ static void rproc_copy_segment(struct rproc *rproc, void *dest, size_t offset, size_t size) { void *ptr; + bool is_iomem; if (segment->dump) { segment->dump(rproc, segment, dest, offset, size); } else { - ptr = rproc_da_to_va(rproc, segment->da + offset, size); + ptr = rproc_da_to_va(rproc, segment->da + offset, size, &is_iomem); if (!ptr) { dev_err(&rproc->dev, "invalid copy request for segment %pad with offset %zu and size %zu)\n", &segment->da, offset, size); memset(dest, 0xff, size); } else { - memcpy(dest, ptr, size); + if (is_iomem) + memcpy_fromio(dest, ptr, size); + else + memcpy(dest, ptr, size); } } } diff --git a/drivers/remoteproc/remoteproc_debugfs.c b/drivers/remoteproc/remoteproc_debugfs.c index 7e5845376e9f..b5a1e3b697d9 100644 --- a/drivers/remoteproc/remoteproc_debugfs.c +++ b/drivers/remoteproc/remoteproc_debugfs.c @@ -132,7 +132,7 @@ static ssize_t rproc_trace_read(struct file *filp, char __user *userbuf, char buf[100]; int len; - va = rproc_da_to_va(data->rproc, trace->da, trace->len); + va = rproc_da_to_va(data->rproc, trace->da, trace->len, NULL); if (!va) { len = scnprintf(buf, sizeof(buf), "Trace %s not available\n", diff --git a/drivers/remoteproc/remoteproc_elf_loader.c b/drivers/remoteproc/remoteproc_elf_loader.c index df68d87752e4..11423588965a 100644 --- a/drivers/remoteproc/remoteproc_elf_loader.c +++ b/drivers/remoteproc/remoteproc_elf_loader.c @@ -175,6 +175,7 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw) u64 offset = elf_phdr_get_p_offset(class, phdr); u32 type = elf_phdr_get_p_type(class, phdr); void *ptr; + bool is_iomem; if (type != PT_LOAD) continue; @@ -204,7 +205,7 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw) } /* grab the kernel address for this device address */ - ptr = rproc_da_to_va(rproc, da, memsz); + ptr = rproc_da_to_va(rproc, da, memsz, &is_iomem); if (!ptr) { dev_err(dev, "bad phdr da 0x%llx mem 0x%llx\n", da, memsz); @@ -213,8 +214,12 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw) } /* put the segment where the remote processor expects it */ - if (filesz) - memcpy(ptr, elf_data + offset, filesz); + if (filesz) { + if (is_iomem) + memcpy_fromio(ptr, (void __iomem *)(elf_data + offset), filesz); + else + memcpy(ptr, elf_data + offset, filesz); + } /* * Zero out remaining memory for this segment. @@ -223,8 +228,12 @@ int rproc_elf_load_segments(struct rproc *rproc, const struct firmware *fw) * did this for us. albeit harmless, we may consider removing * this. */ - if (memsz > filesz) - memset(ptr + filesz, 0, memsz - filesz); + if (memsz > filesz) { + if (is_iomem) + memset_io((void __iomem *)(ptr + filesz), 0, memsz - filesz); + else + memset(ptr + filesz, 0, memsz - filesz); + } } return ret; @@ -377,6 +386,6 @@ struct resource_table *rproc_elf_find_loaded_rsc_table(struct rproc *rproc, return NULL; } - return rproc_da_to_va(rproc, sh_addr, sh_size); + return rproc_da_to_va(rproc, sh_addr, sh_size, NULL); } EXPORT_SYMBOL(rproc_elf_find_loaded_rsc_table); diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h index c34002888d2c..9ea37aa687d2 100644 --- a/drivers/remoteproc/remoteproc_internal.h +++ b/drivers/remoteproc/remoteproc_internal.h @@ -84,7 +84,7 @@ static inline void rproc_char_device_remove(struct rproc *rproc) void rproc_free_vring(struct rproc_vring *rvring); int rproc_alloc_vring(struct rproc_vdev *rvdev, int i); -void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len); +void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem); phys_addr_t rproc_va_to_pa(void *cpu_addr); int rproc_trigger_recovery(struct rproc *rproc); diff --git a/drivers/remoteproc/st_slim_rproc.c b/drivers/remoteproc/st_slim_rproc.c index 09bcb4d8b9e0..22096adc1ad3 100644 --- a/drivers/remoteproc/st_slim_rproc.c +++ b/drivers/remoteproc/st_slim_rproc.c @@ -174,7 +174,7 @@ static int slim_rproc_stop(struct rproc *rproc) return 0; } -static void *slim_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *slim_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct st_slim_rproc *slim_rproc = rproc->priv; void *va = NULL; diff --git a/drivers/remoteproc/ti_k3_dsp_remoteproc.c b/drivers/remoteproc/ti_k3_dsp_remoteproc.c index 863c0214e0a8..fd4eb67a6681 100644 --- a/drivers/remoteproc/ti_k3_dsp_remoteproc.c +++ b/drivers/remoteproc/ti_k3_dsp_remoteproc.c @@ -354,7 +354,7 @@ static int k3_dsp_rproc_stop(struct rproc *rproc) * can be used either by the remoteproc core for loading (when using kernel * remoteproc loader), or by any rpmsg bus drivers. */ -static void *k3_dsp_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *k3_dsp_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct k3_dsp_rproc *kproc = rproc->priv; void __iomem *va = NULL; diff --git a/drivers/remoteproc/ti_k3_r5_remoteproc.c b/drivers/remoteproc/ti_k3_r5_remoteproc.c index 62b5a4c29456..5cf8d030a1f0 100644 --- a/drivers/remoteproc/ti_k3_r5_remoteproc.c +++ b/drivers/remoteproc/ti_k3_r5_remoteproc.c @@ -590,7 +590,7 @@ out: * present in a DSP or IPU device). The translated addresses can be used * either by the remoteproc core for loading, or by any rpmsg bus drivers. */ -static void *k3_r5_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *k3_r5_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct k3_r5_rproc *kproc = rproc->priv; struct k3_r5_core *core = kproc->core; diff --git a/drivers/remoteproc/wkup_m3_rproc.c b/drivers/remoteproc/wkup_m3_rproc.c index 92d387dfc03b..484f7605823e 100644 --- a/drivers/remoteproc/wkup_m3_rproc.c +++ b/drivers/remoteproc/wkup_m3_rproc.c @@ -89,7 +89,7 @@ static int wkup_m3_rproc_stop(struct rproc *rproc) return error; } -static void *wkup_m3_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len) +static void *wkup_m3_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *is_iomem) { struct wkup_m3_rproc *wkupm3 = rproc->priv; void *va = NULL; diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index a5f6d2d9cde2..1b7d56c7a453 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -386,7 +386,7 @@ struct rproc_ops { int (*stop)(struct rproc *rproc); int (*attach)(struct rproc *rproc); void (*kick)(struct rproc *rproc, int vqid); - void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len); + void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len, bool *is_iomem); int (*parse_fw)(struct rproc *rproc, const struct firmware *fw); int (*handle_rsc)(struct rproc *rproc, u32 rsc_type, void *rsc, int offset, int avail); From 1896b3d82c555eaec6f6bde0c8d12377060bb22d Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:20 +0800 Subject: [PATCH 0035/1379] remoteproc: imx_rproc: correct err message It is using devm_ioremap, so not devm_ioremap_resource. Correct the error message and print out sa/size. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-6-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 6603e00bb6f4..2a093cea4997 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -268,7 +268,7 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, priv->mem[b].cpu_addr = devm_ioremap(&pdev->dev, att->sa, att->size); if (!priv->mem[b].cpu_addr) { - dev_err(dev, "devm_ioremap_resource failed\n"); + dev_err(dev, "failed to remap %#x bytes from %#x\n", att->size, att->sa); return -ENOMEM; } priv->mem[b].sys_addr = att->sa; @@ -298,7 +298,7 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, priv->mem[b].cpu_addr = devm_ioremap_resource(&pdev->dev, &res); if (IS_ERR(priv->mem[b].cpu_addr)) { - dev_err(dev, "devm_ioremap_resource failed\n"); + dev_err(dev, "failed to remap %pr\n", &res); err = PTR_ERR(priv->mem[b].cpu_addr); return err; } From ecadcc47492cc73a9bb92fbf16098192df514b87 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:21 +0800 Subject: [PATCH 0036/1379] remoteproc: imx_rproc: use devm_ioremap We might need to map an region multiple times, becaue the region might be shared between remote processors, such i.MX8QM with dual M4 cores. So use devm_ioremap, not devm_ioremap_resource. Reviewed-by: Oleksij Rempel Reviewed-by: Richard Zhu Signed-off-by: Peng Fan Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/1615029865-23312-7-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 2a093cea4997..47fc1d06be6a 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -296,7 +296,8 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, if (b >= IMX7D_RPROC_MEM_MAX) break; - priv->mem[b].cpu_addr = devm_ioremap_resource(&pdev->dev, &res); + /* Not use resource version, because we might share region */ + priv->mem[b].cpu_addr = devm_ioremap(&pdev->dev, res.start, resource_size(&res)); if (IS_ERR(priv->mem[b].cpu_addr)) { dev_err(dev, "failed to remap %pr\n", &res); err = PTR_ERR(priv->mem[b].cpu_addr); From b29b4249f8f0cad1a1787cbe59e638ff23d489ed Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:22 +0800 Subject: [PATCH 0037/1379] remoteproc: imx_rproc: add i.MX specific parse fw hook The hook is used to parse memory-regions and load resource table from the address the remote processor published. Reviewed-by: Richard Zhu Reviewed-by: Mathieu Poirier Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-8-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 93 ++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 47fc1d06be6a..5ae1f5209548 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -241,10 +242,102 @@ static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *i return va; } +static int imx_rproc_mem_alloc(struct rproc *rproc, + struct rproc_mem_entry *mem) +{ + struct device *dev = rproc->dev.parent; + void *va; + + dev_dbg(dev, "map memory: %p+%zx\n", &mem->dma, mem->len); + va = ioremap_wc(mem->dma, mem->len); + if (IS_ERR_OR_NULL(va)) { + dev_err(dev, "Unable to map memory region: %p+%zx\n", + &mem->dma, mem->len); + return -ENOMEM; + } + + /* Update memory entry va */ + mem->va = va; + + return 0; +} + +static int imx_rproc_mem_release(struct rproc *rproc, + struct rproc_mem_entry *mem) +{ + dev_dbg(rproc->dev.parent, "unmap memory: %pa\n", &mem->dma); + iounmap(mem->va); + + return 0; +} + +static int imx_rproc_parse_memory_regions(struct rproc *rproc) +{ + struct imx_rproc *priv = rproc->priv; + struct device_node *np = priv->dev->of_node; + struct of_phandle_iterator it; + struct rproc_mem_entry *mem; + struct reserved_mem *rmem; + u32 da; + + /* Register associated reserved memory regions */ + of_phandle_iterator_init(&it, np, "memory-region", NULL, 0); + while (of_phandle_iterator_next(&it) == 0) { + /* + * Ignore the first memory region which will be used vdev buffer. + * No need to do extra handlings, rproc_add_virtio_dev will handle it. + */ + if (!strcmp(it.node->name, "vdev0buffer")) + continue; + + rmem = of_reserved_mem_lookup(it.node); + if (!rmem) { + dev_err(priv->dev, "unable to acquire memory-region\n"); + return -EINVAL; + } + + /* No need to translate pa to da, i.MX use same map */ + da = rmem->base; + + /* Register memory region */ + mem = rproc_mem_entry_init(priv->dev, NULL, (dma_addr_t)rmem->base, rmem->size, da, + imx_rproc_mem_alloc, imx_rproc_mem_release, + it.node->name); + + if (mem) + rproc_coredump_add_segment(rproc, da, rmem->size); + else + return -ENOMEM; + + rproc_add_carveout(rproc, mem); + } + + return 0; +} + +static int imx_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw) +{ + int ret = imx_rproc_parse_memory_regions(rproc); + + if (ret) + return ret; + + ret = rproc_elf_load_rsc_table(rproc, fw); + if (ret) + dev_info(&rproc->dev, "No resource table in elf\n"); + + return 0; +} + static const struct rproc_ops imx_rproc_ops = { .start = imx_rproc_start, .stop = imx_rproc_stop, .da_to_va = imx_rproc_da_to_va, + .load = rproc_elf_load_segments, + .parse_fw = imx_rproc_parse_fw, + .find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table, + .sanity_check = rproc_elf_sanity_check, + .get_boot_addr = rproc_elf_get_boot_addr, }; static int imx_rproc_addr_init(struct imx_rproc *priv, From 4ab8f9607aad6323826c9b945dee52e565975fcc Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:23 +0800 Subject: [PATCH 0038/1379] remoteproc: imx_rproc: support i.MX8MQ/M Add i.MX8MQ dev/sys addr map and configuration data structure i.MX8MM share i.MX8MQ settings. Reviewed-by: Richard Zhu Reviewed-by: Mathieu Poirier Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-9-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/Kconfig | 6 ++--- drivers/remoteproc/imx_rproc.c | 41 +++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index 15d1574d129b..7cf3d1b40c55 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -24,11 +24,11 @@ config REMOTEPROC_CDEV It's safe to say N if you don't want to use this interface. config IMX_REMOTEPROC - tristate "IMX6/7 remoteproc support" + tristate "i.MX remoteproc support" depends on ARCH_MXC help - Say y here to support iMX's remote processors (Cortex M4 - on iMX7D) via the remote processor framework. + Say y here to support iMX's remote processors via the remote + processor framework. It's safe to say N here. diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 5ae1f5209548..0124ebf69838 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -88,6 +88,34 @@ struct imx_rproc { struct clk *clk; }; +static const struct imx_rproc_att imx_rproc_att_imx8mq[] = { + /* dev addr , sys addr , size , flags */ + /* TCML - alias */ + { 0x00000000, 0x007e0000, 0x00020000, 0 }, + /* OCRAM_S */ + { 0x00180000, 0x00180000, 0x00008000, 0 }, + /* OCRAM */ + { 0x00900000, 0x00900000, 0x00020000, 0 }, + /* OCRAM */ + { 0x00920000, 0x00920000, 0x00020000, 0 }, + /* QSPI Code - alias */ + { 0x08000000, 0x08000000, 0x08000000, 0 }, + /* DDR (Code) - alias */ + { 0x10000000, 0x80000000, 0x0FFE0000, 0 }, + /* TCML */ + { 0x1FFE0000, 0x007E0000, 0x00020000, ATT_OWN }, + /* TCMU */ + { 0x20000000, 0x00800000, 0x00020000, ATT_OWN }, + /* OCRAM_S */ + { 0x20180000, 0x00180000, 0x00008000, ATT_OWN }, + /* OCRAM */ + { 0x20200000, 0x00900000, 0x00020000, ATT_OWN }, + /* OCRAM */ + { 0x20220000, 0x00920000, 0x00020000, ATT_OWN }, + /* DDR (Data) */ + { 0x40000000, 0x40000000, 0x80000000, 0 }, +}; + static const struct imx_rproc_att imx_rproc_att_imx7d[] = { /* dev addr , sys addr , size , flags */ /* OCRAM_S (M4 Boot code) - alias */ @@ -138,6 +166,15 @@ static const struct imx_rproc_att imx_rproc_att_imx6sx[] = { { 0x80000000, 0x80000000, 0x60000000, 0 }, }; +static const struct imx_rproc_dcfg imx_rproc_cfg_imx8mq = { + .src_reg = IMX7D_SRC_SCR, + .src_mask = IMX7D_M4_RST_MASK, + .src_start = IMX7D_M4_START, + .src_stop = IMX7D_M4_STOP, + .att = imx_rproc_att_imx8mq, + .att_size = ARRAY_SIZE(imx_rproc_att_imx8mq), +}; + static const struct imx_rproc_dcfg imx_rproc_cfg_imx7d = { .src_reg = IMX7D_SRC_SCR, .src_mask = IMX7D_M4_RST_MASK, @@ -496,6 +533,8 @@ static int imx_rproc_remove(struct platform_device *pdev) static const struct of_device_id imx_rproc_of_match[] = { { .compatible = "fsl,imx7d-cm4", .data = &imx_rproc_cfg_imx7d }, { .compatible = "fsl,imx6sx-cm4", .data = &imx_rproc_cfg_imx6sx }, + { .compatible = "fsl,imx8mq-cm4", .data = &imx_rproc_cfg_imx8mq }, + { .compatible = "fsl,imx8mm-cm4", .data = &imx_rproc_cfg_imx8mq }, {}, }; MODULE_DEVICE_TABLE(of, imx_rproc_of_match); @@ -512,5 +551,5 @@ static struct platform_driver imx_rproc_driver = { module_platform_driver(imx_rproc_driver); MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("IMX6SX/7D remote processor control driver"); +MODULE_DESCRIPTION("i.MX remote processor control driver"); MODULE_AUTHOR("Oleksij Rempel "); From 8f2d8961640f0346cbe892273c3260a0d30c1931 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:24 +0800 Subject: [PATCH 0039/1379] remoteproc: imx_rproc: ignore mapping vdev regions vdev regions are vdev0vring0, vdev0vring1, vdevbuffer and similar. They are handled by remoteproc common code, no need to map in imx rproc driver. Signed-off-by: Peng Fan Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/1615029865-23312-10-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 0124ebf69838..3685bbd135b0 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -417,6 +417,9 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, struct resource res; node = of_parse_phandle(np, "memory-region", a); + /* Not map vdev region */ + if (!strcmp(node->name, "vdev")) + continue; err = of_address_to_resource(node, 0, &res); if (err) { dev_err(dev, "unable to resolve memory region\n"); From 2df7062002d0263bde70b453f671bb4f8493e169 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Sat, 6 Mar 2021 19:24:25 +0800 Subject: [PATCH 0040/1379] remoteproc: imx_proc: enable virtio/mailbox Use virtio/mailbox to build connection between Remote Proccessors and Linux. Add work queue to handle incoming messages. Reviewed-by: Richard Zhu Reviewed-by: Mathieu Poirier Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1615029865-23312-11-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 116 ++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 3 deletions(-) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 3685bbd135b0..5ebb9f57d3e0 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,9 @@ #include #include #include +#include + +#include "remoteproc_internal.h" #define IMX7D_SRC_SCR 0x0C #define IMX7D_ENABLE_M4 BIT(3) @@ -86,6 +90,11 @@ struct imx_rproc { const struct imx_rproc_dcfg *dcfg; struct imx_rproc_mem mem[IMX7D_RPROC_MEM_MAX]; struct clk *clk; + struct mbox_client cl; + struct mbox_chan *tx_ch; + struct mbox_chan *rx_ch; + struct work_struct rproc_work; + struct workqueue_struct *workqueue; }; static const struct imx_rproc_att imx_rproc_att_imx8mq[] = { @@ -366,9 +375,33 @@ static int imx_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw) return 0; } +static void imx_rproc_kick(struct rproc *rproc, int vqid) +{ + struct imx_rproc *priv = rproc->priv; + int err; + __u32 mmsg; + + if (!priv->tx_ch) { + dev_err(priv->dev, "No initialized mbox tx channel\n"); + return; + } + + /* + * Send the index of the triggered virtqueue as the mu payload. + * Let remote processor know which virtqueue is used. + */ + mmsg = vqid << 16; + + err = mbox_send_message(priv->tx_ch, (void *)&mmsg); + if (err < 0) + dev_err(priv->dev, "%s: failed (%d, err:%d)\n", + __func__, vqid, err); +} + static const struct rproc_ops imx_rproc_ops = { .start = imx_rproc_start, .stop = imx_rproc_stop, + .kick = imx_rproc_kick, .da_to_va = imx_rproc_da_to_va, .load = rproc_elf_load_segments, .parse_fw = imx_rproc_parse_fw, @@ -444,6 +477,66 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, return 0; } +static void imx_rproc_vq_work(struct work_struct *work) +{ + struct imx_rproc *priv = container_of(work, struct imx_rproc, + rproc_work); + + rproc_vq_interrupt(priv->rproc, 0); + rproc_vq_interrupt(priv->rproc, 1); +} + +static void imx_rproc_rx_callback(struct mbox_client *cl, void *msg) +{ + struct rproc *rproc = dev_get_drvdata(cl->dev); + struct imx_rproc *priv = rproc->priv; + + queue_work(priv->workqueue, &priv->rproc_work); +} + +static int imx_rproc_xtr_mbox_init(struct rproc *rproc) +{ + struct imx_rproc *priv = rproc->priv; + struct device *dev = priv->dev; + struct mbox_client *cl; + int ret; + + if (!of_get_property(dev->of_node, "mbox-names", NULL)) + return 0; + + cl = &priv->cl; + cl->dev = dev; + cl->tx_block = true; + cl->tx_tout = 100; + cl->knows_txdone = false; + cl->rx_callback = imx_rproc_rx_callback; + + priv->tx_ch = mbox_request_channel_byname(cl, "tx"); + if (IS_ERR(priv->tx_ch)) { + ret = PTR_ERR(priv->tx_ch); + return dev_err_probe(cl->dev, ret, + "failed to request tx mailbox channel: %d\n", ret); + } + + priv->rx_ch = mbox_request_channel_byname(cl, "rx"); + if (IS_ERR(priv->rx_ch)) { + mbox_free_channel(priv->tx_ch); + ret = PTR_ERR(priv->rx_ch); + return dev_err_probe(cl->dev, ret, + "failed to request rx mailbox channel: %d\n", ret); + } + + return 0; +} + +static void imx_rproc_free_mbox(struct rproc *rproc) +{ + struct imx_rproc *priv = rproc->priv; + + mbox_free_channel(priv->tx_ch); + mbox_free_channel(priv->rx_ch); +} + static int imx_rproc_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -481,18 +574,28 @@ static int imx_rproc_probe(struct platform_device *pdev) priv->dev = dev; dev_set_drvdata(dev, rproc); + priv->workqueue = create_workqueue(dev_name(dev)); + if (!priv->workqueue) { + dev_err(dev, "cannot create workqueue\n"); + ret = -ENOMEM; + goto err_put_rproc; + } + + ret = imx_rproc_xtr_mbox_init(rproc); + if (ret) + goto err_put_wkq; ret = imx_rproc_addr_init(priv, pdev); if (ret) { dev_err(dev, "failed on imx_rproc_addr_init\n"); - goto err_put_rproc; + goto err_put_mbox; } priv->clk = devm_clk_get(dev, NULL); if (IS_ERR(priv->clk)) { dev_err(dev, "Failed to get clock\n"); ret = PTR_ERR(priv->clk); - goto err_put_rproc; + goto err_put_mbox; } /* @@ -502,9 +605,11 @@ static int imx_rproc_probe(struct platform_device *pdev) ret = clk_prepare_enable(priv->clk); if (ret) { dev_err(&rproc->dev, "Failed to enable clock\n"); - goto err_put_rproc; + goto err_put_mbox; } + INIT_WORK(&priv->rproc_work, imx_rproc_vq_work); + ret = rproc_add(rproc); if (ret) { dev_err(dev, "rproc_add failed\n"); @@ -515,6 +620,10 @@ static int imx_rproc_probe(struct platform_device *pdev) err_put_clk: clk_disable_unprepare(priv->clk); +err_put_mbox: + imx_rproc_free_mbox(rproc); +err_put_wkq: + destroy_workqueue(priv->workqueue); err_put_rproc: rproc_free(rproc); @@ -528,6 +637,7 @@ static int imx_rproc_remove(struct platform_device *pdev) clk_disable_unprepare(priv->clk); rproc_del(rproc); + imx_rproc_free_mbox(rproc); rproc_free(rproc); return 0; From 59521c3c4b902b3749103cc7b8e64579317173e9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Mar 2021 16:24:46 +0100 Subject: [PATCH 0041/1379] PCI: al: Select CONFIG_PCI_ECAM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compile-testing this driver without ECAM support results in a link failure: ld.lld: error: undefined symbol: pci_ecam_map_bus >>> referenced by pcie-al.c >>> pci/controller/dwc/pcie-al.o:(al_pcie_map_bus) in archive drivers/built-in.a Select CONFIG_ECAM like the other drivers do. Link: https://lore.kernel.org/r/20210308152501.2135937-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- drivers/pci/controller/dwc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 22c5529e9a65..f4b589f16e74 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -311,6 +311,7 @@ config PCIE_AL depends on OF && (ARM64 || COMPILE_TEST) depends on PCI_MSI_IRQ_DOMAIN select PCIE_DW_HOST + select PCI_ECAM help Say Y here to enable support of the Amazon's Annapurna Labs PCIe controller IP on Amazon SoCs. The PCIe controller uses the DesignWare From 16f7ae5906dfbeff54f74ec75d0563bb3a87ab0b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Mar 2021 16:24:47 +0100 Subject: [PATCH 0042/1379] PCI: thunder: Fix compile testing Compile-testing these drivers is currently broken. Enabling it causes a couple of build failures though: drivers/pci/controller/pci-thunder-ecam.c:119:30: error: shift count >= width of type [-Werror,-Wshift-count-overflow] drivers/pci/controller/pci-thunder-pem.c:54:2: error: implicit declaration of function 'writeq' [-Werror,-Wimplicit-function-declaration] drivers/pci/controller/pci-thunder-pem.c:392:8: error: implicit declaration of function 'acpi_get_rc_resources' [-Werror,-Wimplicit-function-declaration] Fix them with the obvious one-line changes. Link: https://lore.kernel.org/r/20210308152501.2135937-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Bjorn Helgaas Reviewed-by: Kuppuswamy Sathyanarayanan Reviewed-by: Robert Richter --- drivers/pci/controller/pci-thunder-ecam.c | 2 +- drivers/pci/controller/pci-thunder-pem.c | 13 +++++++------ drivers/pci/pci.h | 6 ++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/pci-thunder-ecam.c b/drivers/pci/controller/pci-thunder-ecam.c index f964fd26f7e0..ffd84656544f 100644 --- a/drivers/pci/controller/pci-thunder-ecam.c +++ b/drivers/pci/controller/pci-thunder-ecam.c @@ -116,7 +116,7 @@ static int thunder_ecam_p2_config_read(struct pci_bus *bus, unsigned int devfn, * the config space access window. Since we are working with * the high-order 32 bits, shift everything down by 32 bits. */ - node_bits = (cfg->res.start >> 32) & (1 << 12); + node_bits = upper_32_bits(cfg->res.start) & (1 << 12); v |= node_bits; set_val(v, where, size, val); diff --git a/drivers/pci/controller/pci-thunder-pem.c b/drivers/pci/controller/pci-thunder-pem.c index 1a3f70ac61fc..0660b9da204f 100644 --- a/drivers/pci/controller/pci-thunder-pem.c +++ b/drivers/pci/controller/pci-thunder-pem.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "../pci.h" #if defined(CONFIG_PCI_HOST_THUNDER_PEM) || (defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS)) @@ -324,9 +325,9 @@ static int thunder_pem_init(struct device *dev, struct pci_config_window *cfg, * structure here for the BAR. */ bar4_start = res_pem->start + 0xf00000; - pem_pci->ea_entry[0] = (u32)bar4_start | 2; - pem_pci->ea_entry[1] = (u32)(res_pem->end - bar4_start) & ~3u; - pem_pci->ea_entry[2] = (u32)(bar4_start >> 32); + pem_pci->ea_entry[0] = lower_32_bits(bar4_start) | 2; + pem_pci->ea_entry[1] = lower_32_bits(res_pem->end - bar4_start) & ~3u; + pem_pci->ea_entry[2] = upper_32_bits(bar4_start); cfg->priv = pem_pci; return 0; @@ -334,9 +335,9 @@ static int thunder_pem_init(struct device *dev, struct pci_config_window *cfg, #if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) -#define PEM_RES_BASE 0x87e0c0000000UL -#define PEM_NODE_MASK GENMASK(45, 44) -#define PEM_INDX_MASK GENMASK(26, 24) +#define PEM_RES_BASE 0x87e0c0000000ULL +#define PEM_NODE_MASK GENMASK_ULL(45, 44) +#define PEM_INDX_MASK GENMASK_ULL(26, 24) #define PEM_MIN_DOM_IN_NODE 4 #define PEM_MAX_DOM_IN_NODE 10 diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ef7c4661314f..9684b468267f 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -624,6 +624,12 @@ static inline int pci_dev_specific_reset(struct pci_dev *dev, int probe) #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64) int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment, struct resource *res); +#else +static inline int acpi_get_rc_resources(struct device *dev, const char *hid, + u16 segment, struct resource *res) +{ + return -ENODEV; +} #endif int pci_rebar_get_current_size(struct pci_dev *pdev, int bar); From 6e5a1fff9096ecd259dedcbbdc812aa90986a40e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Mar 2021 16:24:48 +0100 Subject: [PATCH 0043/1379] PCI: Avoid building empty drivers There are harmless warnings when compile testing the kernel with CONFIG_TRIM_UNUSED_KSYMS: drivers/pci/controller/dwc/pcie-al.o: no symbols drivers/pci/controller/pci-thunder-ecam.o: no symbols drivers/pci/controller/pci-thunder-pem.o: no symbols The problem here is that the host drivers get built even when the configuration symbols are all disabled, as they pretend to not be drivers but are silently enabled because of the promise that ACPI-based systems need no drivers. Add back the normal symbols to have these drivers built, and change the logic to otherwise only build them when both CONFIG_PCI_QUIRKS and CONFIG_ACPI are enabled. As a side-effect, this enables compile-testing the drivers on other architectures, which in turn needs the acpi_get_rc_resources() function to be defined. Link: https://lore.kernel.org/r/20210308152501.2135937-3-arnd@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Bjorn Helgaas Reviewed-by: Robert Richter --- drivers/pci/controller/Makefile | 7 ++++++- drivers/pci/controller/dwc/Makefile | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index e4559f2182f2..6d24a163033f 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -11,10 +11,13 @@ obj-$(CONFIG_PCIE_RCAR_HOST) += pcie-rcar.o pcie-rcar-host.o obj-$(CONFIG_PCIE_RCAR_EP) += pcie-rcar.o pcie-rcar-ep.o obj-$(CONFIG_PCI_HOST_COMMON) += pci-host-common.o obj-$(CONFIG_PCI_HOST_GENERIC) += pci-host-generic.o +obj-$(CONFIG_PCI_HOST_THUNDER_ECAM) += pci-thunder-ecam.o +obj-$(CONFIG_PCI_HOST_THUNDER_PEM) += pci-thunder-pem.o obj-$(CONFIG_PCIE_XILINX) += pcie-xilinx.o obj-$(CONFIG_PCIE_XILINX_NWL) += pcie-xilinx-nwl.o obj-$(CONFIG_PCIE_XILINX_CPM) += pcie-xilinx-cpm.o obj-$(CONFIG_PCI_V3_SEMI) += pci-v3-semi.o +obj-$(CONFIG_PCI_XGENE) += pci-xgene.o obj-$(CONFIG_PCI_XGENE_MSI) += pci-xgene-msi.o obj-$(CONFIG_PCI_VERSATILE) += pci-versatile.o obj-$(CONFIG_PCIE_IPROC) += pcie-iproc.o @@ -47,8 +50,10 @@ obj-y += mobiveil/ # ARM64 and use internal ifdefs to only build the pieces we need # depending on whether ACPI, the DT driver, or both are enabled. -ifdef CONFIG_PCI +ifdef CONFIG_ACPI +ifdef CONFIG_PCI_QUIRKS obj-$(CONFIG_ARM64) += pci-thunder-ecam.o obj-$(CONFIG_ARM64) += pci-thunder-pem.o obj-$(CONFIG_ARM64) += pci-xgene.o endif +endif diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index a751553fa0db..ba7c42f6df6f 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -31,7 +31,12 @@ obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o # ARM64 and use internal ifdefs to only build the pieces we need # depending on whether ACPI, the DT driver, or both are enabled. -ifdef CONFIG_PCI +obj-$(CONFIG_PCIE_AL) += pcie-al.o +obj-$(CONFIG_PCI_HISI) += pcie-hisi.o + +ifdef CONFIG_ACPI +ifdef CONFIG_PCI_QUIRKS obj-$(CONFIG_ARM64) += pcie-al.o obj-$(CONFIG_ARM64) += pcie-hisi.o endif +endif From 43395d9e091220695d2503fccc6f4fc9785d1bee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Thu, 11 Mar 2021 00:17:17 +0000 Subject: [PATCH 0044/1379] PCI: Fix kernel-doc errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix kernel-doc formatting errors, function names that don't match the doc, and some missing parameter documentation. These are reported by: make W=1 drivers/pci/ No functional change intended. [bhelgaas: squashed into one patch since this only changes comments] Link: https://lore.kernel.org/r/20210311001724.423356-1-kw@linux.com Link: https://lore.kernel.org/r/20210311001724.423356-2-kw@linux.com Link: https://lore.kernel.org/r/20210311001724.423356-3-kw@linux.com Link: https://lore.kernel.org/r/20210311001724.423356-4-kw@linux.com Link: https://lore.kernel.org/r/20210311001724.423356-5-kw@linux.com Link: https://lore.kernel.org/r/20210311001724.423356-6-kw@linux.com Link: https://lore.kernel.org/r/20210311001724.423356-7-kw@linux.com Link: https://lore.kernel.org/r/20210311001724.423356-8-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/ats.c | 2 +- drivers/pci/controller/cadence/pci-j721e.c | 2 +- drivers/pci/controller/dwc/pci-keystone.c | 11 +++++++--- drivers/pci/endpoint/functions/pci-epf-ntb.c | 16 +++++++++----- drivers/pci/endpoint/functions/pci-epf-test.c | 2 +- drivers/pci/endpoint/pci-epc-core.c | 2 ++ drivers/pci/hotplug/acpi_pcihp.c | 2 +- drivers/pci/of.c | 22 +++++++++++++++---- drivers/pci/pcie/aer.c | 6 ++--- drivers/pci/pcie/pme.c | 2 +- 10 files changed, 47 insertions(+), 20 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 0d3719407b8b..6d7d64939f82 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -480,7 +480,7 @@ EXPORT_SYMBOL_GPL(pci_pasid_features); #define PASID_NUMBER_SHIFT 8 #define PASID_NUMBER_MASK (0x1f << PASID_NUMBER_SHIFT) /** - * pci_max_pasid - Get maximum number of PASIDs supported by device + * pci_max_pasids - Get maximum number of PASIDs supported by device * @pdev: PCI device structure * * Returns negative value when PASID capability is not present. diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c index 849f1e416ea5..f1eef67e9526 100644 --- a/drivers/pci/controller/cadence/pci-j721e.c +++ b/drivers/pci/controller/cadence/pci-j721e.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * pci-j721e - PCIe controller driver for TI's J721E SoCs * * Copyright (C) 2020 Texas Instruments Incorporated - http://www.ti.com diff --git a/drivers/pci/controller/dwc/pci-keystone.c b/drivers/pci/controller/dwc/pci-keystone.c index 53aa35cb3a49..6745e69b7020 100644 --- a/drivers/pci/controller/dwc/pci-keystone.c +++ b/drivers/pci/controller/dwc/pci-keystone.c @@ -346,8 +346,9 @@ static const struct irq_domain_ops ks_pcie_legacy_irq_domain_ops = { }; /** - * ks_pcie_set_dbi_mode() - Set DBI mode to access overlaid BAR mask - * registers + * ks_pcie_set_dbi_mode() - Set DBI mode to access overlaid BAR mask registers + * @ks_pcie: A pointer to the keystone_pcie structure which holds the KeyStone + * PCIe host controller driver information. * * Since modification of dbi_cs2 involves different clock domain, read the * status back to ensure the transition is complete. @@ -367,6 +368,8 @@ static void ks_pcie_set_dbi_mode(struct keystone_pcie *ks_pcie) /** * ks_pcie_clear_dbi_mode() - Disable DBI mode + * @ks_pcie: A pointer to the keystone_pcie structure which holds the KeyStone + * PCIe host controller driver information. * * Since modification of dbi_cs2 involves different clock domain, read the * status back to ensure the transition is complete. @@ -449,6 +452,7 @@ static struct pci_ops ks_child_pcie_ops = { /** * ks_pcie_v3_65_add_bus() - keystone add_bus post initialization + * @bus: A pointer to the PCI bus structure. * * This sets BAR0 to enable inbound access for MSI_IRQ register */ @@ -488,6 +492,8 @@ static struct pci_ops ks_pcie_ops = { /** * ks_pcie_link_up() - Check if link up + * @pci: A pointer to the dw_pcie structure which holds the DesignWare PCIe host + * controller driver information. */ static int ks_pcie_link_up(struct dw_pcie *pci) { @@ -605,7 +611,6 @@ static void ks_pcie_msi_irq_handler(struct irq_desc *desc) /** * ks_pcie_legacy_irq_handler() - Handle legacy interrupt - * @irq: IRQ line for legacy interrupts * @desc: Pointer to irq descriptor * * Traverse through pending legacy interrupts and invoke handler for each. Also diff --git a/drivers/pci/endpoint/functions/pci-epf-ntb.c b/drivers/pci/endpoint/functions/pci-epf-ntb.c index 338148cf56f5..bce274d02dcf 100644 --- a/drivers/pci/endpoint/functions/pci-epf-ntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-ntb.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * Endpoint Function Driver to implement Non-Transparent Bridge functionality * * Copyright (C) 2020 Texas Instruments @@ -696,7 +696,8 @@ reset_handler: /** * epf_ntb_peer_spad_bar_clear() - Clear Peer Scratchpad BAR - * @ntb: NTB device that facilitates communication between HOST1 and HOST2 + * @ntb_epc: EPC associated with one of the HOST which holds peer's outbound + * address. * *+-----------------+------->+------------------+ +-----------------+ *| BAR0 | | CONFIG REGION | | BAR0 | @@ -740,6 +741,7 @@ static void epf_ntb_peer_spad_bar_clear(struct epf_ntb_epc *ntb_epc) /** * epf_ntb_peer_spad_bar_set() - Set peer scratchpad BAR * @ntb: NTB device that facilitates communication between HOST1 and HOST2 + * @type: PRIMARY interface or SECONDARY interface * *+-----------------+------->+------------------+ +-----------------+ *| BAR0 | | CONFIG REGION | | BAR0 | @@ -808,7 +810,8 @@ static int epf_ntb_peer_spad_bar_set(struct epf_ntb *ntb, /** * epf_ntb_config_sspad_bar_clear() - Clear Config + Self scratchpad BAR - * @ntb: NTB device that facilitates communication between HOST1 and HOST2 + * @ntb_epc: EPC associated with one of the HOST which holds peer's outbound + * address. * * +-----------------+------->+------------------+ +-----------------+ * | BAR0 | | CONFIG REGION | | BAR0 | @@ -851,7 +854,8 @@ static void epf_ntb_config_sspad_bar_clear(struct epf_ntb_epc *ntb_epc) /** * epf_ntb_config_sspad_bar_set() - Set Config + Self scratchpad BAR - * @ntb: NTB device that facilitates communication between HOST1 and HOST2 + * @ntb_epc: EPC associated with one of the HOST which holds peer's outbound + * address. * * +-----------------+------->+------------------+ +-----------------+ * | BAR0 | | CONFIG REGION | | BAR0 | @@ -1312,6 +1316,7 @@ static int epf_ntb_configure_interrupt(struct epf_ntb *ntb, /** * epf_ntb_alloc_peer_mem() - Allocate memory in peer's outbound address space + * @dev: The PCI device. * @ntb_epc: EPC associated with one of the HOST whose BAR holds peer's outbound * address * @bar: BAR of @ntb_epc in for which memory has to be allocated (could be @@ -1660,7 +1665,6 @@ static int epf_ntb_init_epc_bar_interface(struct epf_ntb *ntb, * epf_ntb_init_epc_bar() - Identify BARs to be used for each of the NTB * constructs (scratchpad region, doorbell, memorywindow) * @ntb: NTB device that facilitates communication between HOST1 and HOST2 - * @type: PRIMARY interface or SECONDARY interface * * Wrapper to epf_ntb_init_epc_bar_interface() to identify the free BARs * to be used for each of BAR_CONFIG, BAR_PEER_SPAD, BAR_DB_MW1, BAR_MW2, @@ -2037,6 +2041,8 @@ static const struct config_item_type ntb_group_type = { /** * epf_ntb_add_cfs() - Add configfs directory specific to NTB * @epf: NTB endpoint function device + * @group: A pointer to the config_group structure referencing a group of + * config_items of a specific type that belong to a specific sub-system. * * Add configfs directory specific to NTB. This directory will hold * NTB specific properties like db_count, spad_count, num_mws etc., diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index c0ac4e9cbe72..63d5f5c6e3e0 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * Test driver to test endpoint functionality * * Copyright (C) 2017 Texas Instruments diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index cc8f9eb2b177..adec9bee72cf 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -594,6 +594,8 @@ EXPORT_SYMBOL_GPL(pci_epc_add_epf); * pci_epc_remove_epf() - remove PCI endpoint function from endpoint controller * @epc: the EPC device from which the endpoint function should be removed * @epf: the endpoint function to be removed + * @type: identifies if the EPC is connected to the primary or secondary + * interface of EPF * * Invoke to remove PCI endpoint function from the endpoint controller. */ diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index 2750a64cecd3..4fedebf2f8c1 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -157,7 +157,7 @@ static int pcihp_is_ejectable(acpi_handle handle) } /** - * acpi_pcihp_check_ejectable - check if handle is ejectable ACPI PCI slot + * acpi_pci_check_ejectable - check if handle is ejectable ACPI PCI slot * @pbus: the PCI bus of the PCI slot corresponding to 'handle' * @handle: ACPI handle to check * diff --git a/drivers/pci/of.c b/drivers/pci/of.c index 5ea472ae22ac..da5b414d585a 100644 --- a/drivers/pci/of.c +++ b/drivers/pci/of.c @@ -190,10 +190,18 @@ int of_pci_parse_bus_range(struct device_node *node, struct resource *res) EXPORT_SYMBOL_GPL(of_pci_parse_bus_range); /** - * This function will try to obtain the host bridge domain number by - * finding a property called "linux,pci-domain" of the given device node. + * of_get_pci_domain_nr - Find the host bridge domain number + * of the given device node. + * @node: Device tree node with the domain information. * - * @node: device tree node with the domain information + * This function will try to obtain the host bridge domain number by finding + * a property called "linux,pci-domain" of the given device node. + * + * Return: + * * > 0 - On success, an associated domain number. + * * -EINVAL - The property "linux,pci-domain" does not exist. + * * -ENODATA - The linux,pci-domain" property does not have value. + * * -EOVERFLOW - Invalid "linux,pci-domain" property value. * * Returns the associated domain number from DT in the range [0-0xffff], or * a negative value if the required property is not found. @@ -585,10 +593,16 @@ int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge) #endif /* CONFIG_PCI */ /** + * of_pci_get_max_link_speed - Find the maximum link speed of the given device node. + * @node: Device tree node with the maximum link speed information. + * * This function will try to find the limitation of link speed by finding * a property called "max-link-speed" of the given device node. * - * @node: device tree node with the max link speed information + * Return: + * * > 0 - On success, a maximum link speed. + * * -EINVAL - Invalid "max-link-speed" property value, or failure to access + * the property of the device tree node. * * Returns the associated max link speed from DT, or a negative value if the * required property is not found or is invalid. diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index ba22388342d1..ec943cee5ecc 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -129,7 +129,7 @@ static const char * const ecrc_policy_str[] = { }; /** - * enable_ercr_checking - enable PCIe ECRC checking for a device + * enable_ecrc_checking - enable PCIe ECRC checking for a device * @dev: the PCI device * * Returns 0 on success, or negative on failure. @@ -153,7 +153,7 @@ static int enable_ecrc_checking(struct pci_dev *dev) } /** - * disable_ercr_checking - disables PCIe ECRC checking for a device + * disable_ecrc_checking - disables PCIe ECRC checking for a device * @dev: the PCI device * * Returns 0 on success, or negative on failure. @@ -1442,7 +1442,7 @@ static struct pcie_port_service_driver aerdriver = { }; /** - * aer_service_init - register AER root service driver + * pcie_aer_init - register AER root service driver * * Invoked when AER root service driver is loaded. */ diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index 3fc08488d65f..1d0dd77fed3a 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -463,7 +463,7 @@ static struct pcie_port_service_driver pcie_pme_driver = { }; /** - * pcie_pme_service_init - Register the PCIe PME service driver. + * pcie_pme_init - Register the PCIe PME service driver. */ int __init pcie_pme_init(void) { From 2a4bd0c37c87f9f82b6265ec7e716d2a2d4b0c71 Mon Sep 17 00:00:00 2001 From: jiahao Date: Fri, 19 Feb 2021 20:46:32 +0800 Subject: [PATCH 0045/1379] f2fs: fix a spacing coding style Add a space before the plus. Signed-off-by: jiahao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 174a0819ad96..d0e5ef7e232e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1456,7 +1456,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) orphan_blocks); if (__remain_node_summaries(cpc->reason)) - ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); else From c67c8c0f47eb0bf49d6cf165389554e379443968 Mon Sep 17 00:00:00 2001 From: xuyehan Date: Tue, 23 Feb 2021 09:31:43 +0800 Subject: [PATCH 0046/1379] f2fs: fix a spelling error Delete the letter 'e' before 'number' Signed-off-by: xuyehan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index cbeac1bebe2f..9fa5a528cc23 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -276,7 +276,7 @@ Date April 2019 Contact: "Daniel Rosenberg" Description: If checkpoint=disable, it displays the number of blocks that are unusable. - If checkpoint=enable it displays the enumber of blocks that + If checkpoint=enable it displays the number of blocks that would be unusable if checkpoint=disable were to be set. What: /sys/fs/f2fs//encoding From 7dede88659df38f96128ab3922c50dde2d29c574 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:35:40 +0800 Subject: [PATCH 0047/1379] f2fs: fix to allow migrating fully valid segment F2FS_IOC_FLUSH_DEVICE/F2FS_IOC_RESIZE_FS needs to migrate all blocks of target segment to other place, no matter the segment has partially or fully valid blocks. However, after commit 803e74be04b3 ("f2fs: stop GC when the victim becomes fully valid"), we may skip migration due to target segment is fully valid, result in failing the ioctl interface, fix this. Fixes: 803e74be04b3 ("f2fs: stop GC when the victim becomes fully valid") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 9 +++++---- fs/f2fs/gc.c | 21 ++++++++++++--------- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 2 +- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e2d302ae3a46..cccdfb1a40ab 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3547,7 +3547,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, unsigned int segno); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d26ff2ae3f5e..1863944f4073 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1658,7 +1658,7 @@ next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); + err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err && err != -ENODATA && err != -EAGAIN) goto out_err; } @@ -2489,7 +2489,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); + ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); out: mnt_drop_write_file(filp); return ret; @@ -2525,7 +2525,8 @@ do_more: down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, GET_SEGNO(sbi, range->start)); + ret = f2fs_gc(sbi, range->sync, true, false, + GET_SEGNO(sbi, range->start)); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -2978,7 +2979,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, start_segno); + ret = f2fs_gc(sbi, true, true, true, start_segno); if (ret == -EAGAIN) ret = 0; else if (ret < 0) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 39330ad3c44e..b3af76340026 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -112,7 +112,7 @@ do_gc: sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, true, false, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -1354,7 +1354,8 @@ out: * the victim data block is ignored. */ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, - struct gc_inode_list *gc_list, unsigned int segno, int gc_type) + struct gc_inode_list *gc_list, unsigned int segno, int gc_type, + bool force_migrate) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; @@ -1383,8 +1384,8 @@ next_step: * race condition along with SSR block allocation. */ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || - get_valid_blocks(sbi, segno, true) == - BLKS_PER_SEC(sbi)) + (!force_migrate && get_valid_blocks(sbi, segno, true) == + BLKS_PER_SEC(sbi))) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1519,7 +1520,8 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int start_segno, - struct gc_inode_list *gc_list, int gc_type) + struct gc_inode_list *gc_list, int gc_type, + bool force_migrate) { struct page *sum_page; struct f2fs_summary_block *sum; @@ -1606,7 +1608,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); else submitted += gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); + segno, gc_type, + force_migrate); stat_inc_seg_count(sbi, type, gc_type); migrated++; @@ -1634,7 +1637,7 @@ skip: } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, unsigned int segno) + bool background, bool force, unsigned int segno) { int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0, seg_freed = 0, total_freed = 0; @@ -1696,7 +1699,7 @@ gc_more: if (ret) goto stop; - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); if (gc_type == FG_GC && seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) sec_freed++; @@ -1835,7 +1838,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - do_garbage_collect(sbi, segno, &gc_list, FG_GC); + do_garbage_collect(sbi, segno, &gc_list, FG_GC, true); put_gc_inode(&gc_list); if (!gc_only && get_valid_blocks(sbi, segno, true)) { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 993004f06a77..b8c20d29431d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -504,7 +504,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, NULL_SEGNO); + f2fs_gc(sbi, false, false, false, NULL_SEGNO); } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7069793752f1..a17a4dd8d449 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1865,7 +1865,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); + err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err == -ENODATA) { err = 0; break; From 3ab0598e6d860ef49d029943ba80f627c15c15d6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:35:41 +0800 Subject: [PATCH 0048/1379] f2fs: fix panic during f2fs_resize_fs() f2fs_resize_fs() hangs in below callstack with testcase: - mkfs 16GB image & mount image - dd 8GB fileA - dd 8GB fileB - sync - rm fileA - sync - resize filesystem to 8GB kernel BUG at segment.c:2484! Call Trace: allocate_segment_by_default+0x92/0xf0 [f2fs] f2fs_allocate_data_block+0x44b/0x7e0 [f2fs] do_write_page+0x5a/0x110 [f2fs] f2fs_outplace_write_data+0x55/0x100 [f2fs] f2fs_do_write_data_page+0x392/0x850 [f2fs] move_data_page+0x233/0x320 [f2fs] do_garbage_collect+0x14d9/0x1660 [f2fs] free_segment_range+0x1f7/0x310 [f2fs] f2fs_resize_fs+0x118/0x330 [f2fs] __f2fs_ioctl+0x487/0x3680 [f2fs] __x64_sys_ioctl+0x8e/0xd0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The root cause is we forgot to check that whether we have enough space in resized filesystem to store all valid blocks in before-resizing filesystem, then allocator will run out-of-space during block migration in free_segment_range(). Fixes: b4b10061ef98 ("f2fs: refactor resize_fs to avoid meta updates in progress") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b3af76340026..86ba8ed0b8a7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1977,7 +1977,20 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) /* stop CP to protect MAIN_SEC in free_segment_range */ f2fs_lock_op(sbi); + + spin_lock(&sbi->stat_lock); + if (shrunk_blocks + valid_user_blocks(sbi) + + sbi->current_reserved_blocks + sbi->unusable_block_count + + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) + err = -ENOSPC; + spin_unlock(&sbi->stat_lock); + + if (err) + goto out_unlock; + err = free_segment_range(sbi, secs, true); + +out_unlock: f2fs_unlock_op(sbi); up_write(&sbi->gc_lock); if (err) From cd6ee739b8ee49cf5f3d7c9a0f663f9f0c5afe1b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:38:41 +0800 Subject: [PATCH 0049/1379] f2fs: avoid unused f2fs_show_compress_options() LKP reports: fs/f2fs/super.c:1516:20: warning: unused function 'f2fs_show_compress_options' [-Wunused-function] static inline void f2fs_show_compress_options(struct seq_file *seq, Fix this issue by covering f2fs_show_compress_options() with CONFIG_F2FS_FS_COMPRESSION macro. Fixes: 4c8ff7095bef ("f2fs: support data compression") Reported-by: kernel test robot Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a17a4dd8d449..e03b2f0f69d0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1616,6 +1616,7 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #endif } +#ifdef CONFIG_F2FS_FS_COMPRESSION static inline void f2fs_show_compress_options(struct seq_file *seq, struct super_block *sb) { @@ -1661,6 +1662,7 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) seq_printf(seq, ",compress_mode=%s", "user"); } +#endif static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { From 4831675c6be59dbe8e0b2a53dc237111f9307a4b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:38:42 +0800 Subject: [PATCH 0050/1379] f2fs: remove unused FORCE_FG_GC macro FORCE_FG_GC was introduced by commit 6aefd93b0137 ("f2fs: introduce background_gc=sync mount option"), but never be used, remove it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 229814b4f4a6..144980b62f9e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -172,12 +172,10 @@ enum { /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. - * FORCE_FG_GC means on-demand cleaning job in background. */ enum { BG_GC = 0, FG_GC, - FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ From 3b42c741b1bf52ee9ed6fba5f9636d80ddacf73f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 20 Feb 2021 17:38:43 +0800 Subject: [PATCH 0051/1379] f2fs: update comments for explicit memory barrier Add more detailed comments for explicit memory barrier used by f2fs, in order to enhance code readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- fs/f2fs/segment.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d0e5ef7e232e..f6169611270f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1818,7 +1818,11 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) llist_add(&req.llnode, &cprc->issue_list); atomic_inc(&cprc->queued_ckpt); - /* update issue_list before we wake up issue_checkpoint thread */ + /* + * update issue_list before we wake up issue_checkpoint thread, + * this smp_mb() pairs with another barrier in ___wait_event(), + * see more details in comments of waitqueue_active(). + */ smp_mb(); if (waitqueue_active(&cprc->ckpt_wait_queue)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b8c20d29431d..29403540ff6e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -653,7 +653,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) llist_add(&cmd.llnode, &fcc->issue_list); - /* update issue_list before we wake up issue_flush thread */ + /* + * update issue_list before we wake up issue_flush thread, this + * smp_mb() pairs with another barrier in ___wait_event(), see + * more details in comments of waitqueue_active(). + */ smp_mb(); if (waitqueue_active(&fcc->flush_wait_queue)) From 43f8c47ea7d59c7b2270835f1d7c4618a1ea27b6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Feb 2021 18:07:33 +0800 Subject: [PATCH 0052/1379] f2fs: check discard command number before traversing discard pending list In trim thread, let's add a condition to check discard command number before traversing discard pending list, it can avoid unneeded traversing if there is no discard command. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 29403540ff6e..b5a40a39a03f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1759,6 +1759,8 @@ static int issue_discard_thread(void *data) wait_ms = dpolicy.max_interval; continue; } + if (!atomic_read(&dcc->discard_cmd_cnt)) + continue; if (sbi->gc_mode == GC_URGENT_HIGH) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); From 72f85881249e3a7403434631b9a9f934cdd1a83d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 26 Feb 2021 16:51:42 +0100 Subject: [PATCH 0053/1379] f2fs: compress: Allow modular (de)compression algorithms If F2FS_FS is modular, enabling the compressions options F2FS_FS_{LZ4,LZ4HZ,LZO,LZORLE,ZSTD} will make the (de)compression algorithms {LZ4,LZ4HC,LZO,ZSTD}_{,DE}COMPRESS builtin instead of modular, as the former depend on an intermediate boolean F2FS_FS_COMPRESSION, which in-turn depends on tristate F2FS_FS. Indeed, if a boolean symbol A depends directly on a tristate symbol B and selects another tristate symbol C: tristate B tristate C bool A depends on B select C and B is modular, then C will also be modular. However, if there is an intermediate boolean D in the dependency chain between A and B: tristate B tristate C bool D depends on B bool A depends on D select C then the modular state won't propagate from B to C, and C will be builtin instead of modular. As modular dependency propagation through intermediate symbols is obscure, fix this in a robust way by moving the selection of tristate (de)compression algorithms from the boolean compression options to the tristate main F2FS_FS option. Signed-off-by: Geert Uytterhoeven Reviewed-by: Chao Yu Reviewed-by: Masahiro Yamada Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 62e638a49bbf..7669de7b49ce 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -7,6 +7,13 @@ config F2FS_FS select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select LZ4_COMPRESS if F2FS_FS_LZ4 + select LZ4_DECOMPRESS if F2FS_FS_LZ4 + select LZ4HC_COMPRESS if F2FS_FS_LZ4HC + select LZO_COMPRESS if F2FS_FS_LZO + select LZO_DECOMPRESS if F2FS_FS_LZO + select ZSTD_COMPRESS if F2FS_FS_ZSTD + select ZSTD_DECOMPRESS if F2FS_FS_ZSTD help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -94,8 +101,6 @@ config F2FS_FS_COMPRESSION config F2FS_FS_LZO bool "LZO compression support" depends on F2FS_FS_COMPRESSION - select LZO_COMPRESS - select LZO_DECOMPRESS default y help Support LZO compress algorithm, if unsure, say Y. @@ -103,8 +108,6 @@ config F2FS_FS_LZO config F2FS_FS_LZ4 bool "LZ4 compression support" depends on F2FS_FS_COMPRESSION - select LZ4_COMPRESS - select LZ4_DECOMPRESS default y help Support LZ4 compress algorithm, if unsure, say Y. @@ -113,7 +116,6 @@ config F2FS_FS_LZ4HC bool "LZ4HC compression support" depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZ4 - select LZ4HC_COMPRESS default y help Support LZ4HC compress algorithm, LZ4HC has compatible on-disk @@ -122,8 +124,6 @@ config F2FS_FS_LZ4HC config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION - select ZSTD_COMPRESS - select ZSTD_DECOMPRESS default y help Support ZSTD compress algorithm, if unsure, say Y. @@ -132,8 +132,6 @@ config F2FS_FS_LZORLE bool "LZO-RLE compression support" depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZO - select LZO_COMPRESS - select LZO_DECOMPRESS default y help Support LZO-RLE compress algorithm, if unsure, say Y. From 4260c4067fbba55a90037fe3ee32eff087749f83 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 24 Feb 2021 13:03:13 -0600 Subject: [PATCH 0054/1379] f2fs: Replace one-element array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Refactor the code according to the use of a flexible-array member in struct f2fs_checkpoint, instead of a one-element arrays. Notice that a temporary pointer to void '*tmp_ptr' was used in order to fix the following errors when using a flexible array instead of a one element array in struct f2fs_checkpoint: CC [M] fs/f2fs/dir.o In file included from fs/f2fs/dir.c:13: fs/f2fs/f2fs.h: In function ‘__bitmap_ptr’: fs/f2fs/f2fs.h:2227:40: error: invalid use of flexible array member 2227 | return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); | ^ fs/f2fs/f2fs.h:2227:49: error: invalid use of flexible array member 2227 | return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); | ^ fs/f2fs/f2fs.h:2238:40: error: invalid use of flexible array member 2238 | return &ckpt->sit_nat_version_bitmap + offset; | ^ make[2]: *** [scripts/Makefile.build:287: fs/f2fs/dir.o] Error 1 make[1]: *** [scripts/Makefile.build:530: fs/f2fs] Error 2 make: *** [Makefile:1819: fs] Error 2 [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Build-tested-by: kernel test robot Link: https://lore.kernel.org/lkml/603647e4.DeEFbl4eqljuwAUe%25lkp@intel.com/ Signed-off-by: Gustavo A. R. Silva Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- include/linux/f2fs_fs.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cccdfb1a40ab..99e243fd26d5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2215,6 +2215,7 @@ static inline block_t __cp_payload(struct f2fs_sb_info *sbi) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + void *tmp_ptr = &ckpt->sit_nat_version_bitmap; int offset; if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { @@ -2224,7 +2225,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) * if large_nat_bitmap feature is enabled, leave checksum * protection for all nat/sit bitmaps. */ - return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); + return tmp_ptr + offset + sizeof(__le32); } if (__cp_payload(sbi) > 0) { @@ -2235,7 +2236,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) } else { offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return tmp_ptr + offset; } } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index c6cc0a566ef5..5487a80617a3 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -168,7 +168,7 @@ struct f2fs_checkpoint { unsigned char alloc_type[MAX_ACTIVE_LOGS]; /* SIT and NAT version bitmap */ - unsigned char sit_nat_version_bitmap[1]; + unsigned char sit_nat_version_bitmap[]; } __packed; #define CP_CHKSUM_OFFSET 4092 /* default chksum offset in checkpoint */ From ebc29b62a166e9116cd8159e9798044d02130279 Mon Sep 17 00:00:00 2001 From: "huangjianan@oppo.com" Date: Sat, 27 Feb 2021 20:02:29 +0800 Subject: [PATCH 0055/1379] f2fs: remove unnecessary IS_SWAPFILE check Now swapfile in f2fs directly submit IO to blockdev according to swapfile extents reported by f2fs when swapon, therefore there is no need to check IS_SWAPFILE when exec filesystem operation. Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7c95818639a6..1531463768bf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1722,7 +1722,7 @@ static int get_data_block_dio_write(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DIO, NULL, f2fs_rw_hint_to_seg_type(inode->i_write_hint), - IS_SWAPFILE(inode) ? false : true); + true); } static int get_data_block_dio(struct inode *inode, sector_t iblock, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 99e243fd26d5..cd004424debc 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4177,8 +4177,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, if (F2FS_IO_ALIGNED(sbi)) return true; } - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) && - !IS_SWAPFILE(inode)) + if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) return true; return false; From 1da66103837077df70ddf7a49c46dfd025001a60 Mon Sep 17 00:00:00 2001 From: "huangjianan@oppo.com" Date: Sat, 27 Feb 2021 20:02:30 +0800 Subject: [PATCH 0056/1379] f2fs: fix last_lblock check in check_swap_activate_fast Because page_no < sis->max guarantees that the while loop break out normally, the wrong check contidion here doesn't cause a problem. Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1531463768bf..398498fb99d5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3803,7 +3803,7 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, last_lblock = bytes_to_blks(inode, i_size_read(inode)); len = i_size_read(inode); - while (cur_lblock <= last_lblock && cur_lblock < sis->max) { + while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; pgoff_t next_pgofs; From 36e4d95891ed37eb98a660babec749be3fb35fd9 Mon Sep 17 00:00:00 2001 From: "huangjianan@oppo.com" Date: Mon, 1 Mar 2021 12:58:44 +0800 Subject: [PATCH 0057/1379] f2fs: check if swapfile is section-alligned If the swapfile isn't created by pin and fallocate, it can't be guaranteed section-aligned, so it may be selected by f2fs gc. When gc_pin_file_threshold is reached, the address of swapfile may change, but won't be synchronized to swap_extent, so swap will write to wrong address, which will cause data corruption. Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 109 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 88 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 398498fb99d5..0e749cf60e11 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3780,11 +3780,64 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP +static int f2fs_is_file_aligned(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + block_t main_blkaddr = SM_I(sbi)->main_blkaddr; + block_t cur_lblock; + block_t last_lblock; + block_t pblock; + unsigned long nr_pblocks; + unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + int ret = 0; + + cur_lblock = 0; + last_lblock = bytes_to_blks(inode, i_size_read(inode)); + + while (cur_lblock < last_lblock) { + struct f2fs_map_blocks map; + + memset(&map, 0, sizeof(map)); + map.m_lblk = cur_lblock; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + if (ret) + goto out; + + /* hole */ + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes\n"); + ret = -ENOENT; + goto out; + } + + pblock = map.m_pblk; + nr_pblocks = map.m_len; + + if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || + nr_pblocks & (blocks_per_sec - 1)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + + cur_lblock += nr_pblocks; + } +out: + return ret; +} + static int check_swap_activate_fast(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); sector_t cur_lblock; sector_t last_lblock; sector_t pblock; @@ -3792,8 +3845,8 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; - u64 len; - int ret; + unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + int ret = 0; /* * Map all the blocks into the extent list. This code doesn't try @@ -3801,31 +3854,41 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, */ cur_lblock = 0; last_lblock = bytes_to_blks(inode, i_size_read(inode)); - len = i_size_read(inode); while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; - pgoff_t next_pgofs; cond_resched(); memset(&map, 0, sizeof(map)); map.m_lblk = cur_lblock; - map.m_len = bytes_to_blks(inode, len) - cur_lblock; - map.m_next_pgofs = &next_pgofs; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) - goto err_out; + goto out; /* hole */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) - goto err_out; + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes\n"); + ret = -ENOENT; + goto out; + } pblock = map.m_pblk; nr_pblocks = map.m_len; + if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || + nr_pblocks & (blocks_per_sec - 1)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; @@ -3854,9 +3917,6 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->highest_bit = cur_lblock - 1; out: return ret; -err_out: - pr_err("swapon: swapfile has holes\n"); - return -EINVAL; } /* Copied from generic_swapfile_activate() to check any holes */ @@ -3865,6 +3925,7 @@ static int check_swap_activate(struct swap_info_struct *sis, { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned blocks_per_page; unsigned long page_no; sector_t probe_block; @@ -3872,11 +3933,15 @@ static int check_swap_activate(struct swap_info_struct *sis, sector_t lowest_block = -1; sector_t highest_block = 0; int nr_extents = 0; - int ret; + int ret = 0; if (PAGE_SIZE == F2FS_BLKSIZE) return check_swap_activate_fast(sis, swap_file, span); + ret = f2fs_is_file_aligned(inode); + if (ret) + goto out; + blocks_per_page = bytes_to_blks(inode, PAGE_SIZE); /* @@ -3891,13 +3956,14 @@ static int check_swap_activate(struct swap_info_struct *sis, unsigned block_in_page; sector_t first_block; sector_t block = 0; - int err = 0; cond_resched(); block = probe_block; - err = bmap(inode, &block); - if (err || !block) + ret = bmap(inode, &block); + if (ret) + goto out; + if (!block) goto bad_bmap; first_block = block; @@ -3913,9 +3979,10 @@ static int check_swap_activate(struct swap_info_struct *sis, block_in_page++) { block = probe_block + block_in_page; - err = bmap(inode, &block); - - if (err || !block) + ret = bmap(inode, &block); + if (ret) + goto out; + if (!block) goto bad_bmap; if (block != first_block + block_in_page) { @@ -3955,8 +4022,8 @@ reprobe: out: return ret; bad_bmap: - pr_err("swapon: swapfile has holes\n"); - return -EINVAL; + f2fs_err(sbi, "Swapfile has holes\n"); + return -ENOENT; } static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file, From 1153db095fd6c5cc59425171ddef4a4c83464643 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 2 Mar 2021 16:35:32 +0800 Subject: [PATCH 0058/1379] f2fs: remove unused file_clear_encrypt() - file_clear_encrypt() was never be used, remove it. - In addition, relocating macros for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cd004424debc..fe9e0d4d3920 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -637,21 +637,26 @@ enum { #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) -#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) -#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) + +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + #define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) #define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) -#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) + #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) + #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) + #define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) #define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) #define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) + #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) From 28e18ee636ba28532dbe425540af06245a0bbecb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 4 Mar 2021 09:21:18 +0000 Subject: [PATCH 0059/1379] f2fs: fix a redundant call to f2fs_balance_fs if an error occurs The uninitialized variable dn.node_changed does not get set when a call to f2fs_get_node_page fails. This uninitialized value gets used in the call to f2fs_balance_fs() that may or not may not balances dirty node and dentry pages depending on the uninitialized state of the variable. Fix this by only calling f2fs_balance_fs if err is not set. Thanks to Jaegeuk Kim for suggesting an appropriate fix. Addresses-Coverity: ("Uninitialized scalar variable") Fixes: 2a3407607028 ("f2fs: call f2fs_balance_fs only when node was changed") Signed-off-by: Colin Ian King Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 993caefcd2bb..92652ca7a7c8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -219,7 +219,8 @@ out: f2fs_put_page(page, 1); - f2fs_balance_fs(sbi, dn.node_changed); + if (!err) + f2fs_balance_fs(sbi, dn.node_changed); return err; } From 3c0315424f5e3d2a4113c7272367bee1e8e6a174 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 4 Mar 2021 21:43:10 -0800 Subject: [PATCH 0060/1379] f2fs: fix error handling in f2fs_end_enable_verity() f2fs didn't properly clean up if verity failed to be enabled on a file: - It left verity metadata (pages past EOF) in the page cache, which would be exposed to userspace if the file was later extended. - It didn't truncate the verity metadata at all (either from cache or from disk) if an error occurred while setting the verity bit. Fix these bugs by adding a call to truncate_inode_pages() and ensuring that we truncate the verity metadata (both from cache and from disk) in all error paths. Also rework the code to cleanly separate the success path from the error paths, which makes it much easier to understand. Finally, log a message if f2fs_truncate() fails, since it might otherwise fail silently. Reported-by: Yunlei He Fixes: 95ae251fe828 ("f2fs: add fs-verity support") Cc: # v5.4+ Signed-off-by: Eric Biggers Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/verity.c | 75 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 054ec852b5ea..15ba36926fad 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -152,40 +152,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size) { struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; struct fsverity_descriptor_location dloc = { .version = cpu_to_le32(F2FS_VERIFY_VER), .size = cpu_to_le32(desc_size), .pos = cpu_to_le64(desc_pos), }; - int err = 0; + int err = 0, err2 = 0; - if (desc != NULL) { - /* Succeeded; write the verity descriptor. */ - err = pagecache_write(inode, desc, desc_size, desc_pos); + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; - /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */ - if (!err) - err = filemap_write_and_wait(inode->i_mapping); - } + /* Append the verity descriptor. */ + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + goto cleanup; - /* If we failed, truncate anything we wrote past i_size. */ - if (desc == NULL || err) - f2fs_truncate(inode); + /* + * Write all pages (both data and verity metadata). Note that this must + * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond + * i_size won't be written properly. For crash consistency, this also + * must happen before the verity inode flag gets persisted. + */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; + + /* Set the verity xattr. */ + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), + NULL, XATTR_CREATE); + if (err) + goto cleanup; + + /* Finally, set the verity inode flag. */ + file_set_verity(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; - if (desc != NULL && !err) { - err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, - F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), - NULL, XATTR_CREATE); - if (!err) { - file_set_verity(inode); - f2fs_set_inode_flags(inode); - f2fs_mark_inode_dirty_sync(inode, true); - } +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk) and clearing FI_VERITY_IN_PROGRESS. + * + * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection + * from re-instantiating cached pages we are truncating (since unlike + * normal file accesses, garbage collection isn't limited by i_size). + */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_inode_pages(inode->i_mapping, inode->i_size); + err2 = f2fs_truncate(inode); + if (err2) { + f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)", + err2); + set_sbi_flag(sbi, SBI_NEED_FSCK); } - return err; + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return err ?: err2; } static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, From 0823427989c11240ad0f23561e66ff31a927018f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 1 Mar 2021 17:28:16 -0800 Subject: [PATCH 0061/1379] f2fs: expose # of overprivision segments This is useful when checking conditions during checkpoint=disable in Android. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 5 +++++ fs/f2fs/sysfs.c | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9fa5a528cc23..4aa8f38b52d7 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -409,3 +409,8 @@ Description: Give a way to change checkpoint merge daemon's io priority. I/O priority "3". We can select the class between "rt" and "be", and set the I/O priority within valid range of it. "," delimiter is necessary in between I/O class and priority number. + +What: /sys/fs/f2fs//ovp_segments +Date: March 2021 +Contact: "Jaegeuk Kim" +Description: Shows the number of overprovision segments. diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e38a7f6921dd..0c391ab2d8b7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -91,6 +91,13 @@ static ssize_t free_segments_show(struct f2fs_attr *a, (unsigned long long)(free_segments(sbi))); } +static ssize_t ovp_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)(overprovision_segments(sbi))); +} + static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -629,6 +636,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); +F2FS_GENERAL_RO_ATTR(ovp_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); @@ -715,6 +723,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), + ATTR_LIST(ovp_segments), ATTR_LIST(unusable), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), From 1bb73841ea7a88765db7f641a90120490f1f4aee Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Tue, 2 Mar 2021 07:21:33 +0100 Subject: [PATCH 0062/1379] PCI: Remove MicroGate SyncLink device IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The drivers were removed in a1f714b44e34 (tty: Remove redundant synclink driver) and 3d608a591b2b (tty: Remove redundant synclinkmp driver). Remove the PCI device ID entries as well. Link: https://lore.kernel.org/r/20210302062214.29627-3-jslaby@suse.cz Signed-off-by: Jiri Slaby Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- include/linux/pci_ids.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index a76ccb697bef..8a18517696c1 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2065,8 +2065,6 @@ #define PCI_DEVICE_ID_EXAR_XR17V358 0x0358 #define PCI_VENDOR_ID_MICROGATE 0x13c0 -#define PCI_DEVICE_ID_MICROGATE_USC 0x0010 -#define PCI_DEVICE_ID_MICROGATE_SCA 0x0030 #define PCI_VENDOR_ID_3WARE 0x13C1 #define PCI_DEVICE_ID_3WARE_1000 0x1000 From 18cda8018a4b5f4819e02ad2cabf40c3666c6366 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 12 Mar 2021 08:04:20 +0000 Subject: [PATCH 0063/1379] remoteproc: imx_rproc: fix return value check in imx_rproc_addr_init() In case of error, the function devm_ioremap() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Fixes: ecadcc47492c ("remoteproc: imx_rproc: use devm_ioremap") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Link: https://lore.kernel.org/r/20210312080420.277151-1-weiyongjun1@huawei.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 5ebb9f57d3e0..6d3207ccbaef 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -464,10 +464,9 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, /* Not use resource version, because we might share region */ priv->mem[b].cpu_addr = devm_ioremap(&pdev->dev, res.start, resource_size(&res)); - if (IS_ERR(priv->mem[b].cpu_addr)) { + if (!priv->mem[b].cpu_addr) { dev_err(dev, "failed to remap %pr\n", &res); - err = PTR_ERR(priv->mem[b].cpu_addr); - return err; + return -ENOMEM; } priv->mem[b].sys_addr = res.start; priv->mem[b].size = resource_size(&res); From 58483761810087e5ffdf36e84ac1bf26df909097 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 14 Mar 2021 12:13:29 +0100 Subject: [PATCH 0064/1379] thermal/drivers/core: Use a char pointer for the cooling device name We want to have any kind of name for the cooling devices as we do no longer want to rely on auto-numbering. Let's replace the cooling device's fixed array by a char pointer to be allocated dynamically when registering the cooling device, so we don't limit the length of the name. Rework the error path at the same time as we have to rollback the allocations in case of error. Tested with a dummy device having the name: "Llanfairpwllgwyngyllgogerychwyrndrobwllllantysiliogogogoch" A village on the island of Anglesey (Wales), known to have the longest name in Europe. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Tested-by: Ido Schimmel Link: https://lore.kernel.org/r/20210314111333.16551-1-daniel.lezcano@linaro.org --- .../ethernet/mellanox/mlxsw/core_thermal.c | 2 +- drivers/thermal/thermal_core.c | 38 +++++++++++-------- include/linux/thermal.h | 2 +- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c index bf85ce9835d7..7447c2a73cbd 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c @@ -141,7 +141,7 @@ static int mlxsw_get_cooling_device_idx(struct mlxsw_thermal *thermal, /* Allow mlxsw thermal zone binding to an external cooling device */ for (i = 0; i < ARRAY_SIZE(mlxsw_thermal_external_allowed_cdev); i++) { if (strnstr(cdev->type, mlxsw_thermal_external_allowed_cdev[i], - sizeof(cdev->type))) + strlen(cdev->type))) return 0; } diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 996c038f83a4..c8d4010940ef 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -960,10 +960,7 @@ __thermal_cooling_device_register(struct device_node *np, { struct thermal_cooling_device *cdev; struct thermal_zone_device *pos = NULL; - int result; - - if (type && strlen(type) >= THERMAL_NAME_LENGTH) - return ERR_PTR(-EINVAL); + int ret; if (!ops || !ops->get_max_state || !ops->get_cur_state || !ops->set_cur_state) @@ -973,14 +970,17 @@ __thermal_cooling_device_register(struct device_node *np, if (!cdev) return ERR_PTR(-ENOMEM); - result = ida_simple_get(&thermal_cdev_ida, 0, 0, GFP_KERNEL); - if (result < 0) { - kfree(cdev); - return ERR_PTR(result); + ret = ida_simple_get(&thermal_cdev_ida, 0, 0, GFP_KERNEL); + if (ret < 0) + goto out_kfree_cdev; + cdev->id = ret; + + cdev->type = kstrdup(type ? type : "", GFP_KERNEL); + if (!cdev->type) { + ret = -ENOMEM; + goto out_ida_remove; } - cdev->id = result; - strlcpy(cdev->type, type ? : "", sizeof(cdev->type)); mutex_init(&cdev->lock); INIT_LIST_HEAD(&cdev->thermal_instances); cdev->np = np; @@ -990,12 +990,9 @@ __thermal_cooling_device_register(struct device_node *np, cdev->devdata = devdata; thermal_cooling_device_setup_sysfs(cdev); dev_set_name(&cdev->device, "cooling_device%d", cdev->id); - result = device_register(&cdev->device); - if (result) { - ida_simple_remove(&thermal_cdev_ida, cdev->id); - put_device(&cdev->device); - return ERR_PTR(result); - } + ret = device_register(&cdev->device); + if (ret) + goto out_kfree_type; /* Add 'this' new cdev to the global cdev list */ mutex_lock(&thermal_list_lock); @@ -1013,6 +1010,14 @@ __thermal_cooling_device_register(struct device_node *np, mutex_unlock(&thermal_list_lock); return cdev; + +out_kfree_type: + kfree(cdev->type); + put_device(&cdev->device); +out_ida_remove: + ida_simple_remove(&thermal_cdev_ida, cdev->id); +out_kfree_cdev: + return ERR_PTR(ret); } /** @@ -1171,6 +1176,7 @@ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev) ida_simple_remove(&thermal_cdev_ida, cdev->id); device_del(&cdev->device); thermal_cooling_device_destroy_sysfs(cdev); + kfree(cdev->type); put_device(&cdev->device); } EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 6ac7bb1d2b1f..169502164364 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -91,7 +91,7 @@ struct thermal_cooling_device_ops { struct thermal_cooling_device { int id; - char type[THERMAL_NAME_LENGTH]; + char *type; struct device device; struct device_node *np; void *devdata; From ef37d1f9acb57b7a5993e93ae582ba5f4108919e Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 14 Mar 2021 12:13:30 +0100 Subject: [PATCH 0065/1379] thermal/drivers/cpufreq_cooling: Use device name instead of auto-numbering Currently the naming of a cooling device is just a cooling technique followed by a number. When there are multiple cooling devices using the same technique, it is impossible to clearly identify the related device as this one is just a number. For instance: thermal-cpufreq-0 thermal-cpufreq-1 etc ... The 'thermal' prefix is redundant with the subsystem namespace. This patch removes the 'thermal' prefix and changes the number by the device name. So the naming above becomes: cpufreq-cpu0 cpufreq-cpu4 etc ... Signed-off-by: Daniel Lezcano Acked-by: Viresh Kumar Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210314111333.16551-2-daniel.lezcano@linaro.org --- drivers/thermal/cpufreq_cooling.c | 34 +++++++++++-------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 10af3341e5ea..3f5f1dce1320 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -13,10 +13,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -50,8 +50,6 @@ struct time_in_idle { /** * struct cpufreq_cooling_device - data for cooling device with cpufreq - * @id: unique integer value corresponding to each cpufreq_cooling_device - * registered. * @last_load: load measured by the latest call to cpufreq_get_requested_power() * @cpufreq_state: integer value representing the current state of cpufreq * cooling devices. @@ -69,7 +67,6 @@ struct time_in_idle { * cpufreq_cooling_device. */ struct cpufreq_cooling_device { - int id; u32 last_load; unsigned int cpufreq_state; unsigned int max_level; @@ -82,7 +79,6 @@ struct cpufreq_cooling_device { struct freq_qos_request qos_req; }; -static DEFINE_IDA(cpufreq_ida); static DEFINE_MUTEX(cooling_list_lock); static LIST_HEAD(cpufreq_cdev_list); @@ -528,11 +524,11 @@ __cpufreq_cooling_register(struct device_node *np, { struct thermal_cooling_device *cdev; struct cpufreq_cooling_device *cpufreq_cdev; - char dev_name[THERMAL_NAME_LENGTH]; unsigned int i; struct device *dev; int ret; struct thermal_cooling_device_ops *cooling_ops; + char *name; dev = get_cpu_device(policy->cpu); if (unlikely(!dev)) { @@ -567,16 +563,6 @@ __cpufreq_cooling_register(struct device_node *np, /* max_level is an index, not a counter */ cpufreq_cdev->max_level = i - 1; - ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL); - if (ret < 0) { - cdev = ERR_PTR(ret); - goto free_idle_time; - } - cpufreq_cdev->id = ret; - - snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d", - cpufreq_cdev->id); - cooling_ops = &cpufreq_cooling_ops; #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR @@ -591,7 +577,7 @@ __cpufreq_cooling_register(struct device_node *np, pr_err("%s: unsorted frequency tables are not supported\n", __func__); cdev = ERR_PTR(-EINVAL); - goto remove_ida; + goto free_idle_time; } ret = freq_qos_add_request(&policy->constraints, @@ -601,11 +587,18 @@ __cpufreq_cooling_register(struct device_node *np, pr_err("%s: Failed to add freq constraint (%d)\n", __func__, ret); cdev = ERR_PTR(ret); - goto remove_ida; + goto free_idle_time; } - cdev = thermal_of_cooling_device_register(np, dev_name, cpufreq_cdev, + cdev = ERR_PTR(-ENOMEM); + name = kasprintf(GFP_KERNEL, "cpufreq-%s", dev_name(dev)); + if (!name) + goto remove_qos_req; + + cdev = thermal_of_cooling_device_register(np, name, cpufreq_cdev, cooling_ops); + kfree(name); + if (IS_ERR(cdev)) goto remove_qos_req; @@ -617,8 +610,6 @@ __cpufreq_cooling_register(struct device_node *np, remove_qos_req: freq_qos_remove_request(&cpufreq_cdev->qos_req); -remove_ida: - ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id); free_idle_time: free_idle_time(cpufreq_cdev); free_cdev: @@ -712,7 +703,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) thermal_cooling_device_unregister(cdev); freq_qos_remove_request(&cpufreq_cdev->qos_req); - ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id); free_idle_time(cpufreq_cdev); kfree(cpufreq_cdev); } From f8d354e821b268c23a6cd548b7154e55c3954496 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 14 Mar 2021 12:13:31 +0100 Subject: [PATCH 0066/1379] thermal/drivers/devfreq_cooling: Use device name instead of auto-numbering Currently the naming of a cooling device is just a cooling technique followed by a number. When there are multiple cooling devices using the same technique, it is impossible to clearly identify the related device as this one is just a number. For instance: thermal-devfreq-0 thermal-devfreq-1 etc ... The 'thermal' prefix is redundant with the subsystem namespace. This patch removes the 'thermal' prefix and changes the number by the device name. So the naming above becomes: devfreq-5000000.gpu devfreq-1d84000.ufshc etc ... Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210314111333.16551-3-daniel.lezcano@linaro.org --- drivers/thermal/devfreq_cooling.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index fed3121ff2a1..fb250ac16f50 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -25,11 +24,8 @@ #define HZ_PER_KHZ 1000 #define SCALE_ERROR_MITIGATION 100 -static DEFINE_IDA(devfreq_ida); - /** * struct devfreq_cooling_device - Devfreq cooling device - * @id: unique integer value corresponding to each * devfreq_cooling_device registered. * @cdev: Pointer to associated thermal cooling device. * @devfreq: Pointer to associated devfreq device. @@ -51,7 +47,6 @@ static DEFINE_IDA(devfreq_ida); * @em_pd: Energy Model for the associated Devfreq device */ struct devfreq_cooling_device { - int id; struct thermal_cooling_device *cdev; struct devfreq *devfreq; unsigned long cooling_state; @@ -363,7 +358,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, struct thermal_cooling_device *cdev; struct device *dev = df->dev.parent; struct devfreq_cooling_device *dfc; - char dev_name[THERMAL_NAME_LENGTH]; + char *name; int err, num_opps; dfc = kzalloc(sizeof(*dfc), GFP_KERNEL); @@ -407,30 +402,27 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, if (err < 0) goto free_table; - err = ida_simple_get(&devfreq_ida, 0, 0, GFP_KERNEL); - if (err < 0) + cdev = ERR_PTR(-ENOMEM); + name = kasprintf(GFP_KERNEL, "devfreq-%s", dev_name(dev)); + if (!name) goto remove_qos_req; - dfc->id = err; - - snprintf(dev_name, sizeof(dev_name), "thermal-devfreq-%d", dfc->id); - - cdev = thermal_of_cooling_device_register(np, dev_name, dfc, + cdev = thermal_of_cooling_device_register(np, name, dfc, &devfreq_cooling_ops); + kfree(name); + if (IS_ERR(cdev)) { err = PTR_ERR(cdev); dev_err(dev, "Failed to register devfreq cooling device (%d)\n", err); - goto release_ida; + goto remove_qos_req; } dfc->cdev = cdev; return cdev; -release_ida: - ida_simple_remove(&devfreq_ida, dfc->id); remove_qos_req: dev_pm_qos_remove_request(&dfc->req_max_freq); free_table: @@ -527,7 +519,6 @@ void devfreq_cooling_unregister(struct thermal_cooling_device *cdev) dev = dfc->devfreq->dev.parent; thermal_cooling_device_unregister(dfc->cdev); - ida_simple_remove(&devfreq_ida, dfc->id); dev_pm_qos_remove_request(&dfc->req_max_freq); em_dev_unregister_perf_domain(dev); From 6fd1b186d900acf4cef9d3c23ec2839022a46345 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 14 Mar 2021 12:13:32 +0100 Subject: [PATCH 0067/1379] thermal/drivers/cpuidle_cooling: Use device name instead of auto-numbering Currently the naming of a cooling device is just a cooling technique followed by a number. When there are multiple cooling devices using the same technique, it is impossible to clearly identify the related device as this one is just a number. For instance: thermal-idle-0 thermal-idle-1 thermal-idle-2 thermal-idle-3 etc ... The 'thermal' prefix is redundant with the subsystem namespace. This patch removes the 'thermal prefix and changes the number by the device name. So the naming above becomes: idle-cpu0 idle-cpu1 idle-cpu2 idle-cpu3 etc ... Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20210314111333.16551-4-daniel.lezcano@linaro.org --- drivers/thermal/cpuidle_cooling.c | 33 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/thermal/cpuidle_cooling.c b/drivers/thermal/cpuidle_cooling.c index 7ecab4b16b29..f32976163bad 100644 --- a/drivers/thermal/cpuidle_cooling.c +++ b/drivers/thermal/cpuidle_cooling.c @@ -9,9 +9,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -26,8 +26,6 @@ struct cpuidle_cooling_device { unsigned long state; }; -static DEFINE_IDA(cpuidle_ida); - /** * cpuidle_cooling_runtime - Running time computation * @idle_duration_us: CPU idle time to inject in microseconds @@ -174,10 +172,11 @@ static int __cpuidle_cooling_register(struct device_node *np, struct idle_inject_device *ii_dev; struct cpuidle_cooling_device *idle_cdev; struct thermal_cooling_device *cdev; + struct device *dev; unsigned int idle_duration_us = TICK_USEC; unsigned int latency_us = UINT_MAX; - char dev_name[THERMAL_NAME_LENGTH]; - int id, ret; + char *name; + int ret; idle_cdev = kzalloc(sizeof(*idle_cdev), GFP_KERNEL); if (!idle_cdev) { @@ -185,16 +184,10 @@ static int __cpuidle_cooling_register(struct device_node *np, goto out; } - id = ida_simple_get(&cpuidle_ida, 0, 0, GFP_KERNEL); - if (id < 0) { - ret = id; - goto out_kfree; - } - ii_dev = idle_inject_register(drv->cpumask); if (!ii_dev) { ret = -EINVAL; - goto out_id; + goto out_kfree; } of_property_read_u32(np, "duration-us", &idle_duration_us); @@ -205,24 +198,30 @@ static int __cpuidle_cooling_register(struct device_node *np, idle_cdev->ii_dev = ii_dev; - snprintf(dev_name, sizeof(dev_name), "thermal-idle-%d", id); + dev = get_cpu_device(cpumask_first(drv->cpumask)); - cdev = thermal_of_cooling_device_register(np, dev_name, idle_cdev, + name = kasprintf(GFP_KERNEL, "idle-%s", dev_name(dev)); + if (!name) { + ret = -ENOMEM; + goto out_unregister; + } + + cdev = thermal_of_cooling_device_register(np, name, idle_cdev, &cpuidle_cooling_ops); + kfree(name); + if (IS_ERR(cdev)) { ret = PTR_ERR(cdev); goto out_unregister; } pr_debug("%s: Idle injection set with idle duration=%u, latency=%u\n", - dev_name, idle_duration_us, latency_us); + name, idle_duration_us, latency_us); return 0; out_unregister: idle_inject_unregister(ii_dev); -out_id: - ida_simple_remove(&cpuidle_ida, id); out_kfree: kfree(idle_cdev); out: From 87602aeb8ad5bb1b2e23285a9d1322ac033f86c9 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 14 Mar 2021 12:13:33 +0100 Subject: [PATCH 0068/1379] thermal/drivers/cpufreq_cooling: Remove unused list There is a list with the purpose of grouping the cpufreq cooling device together as described in the comments but actually it is unused, the code evolved since 2012 and the list was no longer needed. Delete the remaining unused list related code. Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20210314111333.16551-5-daniel.lezcano@linaro.org --- drivers/thermal/cpufreq_cooling.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 3f5f1dce1320..f3d308427665 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -59,7 +59,6 @@ struct time_in_idle { * @cdev: thermal_cooling_device pointer to keep track of the * registered cooling device. * @policy: cpufreq policy. - * @node: list_head to link all cpufreq_cooling_device together. * @idle_time: idle time stats * @qos_req: PM QoS contraint to apply * @@ -72,16 +71,12 @@ struct cpufreq_cooling_device { unsigned int max_level; struct em_perf_domain *em; struct cpufreq_policy *policy; - struct list_head node; #ifndef CONFIG_SMP struct time_in_idle *idle_time; #endif struct freq_qos_request qos_req; }; -static DEFINE_MUTEX(cooling_list_lock); -static LIST_HEAD(cpufreq_cdev_list); - #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR /** * get_level: Find the level for a particular frequency @@ -602,10 +597,6 @@ __cpufreq_cooling_register(struct device_node *np, if (IS_ERR(cdev)) goto remove_qos_req; - mutex_lock(&cooling_list_lock); - list_add(&cpufreq_cdev->node, &cpufreq_cdev_list); - mutex_unlock(&cooling_list_lock); - return cdev; remove_qos_req: @@ -697,10 +688,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) cpufreq_cdev = cdev->devdata; - mutex_lock(&cooling_list_lock); - list_del(&cpufreq_cdev->node); - mutex_unlock(&cooling_list_lock); - thermal_cooling_device_unregister(cdev); freq_qos_remove_request(&cpufreq_cdev->qos_req); free_idle_time(cpufreq_cdev); From cd2b4f14edaba60f4f8d429d46e3636202a30f5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 1 Mar 2021 18:38:47 +0100 Subject: [PATCH 0069/1379] pcmcia: ds: Remove if with always false condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pcmcia_device_remove() is only ever called by the driver core with dev->driver pointing to a valid driver. Signed-off-by: Uwe Kleine-König [linux@dominikbrodowski.net: shorten commit message, fix reference to pcmcia_device_probe] Signed-off-by: Dominik Brodowski --- drivers/pcmcia/ds.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c index 72114907c0e4..6ecb64140f37 100644 --- a/drivers/pcmcia/ds.c +++ b/drivers/pcmcia/ds.c @@ -371,9 +371,6 @@ static int pcmcia_device_remove(struct device *dev) pcmcia_card_remove(p_dev->socket, p_dev); /* detach the "instance" */ - if (!p_drv) - return 0; - if (p_drv->remove) p_drv->remove(p_dev); @@ -389,7 +386,7 @@ static int pcmcia_device_remove(struct device *dev) "pcmcia: driver %s did not release window properly\n", p_drv->name); - /* references from pcmcia_probe_device */ + /* references from pcmcia_device_probe */ pcmcia_put_dev(p_dev); module_put(p_drv->owner); From f20a7596a0c13947405e961a8adb77abeddc989f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 11:02:34 +0000 Subject: [PATCH 0070/1379] pcmcia: rsrc_nonstatic: Demote kernel-doc abuses Fixes the following W=1 kernel build warning(s): drivers/pcmcia/rsrc_nonstatic.c:265: warning: Function parameter or member 's' not described in 'readable' drivers/pcmcia/rsrc_nonstatic.c:265: warning: Function parameter or member 'res' not described in 'readable' drivers/pcmcia/rsrc_nonstatic.c:265: warning: Function parameter or member 'count' not described in 'readable' drivers/pcmcia/rsrc_nonstatic.c:296: warning: Function parameter or member 's' not described in 'checksum' drivers/pcmcia/rsrc_nonstatic.c:296: warning: Function parameter or member 'res' not described in 'checksum' drivers/pcmcia/rsrc_nonstatic.c:296: warning: Function parameter or member 'value' not described in 'checksum' drivers/pcmcia/rsrc_nonstatic.c:349: warning: Function parameter or member 'value' not described in 'do_validate_mem' drivers/pcmcia/rsrc_nonstatic.c:349: warning: Excess function parameter 'validate' description in 'do_validate_mem' drivers/pcmcia/rsrc_nonstatic.c:407: warning: Function parameter or member 'value' not described in 'do_mem_probe' drivers/pcmcia/rsrc_nonstatic.c:407: warning: Excess function parameter 'validate' description in 'do_mem_probe' drivers/pcmcia/rsrc_nonstatic.c:407: warning: Excess function parameter 'fallback' description in 'do_mem_probe' Signed-off-by: Lee Jones [linux@dominikbrodowski.net: removed list of CCs] Signed-off-by: Dominik Brodowski --- drivers/pcmcia/rsrc_nonstatic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c index 3b05760e69d6..55f9fed478eb 100644 --- a/drivers/pcmcia/rsrc_nonstatic.c +++ b/drivers/pcmcia/rsrc_nonstatic.c @@ -257,7 +257,7 @@ static void do_io_probe(struct pcmcia_socket *s, unsigned int base, /*======================================================================*/ -/** +/* * readable() - iomem validation function for cards with a valid CIS */ static int readable(struct pcmcia_socket *s, struct resource *res, @@ -288,7 +288,7 @@ static int readable(struct pcmcia_socket *s, struct resource *res, return 0; } -/** +/* * checksum() - iomem validation function for simple memory cards */ static int checksum(struct pcmcia_socket *s, struct resource *res, From cc448baf85c8f25282189b7f81c85c3bf10dfd9f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 11:02:35 +0000 Subject: [PATCH 0071/1379] pcmcia: cistpl: Demote non-conformant kernel-doc headers to standard comments Fixes the following W=1 kernel build warning(s): drivers/pcmcia/cistpl.c:88: warning: Function parameter or member 's' not described in 'set_cis_map' drivers/pcmcia/cistpl.c:88: warning: Function parameter or member 'card_offset' not described in 'set_cis_map' drivers/pcmcia/cistpl.c:88: warning: Function parameter or member 'flags' not described in 'set_cis_map' drivers/pcmcia/cistpl.c:136: warning: Function parameter or member 's' not described in 'pcmcia_read_cis_mem' drivers/pcmcia/cistpl.c:136: warning: Function parameter or member 'attr' not described in 'pcmcia_read_cis_mem' drivers/pcmcia/cistpl.c:136: warning: Function parameter or member 'addr' not described in 'pcmcia_read_cis_mem' drivers/pcmcia/cistpl.c:136: warning: Function parameter or member 'len' not described in 'pcmcia_read_cis_mem' drivers/pcmcia/cistpl.c:136: warning: Function parameter or member 'ptr' not described in 'pcmcia_read_cis_mem' drivers/pcmcia/cistpl.c:217: warning: Function parameter or member 's' not described in 'pcmcia_write_cis_mem' drivers/pcmcia/cistpl.c:217: warning: Function parameter or member 'attr' not described in 'pcmcia_write_cis_mem' drivers/pcmcia/cistpl.c:217: warning: Function parameter or member 'addr' not described in 'pcmcia_write_cis_mem' drivers/pcmcia/cistpl.c:217: warning: Function parameter or member 'len' not described in 'pcmcia_write_cis_mem' drivers/pcmcia/cistpl.c:217: warning: Function parameter or member 'ptr' not described in 'pcmcia_write_cis_mem' drivers/pcmcia/cistpl.c:289: warning: Function parameter or member 's' not described in 'read_cis_cache' drivers/pcmcia/cistpl.c:289: warning: Function parameter or member 'attr' not described in 'read_cis_cache' drivers/pcmcia/cistpl.c:289: warning: Function parameter or member 'addr' not described in 'read_cis_cache' drivers/pcmcia/cistpl.c:289: warning: Function parameter or member 'len' not described in 'read_cis_cache' drivers/pcmcia/cistpl.c:289: warning: Function parameter or member 'ptr' not described in 'read_cis_cache' drivers/pcmcia/cistpl.c:372: warning: Function parameter or member 's' not described in 'verify_cis_cache' drivers/pcmcia/cistpl.c:412: warning: Function parameter or member 's' not described in 'pcmcia_replace_cis' drivers/pcmcia/cistpl.c:412: warning: Function parameter or member 'data' not described in 'pcmcia_replace_cis' drivers/pcmcia/cistpl.c:412: warning: Function parameter or member 'len' not described in 'pcmcia_replace_cis' Signed-off-by: Lee Jones [linux@dominikbrodowski.net: removed list of CCs] Signed-off-by: Dominik Brodowski --- drivers/pcmcia/cistpl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/pcmcia/cistpl.c b/drivers/pcmcia/cistpl.c index e6939103991b..948b763dc451 100644 --- a/drivers/pcmcia/cistpl.c +++ b/drivers/pcmcia/cistpl.c @@ -75,7 +75,7 @@ void release_cis_mem(struct pcmcia_socket *s) mutex_unlock(&s->ops_mutex); } -/** +/* * set_cis_map() - map the card memory at "card_offset" into virtual space. * * If flags & MAP_ATTRIB, map the attribute space, otherwise @@ -126,7 +126,7 @@ static void __iomem *set_cis_map(struct pcmcia_socket *s, #define IS_ATTR 1 #define IS_INDIRECT 8 -/** +/* * pcmcia_read_cis_mem() - low-level function to read CIS memory * * must be called with ops_mutex held @@ -206,7 +206,7 @@ int pcmcia_read_cis_mem(struct pcmcia_socket *s, int attr, u_int addr, } -/** +/* * pcmcia_write_cis_mem() - low-level function to write CIS memory * * Probably only useful for writing one-byte registers. Must be called @@ -277,7 +277,7 @@ int pcmcia_write_cis_mem(struct pcmcia_socket *s, int attr, u_int addr, } -/** +/* * read_cis_cache() - read CIS memory or its associated cache * * This is a wrapper around read_cis_mem, with the same interface, @@ -365,7 +365,7 @@ void destroy_cis_cache(struct pcmcia_socket *s) } } -/** +/* * verify_cis_cache() - does the CIS match what is in the CIS cache? */ int verify_cis_cache(struct pcmcia_socket *s) @@ -401,7 +401,7 @@ int verify_cis_cache(struct pcmcia_socket *s) return 0; } -/** +/* * pcmcia_replace_cis() - use a replacement CIS instead of the card's CIS * * For really bad cards, we provide a facility for uploading a From f4468bbbe25eae0839aeada937c4ac8d9a5086d9 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 11:02:36 +0000 Subject: [PATCH 0072/1379] pcmcia: pcmcia_cis: Demote non-conforming kernel-doc headers to standard kernel-doc Fixes the following W=1 kernel build warning(s): drivers/pcmcia/pcmcia_cis.c:129: warning: Function parameter or member 'flags' not described in 'pcmcia_io_cfg_data_width' drivers/pcmcia/pcmcia_cis.c:154: warning: Function parameter or member 'tuple' not described in 'pcmcia_do_loop_config' drivers/pcmcia/pcmcia_cis.c:154: warning: Function parameter or member 'parse' not described in 'pcmcia_do_loop_config' drivers/pcmcia/pcmcia_cis.c:154: warning: Function parameter or member 'priv' not described in 'pcmcia_do_loop_config' drivers/pcmcia/pcmcia_cis.c:300: warning: Function parameter or member 'tuple' not described in 'pcmcia_do_loop_tuple' drivers/pcmcia/pcmcia_cis.c:300: warning: Function parameter or member 'parse' not described in 'pcmcia_do_loop_tuple' drivers/pcmcia/pcmcia_cis.c:300: warning: Function parameter or member 'priv' not described in 'pcmcia_do_loop_tuple' drivers/pcmcia/pcmcia_cis.c:351: warning: Function parameter or member 'p_dev' not described in 'pcmcia_do_get_tuple' drivers/pcmcia/pcmcia_cis.c:351: warning: Function parameter or member 'tuple' not described in 'pcmcia_do_get_tuple' drivers/pcmcia/pcmcia_cis.c:351: warning: Function parameter or member 'priv' not described in 'pcmcia_do_get_tuple' drivers/pcmcia/pcmcia_cis.c:399: warning: Function parameter or member 'p_dev' not described in 'pcmcia_do_get_mac' drivers/pcmcia/pcmcia_cis.c:399: warning: Function parameter or member 'tuple' not described in 'pcmcia_do_get_mac' drivers/pcmcia/pcmcia_cis.c:399: warning: Function parameter or member 'priv' not described in 'pcmcia_do_get_mac' Signed-off-by: Lee Jones [linux@dominikbrodowski.net: removed list of CCs] Signed-off-by: Dominik Brodowski --- drivers/pcmcia/pcmcia_cis.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/pcmcia/pcmcia_cis.c b/drivers/pcmcia/pcmcia_cis.c index e4c4daf92038..d2d0ed4b27c8 100644 --- a/drivers/pcmcia/pcmcia_cis.c +++ b/drivers/pcmcia/pcmcia_cis.c @@ -122,7 +122,7 @@ next_entry: } -/** +/* * pcmcia_io_cfg_data_width() - convert cfgtable to data path width parameter */ static int pcmcia_io_cfg_data_width(unsigned int flags) @@ -143,7 +143,7 @@ struct pcmcia_cfg_mem { cistpl_cftable_entry_t dflt; }; -/** +/* * pcmcia_do_loop_config() - internal helper for pcmcia_loop_config() * * pcmcia_do_loop_config() is the internal callback for the call from @@ -289,7 +289,7 @@ struct pcmcia_loop_mem { void *priv_data); }; -/** +/* * pcmcia_do_loop_tuple() - internal helper for pcmcia_loop_config() * * pcmcia_do_loop_tuple() is the internal callback for the call from @@ -337,7 +337,7 @@ struct pcmcia_loop_get { cisdata_t **buf; }; -/** +/* * pcmcia_do_get_tuple() - internal helper for pcmcia_get_tuple() * * pcmcia_do_get_tuple() is the internal callback for the call from @@ -386,7 +386,7 @@ size_t pcmcia_get_tuple(struct pcmcia_device *p_dev, cisdata_t code, EXPORT_SYMBOL(pcmcia_get_tuple); -/** +/* * pcmcia_do_get_mac() - internal helper for pcmcia_get_mac_from_cis() * * pcmcia_do_get_mac() is the internal callback for the call from From 6562e2cb8c7fa2f4982cdb28a2740a3bb7ec2c43 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 11:02:37 +0000 Subject: [PATCH 0073/1379] pcmcia: ds: Fix function name disparity in header Fixes the following W=1 kernel build warning(s): drivers/pcmcia/ds.c:96: warning: expecting prototype for pcmcia_store_new_id(). Prototype was for new_id_store() instead Signed-off-by: Lee Jones [linux@dominikbrodowski.net: removed list of CCs] Signed-off-by: Dominik Brodowski --- drivers/pcmcia/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c index 6ecb64140f37..bd81aa64d011 100644 --- a/drivers/pcmcia/ds.c +++ b/drivers/pcmcia/ds.c @@ -83,7 +83,7 @@ struct pcmcia_dynid { }; /** - * pcmcia_store_new_id - add a new PCMCIA device ID to this driver and re-probe devices + * new_id_store() - add a new PCMCIA device ID to this driver and re-probe devices * @driver: target device driver * @buf: buffer for scanning device ID data * @count: input size From 1d26d6f2cbd049b4be295db2d6328d6bcfd50af7 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 11:02:38 +0000 Subject: [PATCH 0074/1379] pcmcia: pcmcia_resource: Fix some kernel-doc formatting/disparities and demote others Fixes the following W=1 kernel build warning(s): drivers/pcmcia/pcmcia_resource.c:160: warning: Function parameter or member 'p_dev' not described in 'pcmcia_access_config' drivers/pcmcia/pcmcia_resource.c:160: warning: Function parameter or member 'where' not described in 'pcmcia_access_config' drivers/pcmcia/pcmcia_resource.c:160: warning: Function parameter or member 'val' not described in 'pcmcia_access_config' drivers/pcmcia/pcmcia_resource.c:160: warning: Function parameter or member 'accessf' not described in 'pcmcia_access_config' drivers/pcmcia/pcmcia_resource.c:194: warning: Function parameter or member 'p_dev' not described in 'pcmcia_read_config_byte' drivers/pcmcia/pcmcia_resource.c:194: warning: Function parameter or member 'where' not described in 'pcmcia_read_config_byte' drivers/pcmcia/pcmcia_resource.c:194: warning: Function parameter or member 'val' not described in 'pcmcia_read_config_byte' drivers/pcmcia/pcmcia_resource.c:207: warning: Function parameter or member 'p_dev' not described in 'pcmcia_write_config_byte' drivers/pcmcia/pcmcia_resource.c:207: warning: Function parameter or member 'where' not described in 'pcmcia_write_config_byte' drivers/pcmcia/pcmcia_resource.c:207: warning: Function parameter or member 'val' not described in 'pcmcia_write_config_byte' drivers/pcmcia/pcmcia_resource.c:728: warning: Function parameter or member 'p_dev' not described in 'pcmcia_setup_isa_irq' drivers/pcmcia/pcmcia_resource.c:728: warning: Function parameter or member 'type' not described in 'pcmcia_setup_isa_irq' drivers/pcmcia/pcmcia_resource.c:793: warning: Function parameter or member 'p_dev' not described in 'pcmcia_setup_irq' Signed-off-by: Lee Jones [linux@dominikbrodowski.net: removed list of CCs] Signed-off-by: Dominik Brodowski --- drivers/pcmcia/pcmcia_resource.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/pcmcia/pcmcia_resource.c b/drivers/pcmcia/pcmcia_resource.c index e3a6b6c8a5b0..c1c197292111 100644 --- a/drivers/pcmcia/pcmcia_resource.c +++ b/drivers/pcmcia/pcmcia_resource.c @@ -144,7 +144,7 @@ static int alloc_io_space(struct pcmcia_socket *s, struct resource *res, } -/** +/* * pcmcia_access_config() - read or write card configuration registers * * pcmcia_access_config() reads and writes configuration registers in @@ -184,7 +184,7 @@ static int pcmcia_access_config(struct pcmcia_device *p_dev, } -/** +/* * pcmcia_read_config_byte() - read a byte from a card configuration register * * pcmcia_read_config_byte() reads a byte from a configuration register in @@ -197,7 +197,7 @@ int pcmcia_read_config_byte(struct pcmcia_device *p_dev, off_t where, u8 *val) EXPORT_SYMBOL(pcmcia_read_config_byte); -/** +/* * pcmcia_write_config_byte() - write a byte to a card configuration register * * pcmcia_write_config_byte() writes a byte to a configuration register in @@ -720,7 +720,8 @@ static irqreturn_t test_action(int cpl, void *dev_id) /** * pcmcia_setup_isa_irq() - determine whether an ISA IRQ can be used - * @p_dev - the associated PCMCIA device + * @p_dev: the associated PCMCIA device + * @type: IRQ type (flags) * * locking note: must be called with ops_mutex locked. */ @@ -785,7 +786,7 @@ void pcmcia_cleanup_irq(struct pcmcia_socket *s) /** * pcmcia_setup_irq() - determine IRQ to be used for device - * @p_dev - the associated PCMCIA device + * @p_dev: the associated PCMCIA device * * locking note: must be called with ops_mutex locked. */ From e9d503fef7da2cc0610ce9cd056d0347ec9cafc4 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Fri, 12 Mar 2021 11:02:39 +0000 Subject: [PATCH 0075/1379] pcmcia: rsrc_nonstatic: Fix call-back function as reference formatting Fixes the following W=1 kernel build warning(s): drivers/pcmcia/rsrc_nonstatic.c:349: warning: Function parameter or member 'value' not described in 'do_validate_mem' drivers/pcmcia/rsrc_nonstatic.c:349: warning: Excess function parameter 'validate' description in 'do_validate_mem' drivers/pcmcia/rsrc_nonstatic.c:407: warning: Function parameter or member 'value' not described in 'do_mem_probe' drivers/pcmcia/rsrc_nonstatic.c:407: warning: Excess function parameter 'validate' description in 'do_mem_probe' drivers/pcmcia/rsrc_nonstatic.c:407: warning: Excess function parameter 'fallback' description in 'do_mem_probe' Signed-off-by: Lee Jones [linux@dominikbrodowski.net: removed list of CCs] Signed-off-by: Dominik Brodowski --- drivers/pcmcia/rsrc_nonstatic.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c index 55f9fed478eb..bb15a8bdbaab 100644 --- a/drivers/pcmcia/rsrc_nonstatic.c +++ b/drivers/pcmcia/rsrc_nonstatic.c @@ -343,9 +343,9 @@ static int checksum(struct pcmcia_socket *s, struct resource *res, */ static int do_validate_mem(struct pcmcia_socket *s, unsigned long base, unsigned long size, - int validate (struct pcmcia_socket *s, - struct resource *res, - unsigned int *value)) + int (*validate)(struct pcmcia_socket *s, + struct resource *res, + unsigned int *value)) { struct socket_data *s_data = s->resource_data; struct resource *res1, *res2; @@ -398,12 +398,12 @@ static int do_validate_mem(struct pcmcia_socket *s, * function returns the size of the usable memory area. */ static int do_mem_probe(struct pcmcia_socket *s, u_long base, u_long num, - int validate (struct pcmcia_socket *s, - struct resource *res, - unsigned int *value), - int fallback (struct pcmcia_socket *s, - struct resource *res, - unsigned int *value)) + int (*validate)(struct pcmcia_socket *s, + struct resource *res, + unsigned int *value), + int (*fallback)(struct pcmcia_socket *s, + struct resource *res, + unsigned int *value)) { struct socket_data *s_data = s->resource_data; u_long i, j, bad, fail, step; From eedb0b12d091a21909b5e84d9f3e5e649305bd12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:53:22 +0100 Subject: [PATCH 0076/1379] dma-mapping: add a dma_mmap_pages helper Add a helper to map memory allocated using dma_alloc_pages into a user address space, similar to the dma_alloc_attrs function for coherent allocations. Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- Documentation/core-api/dma-api.rst | 10 ++++++++++ include/linux/dma-mapping.h | 2 ++ kernel/dma/mapping.c | 13 +++++++++++++ 3 files changed, 25 insertions(+) diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index e6d23f117308..157a474ae544 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -563,6 +563,16 @@ Free a region of memory previously allocated using dma_alloc_pages(). dev, size, dma_handle and dir must all be the same as those passed into dma_alloc_pages(). page must be the pointer returned by dma_alloc_pages(). +:: + + int + dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, + size_t size, struct page *page) + +Map an allocation returned from dma_alloc_pages() into a user address space. +dev and size must be the same as those passed into dma_alloc_pages(). +page must be the pointer returned by dma_alloc_pages(). + :: void * diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2a984cb4d1e0..2b8dce756e1f 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -263,6 +263,8 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir); +int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, + size_t size, struct page *page); static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b6a633679933..9ce86c77651c 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -517,6 +517,19 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, } EXPORT_SYMBOL_GPL(dma_free_pages); +int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, + size_t size, struct page *page) +{ + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(page) + vma->vm_pgoff, + vma_pages(vma) << PAGE_SHIFT, vma->vm_page_prot); +} +EXPORT_SYMBOL_GPL(dma_mmap_pages); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); From 198c50e2ccff5c78ddbe0cb01593ac32458deb69 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Feb 2021 18:12:02 +0100 Subject: [PATCH 0077/1379] dma-mapping: refactor dma_{alloc,free}_pages Factour out internal versions without the dma_debug calls in preparation for callers that will need different dma_debug calls. Note that this changes the dma_debug calls to get the not page aligned size values, but as long as alloc and free agree on one variant we are fine. Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- kernel/dma/mapping.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9ce86c77651c..07f964ebcda1 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -477,11 +477,10 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, } EXPORT_SYMBOL(dma_free_attrs); -struct page *dma_alloc_pages(struct device *dev, size_t size, +static struct page *__dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { const struct dma_map_ops *ops = get_dma_ops(dev); - struct page *page; if (WARN_ON_ONCE(!dev->coherent_dma_mask)) return NULL; @@ -490,31 +489,41 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) - page = dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); - else if (ops->alloc_pages) - page = ops->alloc_pages(dev, size, dma_handle, dir, gfp); - else + return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); + if (!ops->alloc_pages) return NULL; + return ops->alloc_pages(dev, size, dma_handle, dir, gfp); +} - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); +struct page *dma_alloc_pages(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) +{ + struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); + if (page) + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); return page; } EXPORT_SYMBOL_GPL(dma_alloc_pages); -void dma_free_pages(struct device *dev, size_t size, struct page *page, +static void __dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); size = PAGE_ALIGN(size); - debug_dma_unmap_page(dev, dma_handle, size, dir); - if (dma_alloc_direct(dev, ops)) dma_direct_free_pages(dev, size, page, dma_handle, dir); else if (ops->free_pages) ops->free_pages(dev, size, page, dma_handle, dir); } + +void dma_free_pages(struct device *dev, size_t size, struct page *page, + dma_addr_t dma_handle, enum dma_data_direction dir) +{ + debug_dma_unmap_page(dev, dma_handle, size, dir); + __dma_free_pages(dev, size, page, dma_handle, dir); +} EXPORT_SYMBOL_GPL(dma_free_pages); int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, From 7d5b5738d1514e9dd8ed452660e2a4d25beb9483 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:54:18 +0100 Subject: [PATCH 0078/1379] dma-mapping: add a dma_alloc_noncontiguous API Add a new API that returns a potentiall virtually non-contigous sg_table and a DMA address. This API is only properly implemented for dma-iommu and will simply return a contigious chunk as a fallback. The intent is that drivers can use this API if either: - no kernel mapping or only temporary kernel mappings are required. That is as a better replacement for DMA_ATTR_NO_KERNEL_MAPPING - a kernel mapping is required for cached and DMA mapped pages, but the driver also needs the pages to e.g. map them to userspace. In that sense it is a replacement for some aspects of the recently removed and never fully implemented DMA_ATTR_NON_CONSISTENT Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- Documentation/core-api/dma-api.rst | 78 +++++++++++++++++++++ include/linux/dma-map-ops.h | 19 ++++++ include/linux/dma-mapping.h | 32 +++++++++ kernel/dma/mapping.c | 106 +++++++++++++++++++++++++++++ 4 files changed, 235 insertions(+) diff --git a/Documentation/core-api/dma-api.rst b/Documentation/core-api/dma-api.rst index 157a474ae544..00a1d4fa3f9e 100644 --- a/Documentation/core-api/dma-api.rst +++ b/Documentation/core-api/dma-api.rst @@ -594,6 +594,84 @@ dev, size, dma_handle and dir must all be the same as those passed into dma_alloc_noncoherent(). cpu_addr must be the virtual address returned by dma_alloc_noncoherent(). +:: + + struct sg_table * + dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs); + +This routine allocates bytes of non-coherent and possibly non-contiguous +memory. It returns a pointer to struct sg_table that describes the allocated +and DMA mapped memory, or NULL if the allocation failed. The resulting memory +can be used for struct page mapped into a scatterlist are suitable for. + +The return sg_table is guaranteed to have 1 single DMA mapped segment as +indicated by sgt->nents, but it might have multiple CPU side segments as +indicated by sgt->orig_nents. + +The dir parameter specified if data is read and/or written by the device, +see dma_map_single() for details. + +The gfp parameter allows the caller to specify the ``GFP_`` flags (see +kmalloc()) for the allocation, but rejects flags used to specify a memory +zone such as GFP_DMA or GFP_HIGHMEM. + +The attrs argument must be either 0 or DMA_ATTR_ALLOC_SINGLE_PAGES. + +Before giving the memory to the device, dma_sync_sgtable_for_device() needs +to be called, and before reading memory written by the device, +dma_sync_sgtable_for_cpu(), just like for streaming DMA mappings that are +reused. + +:: + + void + dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, + enum dma_data_direction dir) + +Free memory previously allocated using dma_alloc_noncontiguous(). dev, size, +and dir must all be the same as those passed into dma_alloc_noncontiguous(). +sgt must be the pointer returned by dma_alloc_noncontiguous(). + +:: + + void * + dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) + +Return a contiguous kernel mapping for an allocation returned from +dma_alloc_noncontiguous(). dev and size must be the same as those passed into +dma_alloc_noncontiguous(). sgt must be the pointer returned by +dma_alloc_noncontiguous(). + +Once a non-contiguous allocation is mapped using this function, the +flush_kernel_vmap_range() and invalidate_kernel_vmap_range() APIs must be used +to manage the coherency between the kernel mapping, the device and user space +mappings (if any). + +:: + + void + dma_vunmap_noncontiguous(struct device *dev, void *vaddr) + +Unmap a kernel mapping returned by dma_vmap_noncontiguous(). dev must be the +same the one passed into dma_alloc_noncontiguous(). vaddr must be the pointer +returned by dma_vmap_noncontiguous(). + + +:: + + int + dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt) + +Map an allocation returned from dma_alloc_noncontiguous() into a user address +space. dev and size must be the same as those passed into +dma_alloc_noncontiguous(). sgt must be the pointer returned by +dma_alloc_noncontiguous(). + :: int diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 51872e736e7b..0d53a96a3d64 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -22,6 +22,11 @@ struct dma_map_ops { gfp_t gfp); void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir); + struct sg_table *(*alloc_noncontiguous)(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs); + void (*free_noncontiguous)(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); @@ -198,6 +203,20 @@ static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, } #endif /* CONFIG_DMA_DECLARE_COHERENT */ +/* + * This is the actual return value from the ->alloc_noncontiguous method. + * The users of the DMA API should only care about the sg_table, but to make + * the DMA-API internal vmaping and freeing easier we stash away the page + * array as well (except for the fallback case). This can go away any time, + * e.g. when a vmap-variant that takes a scatterlist comes along. + */ +struct dma_sgt_handle { + struct sg_table sgt; + struct page **pages; +}; +#define sgt_handle(sgt) \ + container_of((sgt), struct dma_sgt_handle, sgt) + int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2b8dce756e1f..954847f9a3e0 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -144,6 +144,15 @@ u64 dma_get_required_mask(struct device *dev); size_t dma_max_mapping_size(struct device *dev); bool dma_need_sync(struct device *dev, dma_addr_t dma_addr); unsigned long dma_get_merge_boundary(struct device *dev); +struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); +void dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir); +void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt); +void dma_vunmap_noncontiguous(struct device *dev, void *vaddr); +int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, @@ -257,6 +266,29 @@ static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; } +static inline struct sg_table *dma_alloc_noncontiguous(struct device *dev, + size_t size, enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs) +{ + return NULL; +} +static inline void dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ +} +static inline void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) +{ + return NULL; +} +static inline void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) +{ +} +static inline int dma_mmap_noncontiguous(struct device *dev, + struct vm_area_struct *vma, size_t size, struct sg_table *sgt) +{ + return -EINVAL; +} #endif /* CONFIG_HAS_DMA */ struct page *dma_alloc_pages(struct device *dev, size_t size, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 07f964ebcda1..2b06a809d0b9 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -539,6 +539,112 @@ int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(dma_mmap_pages); +static struct sg_table *alloc_single_sgt(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp) +{ + struct sg_table *sgt; + struct page *page; + + sgt = kmalloc(sizeof(*sgt), gfp); + if (!sgt) + return NULL; + if (sg_alloc_table(sgt, 1, gfp)) + goto out_free_sgt; + page = __dma_alloc_pages(dev, size, &sgt->sgl->dma_address, dir, gfp); + if (!page) + goto out_free_table; + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + sg_dma_len(sgt->sgl) = sgt->sgl->length; + return sgt; +out_free_table: + sg_free_table(sgt); +out_free_sgt: + kfree(sgt); + return NULL; +} + +struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, + enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + struct sg_table *sgt; + + if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES)) + return NULL; + + if (ops && ops->alloc_noncontiguous) + sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs); + else + sgt = alloc_single_sgt(dev, size, dir, gfp); + + if (sgt) { + sgt->nents = 1; + debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir); + } + return sgt; +} +EXPORT_SYMBOL_GPL(dma_alloc_noncontiguous); + +static void free_single_sgt(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + __dma_free_pages(dev, size, sg_page(sgt->sgl), sgt->sgl->dma_address, + dir); + sg_free_table(sgt); + kfree(sgt); +} + +void dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + debug_dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, dir); + if (ops && ops->free_noncontiguous) + ops->free_noncontiguous(dev, size, sgt, dir); + else + free_single_sgt(dev, size, sgt, dir); +} +EXPORT_SYMBOL_GPL(dma_free_noncontiguous); + +void *dma_vmap_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (ops && ops->alloc_noncontiguous) + return vmap(sgt_handle(sgt)->pages, count, VM_MAP, PAGE_KERNEL); + return page_address(sg_page(sgt->sgl)); +} +EXPORT_SYMBOL_GPL(dma_vmap_noncontiguous); + +void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops && ops->alloc_noncontiguous) + vunmap(vaddr); +} +EXPORT_SYMBOL_GPL(dma_vunmap_noncontiguous); + +int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, + size_t size, struct sg_table *sgt) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (ops && ops->alloc_noncontiguous) { + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + if (vma->vm_pgoff >= count || + vma_pages(vma) > count - vma->vm_pgoff) + return -ENXIO; + return vm_map_pages(vma, sgt_handle(sgt)->pages, count); + } + return dma_mmap_pages(dev, vma, size, sg_page(sgt->sgl)); +} +EXPORT_SYMBOL_GPL(dma_mmap_noncontiguous); + int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); From 8230ce9a4e206fa1be17d66245f87cae2935d7d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:44:15 +0100 Subject: [PATCH 0079/1379] dma-iommu: refactor iommu_dma_alloc_remap Split out a new helper that only allocates a sg_table worth of memory without mapping it into contiguous kernel address space. Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- drivers/iommu/dma-iommu.c | 69 ++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index af765c813cc8..ec1abad156db 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -650,23 +650,12 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, return pages; } -/** - * iommu_dma_alloc_remap - Allocate and map a buffer contiguous in IOVA space - * @dev: Device to allocate memory for. Must be a real device - * attached to an iommu_dma_domain - * @size: Size of buffer in bytes - * @dma_handle: Out argument for allocated DMA handle - * @gfp: Allocation flags - * @prot: pgprot_t to use for the remapped mapping - * @attrs: DMA attributes for this allocation - * - * If @size is less than PAGE_SIZE, then a full CPU page will be allocated, +/* + * If size is less than PAGE_SIZE, then a full CPU page will be allocated, * but an IOMMU which supports smaller pages might not map the whole thing. - * - * Return: Mapped virtual address, or NULL on failure. */ -static void *iommu_dma_alloc_remap(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot, +static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, + size_t size, struct sg_table *sgt, gfp_t gfp, pgprot_t prot, unsigned long attrs) { struct iommu_domain *domain = iommu_get_dma_domain(dev); @@ -676,11 +665,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, int ioprot = dma_info_to_prot(DMA_BIDIRECTIONAL, coherent, attrs); unsigned int count, min_size, alloc_sizes = domain->pgsize_bitmap; struct page **pages; - struct sg_table sgt; dma_addr_t iova; - void *vaddr; - - *dma_handle = DMA_MAPPING_ERROR; if (static_branch_unlikely(&iommu_deferred_attach_enabled) && iommu_deferred_attach(dev, domain)) @@ -707,34 +692,26 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, if (!iova) goto out_free_pages; - if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, GFP_KERNEL)) + if (sg_alloc_table_from_pages(sgt, pages, count, 0, size, GFP_KERNEL)) goto out_free_iova; if (!(ioprot & IOMMU_CACHE)) { struct scatterlist *sg; int i; - for_each_sg(sgt.sgl, sg, sgt.orig_nents, i) + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) arch_dma_prep_coherent(sg_page(sg), sg->length); } - if (iommu_map_sg_atomic(domain, iova, sgt.sgl, sgt.orig_nents, ioprot) + if (iommu_map_sg_atomic(domain, iova, sgt->sgl, sgt->orig_nents, ioprot) < size) goto out_free_sg; - vaddr = dma_common_pages_remap(pages, size, prot, - __builtin_return_address(0)); - if (!vaddr) - goto out_unmap; + sgt->sgl->dma_address = iova; + return pages; - *dma_handle = iova; - sg_free_table(&sgt); - return vaddr; - -out_unmap: - __iommu_dma_unmap(dev, iova, size); out_free_sg: - sg_free_table(&sgt); + sg_free_table(sgt); out_free_iova: iommu_dma_free_iova(cookie, iova, size, NULL); out_free_pages: @@ -742,6 +719,32 @@ out_free_pages: return NULL; } +static void *iommu_dma_alloc_remap(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot, + unsigned long attrs) +{ + struct page **pages; + struct sg_table sgt; + void *vaddr; + + pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, prot, + attrs); + if (!pages) + return NULL; + *dma_handle = sgt.sgl->dma_address; + sg_free_table(&sgt); + vaddr = dma_common_pages_remap(pages, size, prot, + __builtin_return_address(0)); + if (!vaddr) + goto out_unmap; + return vaddr; + +out_unmap: + __iommu_dma_unmap(dev, *dma_handle, size); + __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT); + return NULL; +} + static void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) { From e817ee5f2f95ca58a3b961ae4acfd3885e830b9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Jan 2021 14:47:29 +0100 Subject: [PATCH 0080/1379] dma-iommu: implement ->alloc_noncontiguous Implement support for allocating a non-contiguous DMA region. Signed-off-by: Christoph Hellwig Reviewed-by: Tomasz Figa Tested-by: Ricardo Ribalda --- drivers/iommu/dma-iommu.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index ec1abad156db..1946422e4ac7 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -708,6 +708,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, goto out_free_sg; sgt->sgl->dma_address = iova; + sgt->sgl->dma_length = size; return pages; out_free_sg: @@ -745,6 +746,37 @@ out_unmap: return NULL; } +#ifdef CONFIG_DMA_REMAP +static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, + size_t size, enum dma_data_direction dir, gfp_t gfp, + unsigned long attrs) +{ + struct dma_sgt_handle *sh; + + sh = kmalloc(sizeof(*sh), gfp); + if (!sh) + return NULL; + + sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp, + PAGE_KERNEL, attrs); + if (!sh->pages) { + kfree(sh); + return NULL; + } + return &sh->sgt; +} + +static void iommu_dma_free_noncontiguous(struct device *dev, size_t size, + struct sg_table *sgt, enum dma_data_direction dir) +{ + struct dma_sgt_handle *sh = sgt_handle(sgt); + + __iommu_dma_unmap(dev, sgt->sgl->dma_address, size); + __iommu_dma_free_pages(sh->pages, PAGE_ALIGN(size) >> PAGE_SHIFT); + sg_free_table(&sh->sgt); +} +#endif /* CONFIG_DMA_REMAP */ + static void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir) { @@ -1261,6 +1293,10 @@ static const struct dma_map_ops iommu_dma_ops = { .free = iommu_dma_free, .alloc_pages = dma_common_alloc_pages, .free_pages = dma_common_free_pages, +#ifdef CONFIG_DMA_REMAP + .alloc_noncontiguous = iommu_dma_alloc_noncontiguous, + .free_noncontiguous = iommu_dma_free_noncontiguous, +#endif .mmap = iommu_dma_mmap, .get_sgtable = iommu_dma_get_sgtable, .map_page = iommu_dma_map_page, From 936d3685e62436a378f02b8b74759b054d4aeca1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 25 Feb 2021 14:42:04 +0100 Subject: [PATCH 0081/1379] rtc: tps65910: include linux/property.h The added device_property_present() call causes a build failure in some configurations because of the missing header: drivers/rtc/rtc-tps65910.c:422:7: error: implicit declaration of function 'device_property_present' [-Werror,-Wimplicit-function-declaration] Fixes: 454ba154a62c ("rtc: tps65910: Support wakeup-source property") Signed-off-by: Arnd Bergmann Reviewed-by: Dmitry Osipenko Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210225134215.2263694-1-arnd@kernel.org --- drivers/rtc/rtc-tps65910.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/rtc-tps65910.c b/drivers/rtc/rtc-tps65910.c index 288abb1abdb8..bc89c62ccb9b 100644 --- a/drivers/rtc/rtc-tps65910.c +++ b/drivers/rtc/rtc-tps65910.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include From 6e00b6d0083ea5f529b057e87c0236747871b6a8 Mon Sep 17 00:00:00 2001 From: Heiko Schocher Date: Tue, 9 Mar 2021 14:47:19 +0100 Subject: [PATCH 0082/1379] rtc: rv3028: correct weekday register usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The datasheet for the rv3028 says the weekday has exact 3 bits and in chapter 3.4.0 for the "3h–Weekday" register it says: """ This register holds the current day of the week. Each value represents one weekday that is assigned by the user. Values will range from 0 to 6 The weekday counter is simply a 3-bit counter which counts up to 6 and then resets to 0. """ So do not code weekday bitwise instead, use the raw values from 0-6. Tested on "PHYTEC phyBOARD-Pollux i.MX8MP" board. Signed-off-by: Heiko Schocher Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210309134719.1494062-1-hs@denx.de --- drivers/rtc/rtc-rv3028.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-rv3028.c b/drivers/rtc/rtc-rv3028.c index 0c48d980d06a..12c807306893 100644 --- a/drivers/rtc/rtc-rv3028.c +++ b/drivers/rtc/rtc-rv3028.c @@ -320,7 +320,7 @@ static int rv3028_get_time(struct device *dev, struct rtc_time *tm) tm->tm_sec = bcd2bin(date[RV3028_SEC] & 0x7f); tm->tm_min = bcd2bin(date[RV3028_MIN] & 0x7f); tm->tm_hour = bcd2bin(date[RV3028_HOUR] & 0x3f); - tm->tm_wday = ilog2(date[RV3028_WDAY] & 0x7f); + tm->tm_wday = date[RV3028_WDAY] & 0x7f; tm->tm_mday = bcd2bin(date[RV3028_DAY] & 0x3f); tm->tm_mon = bcd2bin(date[RV3028_MONTH] & 0x1f) - 1; tm->tm_year = bcd2bin(date[RV3028_YEAR]) + 100; @@ -337,7 +337,7 @@ static int rv3028_set_time(struct device *dev, struct rtc_time *tm) date[RV3028_SEC] = bin2bcd(tm->tm_sec); date[RV3028_MIN] = bin2bcd(tm->tm_min); date[RV3028_HOUR] = bin2bcd(tm->tm_hour); - date[RV3028_WDAY] = 1 << (tm->tm_wday); + date[RV3028_WDAY] = tm->tm_wday; date[RV3028_DAY] = bin2bcd(tm->tm_mday); date[RV3028_MONTH] = bin2bcd(tm->tm_mon + 1); date[RV3028_YEAR] = bin2bcd(tm->tm_year - 100); From 198da7be18c47637d69cdab1f65581b04ebd759d Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 15 Mar 2021 14:20:29 -0300 Subject: [PATCH 0083/1379] rtc: imxdi: Convert to a DT-only driver i.MX has been converted to a DT-only platform, so make the driver depend on OF, remove the CONFIG_OF ifdefery and remove of_match_ptr(). Signed-off-by: Fabio Estevam Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210315172029.173250-1-festevam@gmail.com --- drivers/rtc/Kconfig | 1 + drivers/rtc/rtc-imxdi.c | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index ce723dc54aa4..9a82cb57586f 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1331,6 +1331,7 @@ config RTC_DRV_DIGICOLOR config RTC_DRV_IMXDI tristate "Freescale IMX DryIce Real Time Clock" depends on ARCH_MXC + depends on OF help Support for Freescale IMX DryIce RTC diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index c2692da74e09..c1806f4d68e7 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -840,19 +840,17 @@ static int __exit dryice_rtc_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_OF static const struct of_device_id dryice_dt_ids[] = { { .compatible = "fsl,imx25-rtc" }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, dryice_dt_ids); -#endif static struct platform_driver dryice_rtc_driver = { .driver = { .name = "imxdi_rtc", - .of_match_table = of_match_ptr(dryice_dt_ids), + .of_match_table = dryice_dt_ids, }, .remove = __exit_p(dryice_rtc_remove), }; From cee451c9d57ee170f123adacd70391dfb7a0b1a6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 15 Mar 2021 16:49:53 -0700 Subject: [PATCH 0084/1379] Input: tsc2007 - convert to GPIO descriptors This converts the driver to use GPIO descriptors. Note that it now uses logical polarity and thus nagation has been dropped. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210314210951.645783-1-andy.shevchenko@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/tsc2007.h | 4 +++- drivers/input/touchscreen/tsc2007_core.c | 15 ++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/input/touchscreen/tsc2007.h b/drivers/input/touchscreen/tsc2007.h index 91c60bf6dcaf..69b08dd6c8df 100644 --- a/drivers/input/touchscreen/tsc2007.h +++ b/drivers/input/touchscreen/tsc2007.h @@ -19,6 +19,8 @@ #ifndef _TSC2007_H #define _TSC2007_H +struct gpio_desc; + #define TSC2007_MEASURE_TEMP0 (0x0 << 4) #define TSC2007_MEASURE_AUX (0x2 << 4) #define TSC2007_MEASURE_TEMP1 (0x4 << 4) @@ -69,7 +71,7 @@ struct tsc2007 { int fuzzy; int fuzzz; - unsigned int gpio; + struct gpio_desc *gpiod; int irq; wait_queue_head_t wait; diff --git a/drivers/input/touchscreen/tsc2007_core.c b/drivers/input/touchscreen/tsc2007_core.c index 3b80abfc1eca..d19c623a73f5 100644 --- a/drivers/input/touchscreen/tsc2007_core.c +++ b/drivers/input/touchscreen/tsc2007_core.c @@ -19,11 +19,11 @@ #include #include +#include #include #include #include #include -#include #include #include "tsc2007.h" @@ -226,7 +226,7 @@ static int tsc2007_get_pendown_state_gpio(struct device *dev) struct i2c_client *client = to_i2c_client(dev); struct tsc2007 *ts = i2c_get_clientdata(client); - return !gpio_get_value(ts->gpio); + return gpiod_get_value(ts->gpiod); } static int tsc2007_probe_dt(struct i2c_client *client, struct tsc2007 *ts) @@ -266,13 +266,14 @@ static int tsc2007_probe_dt(struct i2c_client *client, struct tsc2007 *ts) return -EINVAL; } - ts->gpio = of_get_gpio(np, 0); - if (gpio_is_valid(ts->gpio)) + ts->gpiod = devm_gpiod_get_optional(&client->dev, NULL, GPIOD_IN); + if (IS_ERR(ts->gpiod)) + return PTR_ERR(ts->gpiod); + + if (ts->gpiod) ts->get_pendown_state = tsc2007_get_pendown_state_gpio; else - dev_warn(&client->dev, - "GPIO not specified in DT (of_get_gpio returned %d)\n", - ts->gpio); + dev_warn(&client->dev, "Pen down GPIO not specified in DT\n"); return 0; } From e512a9e9f44db4fad09d3c747c07311a643dd356 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 15 Mar 2021 17:00:57 -0700 Subject: [PATCH 0085/1379] Input: tsc2007 - make use of device properties Device property API allows to gather device resources from different sources, such as ACPI. Convert the drivers to unleash the power of device property API. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20210314210951.645783-2-andy.shevchenko@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/tsc2007_core.c | 49 ++++++++---------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/drivers/input/touchscreen/tsc2007_core.c b/drivers/input/touchscreen/tsc2007_core.c index d19c623a73f5..3e871d182c40 100644 --- a/drivers/input/touchscreen/tsc2007_core.c +++ b/drivers/input/touchscreen/tsc2007_core.c @@ -23,7 +23,8 @@ #include #include #include -#include +#include +#include #include #include "tsc2007.h" @@ -220,7 +221,6 @@ static void tsc2007_close(struct input_dev *input_dev) tsc2007_stop(ts); } -#ifdef CONFIG_OF static int tsc2007_get_pendown_state_gpio(struct device *dev) { struct i2c_client *client = to_i2c_client(dev); @@ -229,63 +229,50 @@ static int tsc2007_get_pendown_state_gpio(struct device *dev) return gpiod_get_value(ts->gpiod); } -static int tsc2007_probe_dt(struct i2c_client *client, struct tsc2007 *ts) +static int tsc2007_probe_properties(struct device *dev, struct tsc2007 *ts) { - struct device_node *np = client->dev.of_node; u32 val32; u64 val64; - if (!np) { - dev_err(&client->dev, "missing device tree data\n"); - return -EINVAL; - } - - if (!of_property_read_u32(np, "ti,max-rt", &val32)) + if (!device_property_read_u32(dev, "ti,max-rt", &val32)) ts->max_rt = val32; else ts->max_rt = MAX_12BIT; - if (!of_property_read_u32(np, "ti,fuzzx", &val32)) + if (!device_property_read_u32(dev, "ti,fuzzx", &val32)) ts->fuzzx = val32; - if (!of_property_read_u32(np, "ti,fuzzy", &val32)) + if (!device_property_read_u32(dev, "ti,fuzzy", &val32)) ts->fuzzy = val32; - if (!of_property_read_u32(np, "ti,fuzzz", &val32)) + if (!device_property_read_u32(dev, "ti,fuzzz", &val32)) ts->fuzzz = val32; - if (!of_property_read_u64(np, "ti,poll-period", &val64)) + if (!device_property_read_u64(dev, "ti,poll-period", &val64)) ts->poll_period = msecs_to_jiffies(val64); else ts->poll_period = msecs_to_jiffies(1); - if (!of_property_read_u32(np, "ti,x-plate-ohms", &val32)) { + if (!device_property_read_u32(dev, "ti,x-plate-ohms", &val32)) { ts->x_plate_ohms = val32; } else { - dev_err(&client->dev, "missing ti,x-plate-ohms devicetree property."); + dev_err(dev, "Missing ti,x-plate-ohms device property\n"); return -EINVAL; } - ts->gpiod = devm_gpiod_get_optional(&client->dev, NULL, GPIOD_IN); + ts->gpiod = devm_gpiod_get_optional(dev, NULL, GPIOD_IN); if (IS_ERR(ts->gpiod)) return PTR_ERR(ts->gpiod); if (ts->gpiod) ts->get_pendown_state = tsc2007_get_pendown_state_gpio; else - dev_warn(&client->dev, "Pen down GPIO not specified in DT\n"); + dev_warn(dev, "Pen down GPIO is not specified in properties\n"); return 0; } -#else -static int tsc2007_probe_dt(struct i2c_client *client, struct tsc2007 *ts) -{ - dev_err(&client->dev, "platform data is required!\n"); - return -EINVAL; -} -#endif -static int tsc2007_probe_pdev(struct i2c_client *client, struct tsc2007 *ts, +static int tsc2007_probe_pdev(struct device *dev, struct tsc2007 *ts, const struct tsc2007_platform_data *pdata, const struct i2c_device_id *id) { @@ -300,7 +287,7 @@ static int tsc2007_probe_pdev(struct i2c_client *client, struct tsc2007 *ts, ts->fuzzz = pdata->fuzzz; if (pdata->x_plate_ohms == 0) { - dev_err(&client->dev, "x_plate_ohms is not set up in platform data"); + dev_err(dev, "x_plate_ohms is not set up in platform data\n"); return -EINVAL; } @@ -333,9 +320,9 @@ static int tsc2007_probe(struct i2c_client *client, return -ENOMEM; if (pdata) - err = tsc2007_probe_pdev(client, ts, pdata, id); + err = tsc2007_probe_pdev(&client->dev, ts, pdata, id); else - err = tsc2007_probe_dt(client, ts); + err = tsc2007_probe_properties(&client->dev, ts); if (err) return err; @@ -432,18 +419,16 @@ static const struct i2c_device_id tsc2007_idtable[] = { MODULE_DEVICE_TABLE(i2c, tsc2007_idtable); -#ifdef CONFIG_OF static const struct of_device_id tsc2007_of_match[] = { { .compatible = "ti,tsc2007" }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, tsc2007_of_match); -#endif static struct i2c_driver tsc2007_driver = { .driver = { .name = "tsc2007", - .of_match_table = of_match_ptr(tsc2007_of_match), + .of_match_table = tsc2007_of_match, }, .id_table = tsc2007_idtable, .probe = tsc2007_probe, From 9346ff0bc6ff3c3a495d50a43b57df8fed7bc562 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Mon, 15 Mar 2021 20:58:00 -0300 Subject: [PATCH 0086/1379] rtc: mxc: Remove unneeded of_match_ptr() i.MX is a DT-only platform, so of_match_ptr() can be safely removed. Remove the unneeded of_match_ptr(). Signed-off-by: Fabio Estevam Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210315235800.200137-1-festevam@gmail.com --- drivers/rtc/rtc-mxc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index db57dda7ab97..0f08f22df869 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -415,7 +415,7 @@ static int mxc_rtc_probe(struct platform_device *pdev) static struct platform_driver mxc_rtc_driver = { .driver = { .name = "mxc_rtc", - .of_match_table = of_match_ptr(imx_rtc_dt_ids), + .of_match_table = imx_rtc_dt_ids, }, .probe = mxc_rtc_probe, }; From 50db2050faf854cbaf4b6557a7a8ca21bff302ae Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 13 Mar 2021 13:53:11 +0100 Subject: [PATCH 0087/1379] dmaengine: xilinx: Introduce synchronize() callback The Xilinx dmaengine driver uses a tasklet to process completed descriptors and execute their callbacks. Currently consumers of the DMA channel have to no method of synchronization against this tasklet when using the Xilinx dmaengine drivers. This can lead to race conditions when the consumer frees resources that are accessed in the callback before the tasklet has finished running. It is not enough to just call dmaengine_terminal_all() since on a multi-processor system the tasklet can run concurrently to it and might call the callback after dmaengine_terminate_all() has already finished. To mitigate this issue implement the synchronize() callback for the driver, which will wait until the tasklet has finished. Signed-off-by: Lars-Peter Clausen Link: https://lore.kernel.org/r/20210313125311.4823-1-lars@metafoo.de Signed-off-by: Vinod Koul --- drivers/dma/xilinx/xilinx_dma.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index 3aded7861fef..75c0b8e904e5 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -2453,6 +2453,13 @@ static int xilinx_dma_terminate_all(struct dma_chan *dchan) return 0; } +static void xilinx_dma_synchronize(struct dma_chan *dchan) +{ + struct xilinx_dma_chan *chan = to_xilinx_chan(dchan); + + tasklet_kill(&chan->tasklet); +} + /** * xilinx_dma_channel_set_config - Configure VDMA channel * Run-time configuration for Axi VDMA, supports: @@ -3074,6 +3081,7 @@ static int xilinx_dma_probe(struct platform_device *pdev) xdev->common.device_free_chan_resources = xilinx_dma_free_chan_resources; xdev->common.device_terminate_all = xilinx_dma_terminate_all; + xdev->common.device_synchronize = xilinx_dma_synchronize; xdev->common.device_tx_status = xilinx_dma_tx_status; xdev->common.device_issue_pending = xilinx_dma_issue_pending; if (xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) { From 6e3e14c9385c3cfb35f9da4f495acdd21f9bc25b Mon Sep 17 00:00:00 2001 From: "jeson.gao" Date: Mon, 15 Mar 2021 16:25:37 +0800 Subject: [PATCH 0088/1379] thermal/core/power_allocator: Using round the division when re-divvying up power The division is used directly in re-divvying up power, the decimal part will be discarded, devices will get less than the extra_actor_power - 1. if using round the division to make the calculation more accurate. For example: actor0 received more than its max_power, it has the extra_power 759 actor1 received less than its max_power, it require extra_actor_power 395 actor2 received less than its max_power, it require extra_actor_power 365 actor1 and actor2 require the total capped_extra_power 760 using division in re-divvying up power actor1 would actually get the extra_actor_power 394 actor2 would actually get the extra_actor_power 364 if using round the division in re-divvying up power actor1 would actually get the extra_actor_power 394 actor2 would actually get the extra_actor_power 365 Signed-off-by: Jeson Gao Reviewed-by: Lukasz Luba Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1615796737-4688-1-git-send-email-gao.yunxiao6@gmail.com --- drivers/thermal/gov_power_allocator.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 92acae53df49..2802a0e13c88 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -374,9 +374,11 @@ static void divvy_up_power(u32 *req_power, u32 *max_power, int num_actors, */ extra_power = min(extra_power, capped_extra_power); if (capped_extra_power > 0) - for (i = 0; i < num_actors; i++) - granted_power[i] += (extra_actor_power[i] * - extra_power) / capped_extra_power; + for (i = 0; i < num_actors; i++) { + u64 extra_range = (u64)extra_actor_power[i] * extra_power; + granted_power[i] += DIV_ROUND_CLOSEST_ULL(extra_range, + capped_extra_power); + } } static int allocate_power(struct thermal_zone_device *tz, From 04e0a39fc10f82a71b84af73351333b184cee578 Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:03:55 +0100 Subject: [PATCH 0089/1379] dmaengine: dw-edma: Add writeq() and readq() for 64 bits architectures Add writeq() and readq() for 64 bits architures support. Supporting these two functions will allow the write or the read of eDMA 64 bits registers at once instead of having two consecutive operations. Also, this improvement will allow the PCI optimization transaction messages, which will generate a 64 bits message instead of two messages of 32 bits. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/3f1120f7c6003b38ec8b851fc68936007c4d9fd8.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-v0-core.c | 254 +++++++++++++++++------ drivers/dma/dw-edma/dw-edma-v0-debugfs.c | 48 ++--- drivers/dma/dw-edma/dw-edma-v0-regs.h | 149 +++++++++---- 3 files changed, 326 insertions(+), 125 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c index 692de47b1670..7888eda52845 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-core.c +++ b/drivers/dma/dw-edma/dw-edma-v0-core.c @@ -28,29 +28,69 @@ static inline struct dw_edma_v0_regs __iomem *__dw_regs(struct dw_edma *dw) return dw->rg_region.vaddr; } -#define SET(dw, name, value) \ +#define SET_32(dw, name, value) \ writel(value, &(__dw_regs(dw)->name)) -#define GET(dw, name) \ +#define GET_32(dw, name) \ readl(&(__dw_regs(dw)->name)) -#define SET_RW(dw, dir, name, value) \ +#define SET_RW_32(dw, dir, name, value) \ do { \ if ((dir) == EDMA_DIR_WRITE) \ - SET(dw, wr_##name, value); \ + SET_32(dw, wr_##name, value); \ else \ - SET(dw, rd_##name, value); \ + SET_32(dw, rd_##name, value); \ } while (0) -#define GET_RW(dw, dir, name) \ +#define GET_RW_32(dw, dir, name) \ ((dir) == EDMA_DIR_WRITE \ - ? GET(dw, wr_##name) \ - : GET(dw, rd_##name)) + ? GET_32(dw, wr_##name) \ + : GET_32(dw, rd_##name)) -#define SET_BOTH(dw, name, value) \ +#define SET_BOTH_32(dw, name, value) \ do { \ - SET(dw, wr_##name, value); \ - SET(dw, rd_##name, value); \ + SET_32(dw, wr_##name, value); \ + SET_32(dw, rd_##name, value); \ + } while (0) + +#ifdef CONFIG_64BIT + +#define SET_64(dw, name, value) \ + writeq(value, &(__dw_regs(dw)->name)) + +#define GET_64(dw, name) \ + readq(&(__dw_regs(dw)->name)) + +#define SET_RW_64(dw, dir, name, value) \ + do { \ + if ((dir) == EDMA_DIR_WRITE) \ + SET_64(dw, wr_##name, value); \ + else \ + SET_64(dw, rd_##name, value); \ + } while (0) + +#define GET_RW_64(dw, dir, name) \ + ((dir) == EDMA_DIR_WRITE \ + ? GET_64(dw, wr_##name) \ + : GET_64(dw, rd_##name)) + +#define SET_BOTH_64(dw, name, value) \ + do { \ + SET_64(dw, wr_##name, value); \ + SET_64(dw, rd_##name, value); \ + } while (0) + +#endif /* CONFIG_64BIT */ + +#define SET_COMPAT(dw, name, value) \ + writel(value, &(__dw_regs(dw)->type.unroll.name)) + +#define SET_RW_COMPAT(dw, dir, name, value) \ + do { \ + if ((dir) == EDMA_DIR_WRITE) \ + SET_COMPAT(dw, wr_##name, value); \ + else \ + SET_COMPAT(dw, rd_##name, value); \ } while (0) static inline struct dw_edma_v0_ch_regs __iomem * @@ -115,21 +155,86 @@ static inline u32 readl_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch, return value; } -#define SET_CH(dw, dir, ch, name, value) \ +#define SET_CH_32(dw, dir, ch, name, value) \ writel_ch(dw, dir, ch, value, &(__dw_ch_regs(dw, dir, ch)->name)) -#define GET_CH(dw, dir, ch, name) \ +#define GET_CH_32(dw, dir, ch, name) \ readl_ch(dw, dir, ch, &(__dw_ch_regs(dw, dir, ch)->name)) -#define SET_LL(ll, value) \ +#define SET_LL_32(ll, value) \ writel(value, ll) +#ifdef CONFIG_64BIT + +static inline void writeq_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch, + u64 value, void __iomem *addr) +{ + if (dw->mf == EDMA_MF_EDMA_LEGACY) { + u32 viewport_sel; + unsigned long flags; + + raw_spin_lock_irqsave(&dw->lock, flags); + + viewport_sel = FIELD_PREP(EDMA_V0_VIEWPORT_MASK, ch); + if (dir == EDMA_DIR_READ) + viewport_sel |= BIT(31); + + writel(viewport_sel, + &(__dw_regs(dw)->type.legacy.viewport_sel)); + writeq(value, addr); + + raw_spin_unlock_irqrestore(&dw->lock, flags); + } else { + writeq(value, addr); + } +} + +static inline u64 readq_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch, + const void __iomem *addr) +{ + u32 value; + + if (dw->mf == EDMA_MF_EDMA_LEGACY) { + u32 viewport_sel; + unsigned long flags; + + raw_spin_lock_irqsave(&dw->lock, flags); + + viewport_sel = FIELD_PREP(EDMA_V0_VIEWPORT_MASK, ch); + if (dir == EDMA_DIR_READ) + viewport_sel |= BIT(31); + + writel(viewport_sel, + &(__dw_regs(dw)->type.legacy.viewport_sel)); + value = readq(addr); + + raw_spin_unlock_irqrestore(&dw->lock, flags); + } else { + value = readq(addr); + } + + return value; +} + +#define SET_CH_64(dw, dir, ch, name, value) \ + writeq_ch(dw, dir, ch, value, &(__dw_ch_regs(dw, dir, ch)->name)) + +#define GET_CH_64(dw, dir, ch, name) \ + readq_ch(dw, dir, ch, &(__dw_ch_regs(dw, dir, ch)->name)) + +#define SET_LL_64(ll, value) \ + writeq(value, ll) + +#endif /* CONFIG_64BIT */ + /* eDMA management callbacks */ void dw_edma_v0_core_off(struct dw_edma *dw) { - SET_BOTH(dw, int_mask, EDMA_V0_DONE_INT_MASK | EDMA_V0_ABORT_INT_MASK); - SET_BOTH(dw, int_clear, EDMA_V0_DONE_INT_MASK | EDMA_V0_ABORT_INT_MASK); - SET_BOTH(dw, engine_en, 0); + SET_BOTH_32(dw, int_mask, + EDMA_V0_DONE_INT_MASK | EDMA_V0_ABORT_INT_MASK); + SET_BOTH_32(dw, int_clear, + EDMA_V0_DONE_INT_MASK | EDMA_V0_ABORT_INT_MASK); + SET_BOTH_32(dw, engine_en, 0); } u16 dw_edma_v0_core_ch_count(struct dw_edma *dw, enum dw_edma_dir dir) @@ -137,9 +242,11 @@ u16 dw_edma_v0_core_ch_count(struct dw_edma *dw, enum dw_edma_dir dir) u32 num_ch; if (dir == EDMA_DIR_WRITE) - num_ch = FIELD_GET(EDMA_V0_WRITE_CH_COUNT_MASK, GET(dw, ctrl)); + num_ch = FIELD_GET(EDMA_V0_WRITE_CH_COUNT_MASK, + GET_32(dw, ctrl)); else - num_ch = FIELD_GET(EDMA_V0_READ_CH_COUNT_MASK, GET(dw, ctrl)); + num_ch = FIELD_GET(EDMA_V0_READ_CH_COUNT_MASK, + GET_32(dw, ctrl)); if (num_ch > EDMA_V0_MAX_NR_CH) num_ch = EDMA_V0_MAX_NR_CH; @@ -153,7 +260,7 @@ enum dma_status dw_edma_v0_core_ch_status(struct dw_edma_chan *chan) u32 tmp; tmp = FIELD_GET(EDMA_V0_CH_STATUS_MASK, - GET_CH(dw, chan->dir, chan->id, ch_control1)); + GET_CH_32(dw, chan->dir, chan->id, ch_control1)); if (tmp == 1) return DMA_IN_PROGRESS; @@ -167,26 +274,28 @@ void dw_edma_v0_core_clear_done_int(struct dw_edma_chan *chan) { struct dw_edma *dw = chan->chip->dw; - SET_RW(dw, chan->dir, int_clear, - FIELD_PREP(EDMA_V0_DONE_INT_MASK, BIT(chan->id))); + SET_RW_32(dw, chan->dir, int_clear, + FIELD_PREP(EDMA_V0_DONE_INT_MASK, BIT(chan->id))); } void dw_edma_v0_core_clear_abort_int(struct dw_edma_chan *chan) { struct dw_edma *dw = chan->chip->dw; - SET_RW(dw, chan->dir, int_clear, - FIELD_PREP(EDMA_V0_ABORT_INT_MASK, BIT(chan->id))); + SET_RW_32(dw, chan->dir, int_clear, + FIELD_PREP(EDMA_V0_ABORT_INT_MASK, BIT(chan->id))); } u32 dw_edma_v0_core_status_done_int(struct dw_edma *dw, enum dw_edma_dir dir) { - return FIELD_GET(EDMA_V0_DONE_INT_MASK, GET_RW(dw, dir, int_status)); + return FIELD_GET(EDMA_V0_DONE_INT_MASK, + GET_RW_32(dw, dir, int_status)); } u32 dw_edma_v0_core_status_abort_int(struct dw_edma *dw, enum dw_edma_dir dir) { - return FIELD_GET(EDMA_V0_ABORT_INT_MASK, GET_RW(dw, dir, int_status)); + return FIELD_GET(EDMA_V0_ABORT_INT_MASK, + GET_RW_32(dw, dir, int_status)); } static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk) @@ -209,15 +318,23 @@ static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk) control |= (DW_EDMA_V0_LIE | DW_EDMA_V0_RIE); /* Channel control */ - SET_LL(&lli[i].control, control); + SET_LL_32(&lli[i].control, control); /* Transfer size */ - SET_LL(&lli[i].transfer_size, child->sz); - /* SAR - low, high */ - SET_LL(&lli[i].sar_low, lower_32_bits(child->sar)); - SET_LL(&lli[i].sar_high, upper_32_bits(child->sar)); - /* DAR - low, high */ - SET_LL(&lli[i].dar_low, lower_32_bits(child->dar)); - SET_LL(&lli[i].dar_high, upper_32_bits(child->dar)); + SET_LL_32(&lli[i].transfer_size, child->sz); + /* SAR */ + #ifdef CONFIG_64BIT + SET_LL_64(&lli[i].sar.reg, child->sar); + #else /* CONFIG_64BIT */ + SET_LL_32(&lli[i].sar.lsb, lower_32_bits(child->sar)); + SET_LL_32(&lli[i].sar.msb, upper_32_bits(child->sar)); + #endif /* CONFIG_64BIT */ + /* DAR */ + #ifdef CONFIG_64BIT + SET_LL_64(&lli[i].dar.reg, child->dar); + #else /* CONFIG_64BIT */ + SET_LL_32(&lli[i].dar.lsb, lower_32_bits(child->dar)); + SET_LL_32(&lli[i].dar.msb, upper_32_bits(child->dar)); + #endif /* CONFIG_64BIT */ i++; } @@ -227,10 +344,14 @@ static void dw_edma_v0_core_write_chunk(struct dw_edma_chunk *chunk) control |= DW_EDMA_V0_CB; /* Channel control */ - SET_LL(&llp->control, control); - /* Linked list - low, high */ - SET_LL(&llp->llp_low, lower_32_bits(chunk->ll_region.paddr)); - SET_LL(&llp->llp_high, upper_32_bits(chunk->ll_region.paddr)); + SET_LL_32(&llp->control, control); + /* Linked list */ + #ifdef CONFIG_64BIT + SET_LL_64(&llp->llp.reg, chunk->ll_region.paddr); + #else /* CONFIG_64BIT */ + SET_LL_32(&llp->llp.lsb, lower_32_bits(chunk->ll_region.paddr)); + SET_LL_32(&llp->llp.msb, upper_32_bits(chunk->ll_region.paddr)); + #endif /* CONFIG_64BIT */ } void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first) @@ -243,28 +364,33 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first) if (first) { /* Enable engine */ - SET_RW(dw, chan->dir, engine_en, BIT(0)); + SET_RW_32(dw, chan->dir, engine_en, BIT(0)); /* Interrupt unmask - done, abort */ - tmp = GET_RW(dw, chan->dir, int_mask); + tmp = GET_RW_32(dw, chan->dir, int_mask); tmp &= ~FIELD_PREP(EDMA_V0_DONE_INT_MASK, BIT(chan->id)); tmp &= ~FIELD_PREP(EDMA_V0_ABORT_INT_MASK, BIT(chan->id)); - SET_RW(dw, chan->dir, int_mask, tmp); + SET_RW_32(dw, chan->dir, int_mask, tmp); /* Linked list error */ - tmp = GET_RW(dw, chan->dir, linked_list_err_en); + tmp = GET_RW_32(dw, chan->dir, linked_list_err_en); tmp |= FIELD_PREP(EDMA_V0_LINKED_LIST_ERR_MASK, BIT(chan->id)); - SET_RW(dw, chan->dir, linked_list_err_en, tmp); + SET_RW_32(dw, chan->dir, linked_list_err_en, tmp); /* Channel control */ - SET_CH(dw, chan->dir, chan->id, ch_control1, - (DW_EDMA_V0_CCS | DW_EDMA_V0_LLE)); - /* Linked list - low, high */ - SET_CH(dw, chan->dir, chan->id, llp_low, - lower_32_bits(chunk->ll_region.paddr)); - SET_CH(dw, chan->dir, chan->id, llp_high, - upper_32_bits(chunk->ll_region.paddr)); + SET_CH_32(dw, chan->dir, chan->id, ch_control1, + (DW_EDMA_V0_CCS | DW_EDMA_V0_LLE)); + /* Linked list */ + #ifdef CONFIG_64BIT + SET_CH_64(dw, chan->dir, chan->id, llp.reg, + chunk->ll_region.paddr); + #else /* CONFIG_64BIT */ + SET_CH_32(dw, chan->dir, chan->id, llp.lsb, + lower_32_bits(chunk->ll_region.paddr)); + SET_CH_32(dw, chan->dir, chan->id, llp.msb, + upper_32_bits(chunk->ll_region.paddr)); + #endif /* CONFIG_64BIT */ } /* Doorbell */ - SET_RW(dw, chan->dir, doorbell, - FIELD_PREP(EDMA_V0_DOORBELL_CH_MASK, chan->id)); + SET_RW_32(dw, chan->dir, doorbell, + FIELD_PREP(EDMA_V0_DOORBELL_CH_MASK, chan->id)); } int dw_edma_v0_core_device_config(struct dw_edma_chan *chan) @@ -273,31 +399,31 @@ int dw_edma_v0_core_device_config(struct dw_edma_chan *chan) u32 tmp = 0; /* MSI done addr - low, high */ - SET_RW(dw, chan->dir, done_imwr_low, chan->msi.address_lo); - SET_RW(dw, chan->dir, done_imwr_high, chan->msi.address_hi); + SET_RW_32(dw, chan->dir, done_imwr.lsb, chan->msi.address_lo); + SET_RW_32(dw, chan->dir, done_imwr.msb, chan->msi.address_hi); /* MSI abort addr - low, high */ - SET_RW(dw, chan->dir, abort_imwr_low, chan->msi.address_lo); - SET_RW(dw, chan->dir, abort_imwr_high, chan->msi.address_hi); + SET_RW_32(dw, chan->dir, abort_imwr.lsb, chan->msi.address_lo); + SET_RW_32(dw, chan->dir, abort_imwr.msb, chan->msi.address_hi); /* MSI data - low, high */ switch (chan->id) { case 0: case 1: - tmp = GET_RW(dw, chan->dir, ch01_imwr_data); + tmp = GET_RW_32(dw, chan->dir, ch01_imwr_data); break; case 2: case 3: - tmp = GET_RW(dw, chan->dir, ch23_imwr_data); + tmp = GET_RW_32(dw, chan->dir, ch23_imwr_data); break; case 4: case 5: - tmp = GET_RW(dw, chan->dir, ch45_imwr_data); + tmp = GET_RW_32(dw, chan->dir, ch45_imwr_data); break; case 6: case 7: - tmp = GET_RW(dw, chan->dir, ch67_imwr_data); + tmp = GET_RW_32(dw, chan->dir, ch67_imwr_data); break; } @@ -316,22 +442,22 @@ int dw_edma_v0_core_device_config(struct dw_edma_chan *chan) switch (chan->id) { case 0: case 1: - SET_RW(dw, chan->dir, ch01_imwr_data, tmp); + SET_RW_32(dw, chan->dir, ch01_imwr_data, tmp); break; case 2: case 3: - SET_RW(dw, chan->dir, ch23_imwr_data, tmp); + SET_RW_32(dw, chan->dir, ch23_imwr_data, tmp); break; case 4: case 5: - SET_RW(dw, chan->dir, ch45_imwr_data, tmp); + SET_RW_32(dw, chan->dir, ch45_imwr_data, tmp); break; case 6: case 7: - SET_RW(dw, chan->dir, ch67_imwr_data, tmp); + SET_RW_32(dw, chan->dir, ch67_imwr_data, tmp); break; } diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c index 6f62711a4c94..a5e2783a26f5 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c +++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c @@ -114,12 +114,12 @@ static void dw_edma_debugfs_regs_ch(struct dw_edma_v0_ch_regs __iomem *regs, REGISTER(ch_control1), REGISTER(ch_control2), REGISTER(transfer_size), - REGISTER(sar_low), - REGISTER(sar_high), - REGISTER(dar_low), - REGISTER(dar_high), - REGISTER(llp_low), - REGISTER(llp_high), + REGISTER(sar.lsb), + REGISTER(sar.msb), + REGISTER(dar.lsb), + REGISTER(dar.msb), + REGISTER(llp.lsb), + REGISTER(llp.msb), }; nr_entries = ARRAY_SIZE(debugfs_regs); @@ -132,17 +132,17 @@ static void dw_edma_debugfs_regs_wr(struct dentry *dir) /* eDMA global registers */ WR_REGISTER(engine_en), WR_REGISTER(doorbell), - WR_REGISTER(ch_arb_weight_low), - WR_REGISTER(ch_arb_weight_high), + WR_REGISTER(ch_arb_weight.lsb), + WR_REGISTER(ch_arb_weight.msb), /* eDMA interrupts registers */ WR_REGISTER(int_status), WR_REGISTER(int_mask), WR_REGISTER(int_clear), WR_REGISTER(err_status), - WR_REGISTER(done_imwr_low), - WR_REGISTER(done_imwr_high), - WR_REGISTER(abort_imwr_low), - WR_REGISTER(abort_imwr_high), + WR_REGISTER(done_imwr.lsb), + WR_REGISTER(done_imwr.msb), + WR_REGISTER(abort_imwr.lsb), + WR_REGISTER(abort_imwr.msb), WR_REGISTER(ch01_imwr_data), WR_REGISTER(ch23_imwr_data), WR_REGISTER(ch45_imwr_data), @@ -152,8 +152,8 @@ static void dw_edma_debugfs_regs_wr(struct dentry *dir) const struct debugfs_entries debugfs_unroll_regs[] = { /* eDMA channel context grouping */ WR_REGISTER_UNROLL(engine_chgroup), - WR_REGISTER_UNROLL(engine_hshake_cnt_low), - WR_REGISTER_UNROLL(engine_hshake_cnt_high), + WR_REGISTER_UNROLL(engine_hshake_cnt.lsb), + WR_REGISTER_UNROLL(engine_hshake_cnt.msb), WR_REGISTER_UNROLL(ch0_pwr_en), WR_REGISTER_UNROLL(ch1_pwr_en), WR_REGISTER_UNROLL(ch2_pwr_en), @@ -200,19 +200,19 @@ static void dw_edma_debugfs_regs_rd(struct dentry *dir) /* eDMA global registers */ RD_REGISTER(engine_en), RD_REGISTER(doorbell), - RD_REGISTER(ch_arb_weight_low), - RD_REGISTER(ch_arb_weight_high), + RD_REGISTER(ch_arb_weight.lsb), + RD_REGISTER(ch_arb_weight.msb), /* eDMA interrupts registers */ RD_REGISTER(int_status), RD_REGISTER(int_mask), RD_REGISTER(int_clear), - RD_REGISTER(err_status_low), - RD_REGISTER(err_status_high), + RD_REGISTER(err_status.lsb), + RD_REGISTER(err_status.msb), RD_REGISTER(linked_list_err_en), - RD_REGISTER(done_imwr_low), - RD_REGISTER(done_imwr_high), - RD_REGISTER(abort_imwr_low), - RD_REGISTER(abort_imwr_high), + RD_REGISTER(done_imwr.lsb), + RD_REGISTER(done_imwr.msb), + RD_REGISTER(abort_imwr.lsb), + RD_REGISTER(abort_imwr.msb), RD_REGISTER(ch01_imwr_data), RD_REGISTER(ch23_imwr_data), RD_REGISTER(ch45_imwr_data), @@ -221,8 +221,8 @@ static void dw_edma_debugfs_regs_rd(struct dentry *dir) const struct debugfs_entries debugfs_unroll_regs[] = { /* eDMA channel context grouping */ RD_REGISTER_UNROLL(engine_chgroup), - RD_REGISTER_UNROLL(engine_hshake_cnt_low), - RD_REGISTER_UNROLL(engine_hshake_cnt_high), + RD_REGISTER_UNROLL(engine_hshake_cnt.lsb), + RD_REGISTER_UNROLL(engine_hshake_cnt.msb), RD_REGISTER_UNROLL(ch0_pwr_en), RD_REGISTER_UNROLL(ch1_pwr_en), RD_REGISTER_UNROLL(ch2_pwr_en), diff --git a/drivers/dma/dw-edma/dw-edma-v0-regs.h b/drivers/dma/dw-edma/dw-edma-v0-regs.h index dfd70e223c2f..d07151d6ecd8 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-regs.h +++ b/drivers/dma/dw-edma/dw-edma-v0-regs.h @@ -28,30 +28,55 @@ struct dw_edma_v0_ch_regs { u32 ch_control1; /* 0x000 */ u32 ch_control2; /* 0x004 */ u32 transfer_size; /* 0x008 */ - u32 sar_low; /* 0x00c */ - u32 sar_high; /* 0x010 */ - u32 dar_low; /* 0x014 */ - u32 dar_high; /* 0x018 */ - u32 llp_low; /* 0x01c */ - u32 llp_high; /* 0x020 */ -}; + union { + u64 reg; /* 0x00c..0x010 */ + struct { + u32 lsb; /* 0x00c */ + u32 msb; /* 0x010 */ + }; + } sar; + union { + u64 reg; /* 0x014..0x018 */ + struct { + u32 lsb; /* 0x014 */ + u32 msb; /* 0x018 */ + }; + } dar; + union { + u64 reg; /* 0x01c..0x020 */ + struct { + u32 lsb; /* 0x01c */ + u32 msb; /* 0x020 */ + }; + } llp; +} __packed; struct dw_edma_v0_ch { struct dw_edma_v0_ch_regs wr; /* 0x200 */ u32 padding_1[55]; /* [0x224..0x2fc] */ struct dw_edma_v0_ch_regs rd; /* 0x300 */ u32 padding_2[55]; /* [0x324..0x3fc] */ -}; +} __packed; struct dw_edma_v0_unroll { u32 padding_1; /* 0x0f8 */ u32 wr_engine_chgroup; /* 0x100 */ u32 rd_engine_chgroup; /* 0x104 */ - u32 wr_engine_hshake_cnt_low; /* 0x108 */ - u32 wr_engine_hshake_cnt_high; /* 0x10c */ + union { + u64 reg; /* 0x108..0x10c */ + struct { + u32 lsb; /* 0x108 */ + u32 msb; /* 0x10c */ + }; + } wr_engine_hshake_cnt; u32 padding_2[2]; /* [0x110..0x114] */ - u32 rd_engine_hshake_cnt_low; /* 0x118 */ - u32 rd_engine_hshake_cnt_high; /* 0x11c */ + union { + u64 reg; /* 0x120..0x124 */ + struct { + u32 lsb; /* 0x120 */ + u32 msb; /* 0x124 */ + }; + } rd_engine_hshake_cnt; u32 padding_3[2]; /* [0x120..0x124] */ u32 wr_ch0_pwr_en; /* 0x128 */ u32 wr_ch1_pwr_en; /* 0x12c */ @@ -72,12 +97,12 @@ struct dw_edma_v0_unroll { u32 rd_ch7_pwr_en; /* 0x184 */ u32 padding_5[30]; /* [0x188..0x1fc] */ struct dw_edma_v0_ch ch[EDMA_V0_MAX_NR_CH]; /* [0x200..0x1120] */ -}; +} __packed; struct dw_edma_v0_legacy { u32 viewport_sel; /* 0x0f8 */ struct dw_edma_v0_ch_regs ch; /* [0x100..0x120] */ -}; +} __packed; struct dw_edma_v0_regs { /* eDMA global registers */ @@ -87,14 +112,24 @@ struct dw_edma_v0_regs { u32 wr_engine_en; /* 0x00c */ u32 wr_doorbell; /* 0x010 */ u32 padding_2; /* 0x014 */ - u32 wr_ch_arb_weight_low; /* 0x018 */ - u32 wr_ch_arb_weight_high; /* 0x01c */ + union { + u64 reg; /* 0x018..0x01c */ + struct { + u32 lsb; /* 0x018 */ + u32 msb; /* 0x01c */ + }; + } wr_ch_arb_weight; u32 padding_3[3]; /* [0x020..0x028] */ u32 rd_engine_en; /* 0x02c */ u32 rd_doorbell; /* 0x030 */ u32 padding_4; /* 0x034 */ - u32 rd_ch_arb_weight_low; /* 0x038 */ - u32 rd_ch_arb_weight_high; /* 0x03c */ + union { + u64 reg; /* 0x038..0x03c */ + struct { + u32 lsb; /* 0x038 */ + u32 msb; /* 0x03c */ + }; + } rd_ch_arb_weight; u32 padding_5[3]; /* [0x040..0x048] */ /* eDMA interrupts registers */ u32 wr_int_status; /* 0x04c */ @@ -102,10 +137,20 @@ struct dw_edma_v0_regs { u32 wr_int_mask; /* 0x054 */ u32 wr_int_clear; /* 0x058 */ u32 wr_err_status; /* 0x05c */ - u32 wr_done_imwr_low; /* 0x060 */ - u32 wr_done_imwr_high; /* 0x064 */ - u32 wr_abort_imwr_low; /* 0x068 */ - u32 wr_abort_imwr_high; /* 0x06c */ + union { + u64 reg; /* 0x060..0x064 */ + struct { + u32 lsb; /* 0x060 */ + u32 msb; /* 0x064 */ + }; + } wr_done_imwr; + union { + u64 reg; /* 0x068..0x06c */ + struct { + u32 lsb; /* 0x068 */ + u32 msb; /* 0x06c */ + }; + } wr_abort_imwr; u32 wr_ch01_imwr_data; /* 0x070 */ u32 wr_ch23_imwr_data; /* 0x074 */ u32 wr_ch45_imwr_data; /* 0x078 */ @@ -118,15 +163,30 @@ struct dw_edma_v0_regs { u32 rd_int_mask; /* 0x0a8 */ u32 rd_int_clear; /* 0x0ac */ u32 padding_10; /* 0x0b0 */ - u32 rd_err_status_low; /* 0x0b4 */ - u32 rd_err_status_high; /* 0x0b8 */ + union { + u64 reg; /* 0x0b4..0x0b8 */ + struct { + u32 lsb; /* 0x0b4 */ + u32 msb; /* 0x0b8 */ + }; + } rd_err_status; u32 padding_11[2]; /* [0x0bc..0x0c0] */ u32 rd_linked_list_err_en; /* 0x0c4 */ u32 padding_12; /* 0x0c8 */ - u32 rd_done_imwr_low; /* 0x0cc */ - u32 rd_done_imwr_high; /* 0x0d0 */ - u32 rd_abort_imwr_low; /* 0x0d4 */ - u32 rd_abort_imwr_high; /* 0x0d8 */ + union { + u64 reg; /* 0x0cc..0x0d0 */ + struct { + u32 lsb; /* 0x0cc */ + u32 msb; /* 0x0d0 */ + }; + } rd_done_imwr; + union { + u64 reg; /* 0x0d4..0x0d8 */ + struct { + u32 lsb; /* 0x0d4 */ + u32 msb; /* 0x0d8 */ + }; + } rd_abort_imwr; u32 rd_ch01_imwr_data; /* 0x0dc */ u32 rd_ch23_imwr_data; /* 0x0e0 */ u32 rd_ch45_imwr_data; /* 0x0e4 */ @@ -137,22 +197,37 @@ struct dw_edma_v0_regs { struct dw_edma_v0_legacy legacy; /* [0x0f8..0x120] */ struct dw_edma_v0_unroll unroll; /* [0x0f8..0x1120] */ } type; -}; +} __packed; struct dw_edma_v0_lli { u32 control; u32 transfer_size; - u32 sar_low; - u32 sar_high; - u32 dar_low; - u32 dar_high; -}; + union { + u64 reg; + struct { + u32 lsb; + u32 msb; + }; + } sar; + union { + u64 reg; + struct { + u32 lsb; + u32 msb; + }; + } dar; +} __packed; struct dw_edma_v0_llp { u32 control; u32 reserved; - u32 llp_low; - u32 llp_high; -}; + union { + u64 reg; + struct { + u32 lsb; + u32 msb; + }; + } llp; +} __packed; #endif /* _DW_EDMA_V0_REGS_H */ From b79f17517ad8c928c3acb1c89bcca9e242b29c84 Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:03:56 +0100 Subject: [PATCH 0090/1379] dmaengine: dw-edma: Fix comments offset characters' alignment Fix comments offset characters' alignment to follow the same structure of similar comments. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/8e0e1e46e1c1a78fe62d08c4ee09fb96254a9393.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-v0-regs.h | 214 +++++++++++++------------- 1 file changed, 107 insertions(+), 107 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-v0-regs.h b/drivers/dma/dw-edma/dw-edma-v0-regs.h index d07151d6ecd8..e175f7b20480 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-regs.h +++ b/drivers/dma/dw-edma/dw-edma-v0-regs.h @@ -25,177 +25,177 @@ #define EDMA_V0_CH_EVEN_MSI_DATA_MASK GENMASK(15, 0) struct dw_edma_v0_ch_regs { - u32 ch_control1; /* 0x000 */ - u32 ch_control2; /* 0x004 */ - u32 transfer_size; /* 0x008 */ + u32 ch_control1; /* 0x0000 */ + u32 ch_control2; /* 0x0004 */ + u32 transfer_size; /* 0x0008 */ union { - u64 reg; /* 0x00c..0x010 */ + u64 reg; /* 0x000c..0x0010 */ struct { - u32 lsb; /* 0x00c */ - u32 msb; /* 0x010 */ + u32 lsb; /* 0x000c */ + u32 msb; /* 0x0010 */ }; } sar; union { - u64 reg; /* 0x014..0x018 */ + u64 reg; /* 0x0014..0x0018 */ struct { - u32 lsb; /* 0x014 */ - u32 msb; /* 0x018 */ + u32 lsb; /* 0x0014 */ + u32 msb; /* 0x0018 */ }; } dar; union { - u64 reg; /* 0x01c..0x020 */ + u64 reg; /* 0x001c..0x0020 */ struct { - u32 lsb; /* 0x01c */ - u32 msb; /* 0x020 */ + u32 lsb; /* 0x001c */ + u32 msb; /* 0x0020 */ }; } llp; } __packed; struct dw_edma_v0_ch { - struct dw_edma_v0_ch_regs wr; /* 0x200 */ - u32 padding_1[55]; /* [0x224..0x2fc] */ - struct dw_edma_v0_ch_regs rd; /* 0x300 */ - u32 padding_2[55]; /* [0x324..0x3fc] */ + struct dw_edma_v0_ch_regs wr; /* 0x0200 */ + u32 padding_1[55]; /* 0x0224..0x02fc */ + struct dw_edma_v0_ch_regs rd; /* 0x0300 */ + u32 padding_2[55]; /* 0x0324..0x03fc */ } __packed; struct dw_edma_v0_unroll { - u32 padding_1; /* 0x0f8 */ - u32 wr_engine_chgroup; /* 0x100 */ - u32 rd_engine_chgroup; /* 0x104 */ + u32 padding_1; /* 0x00f8 */ + u32 wr_engine_chgroup; /* 0x0100 */ + u32 rd_engine_chgroup; /* 0x0104 */ union { - u64 reg; /* 0x108..0x10c */ + u64 reg; /* 0x0108..0x010c */ struct { - u32 lsb; /* 0x108 */ - u32 msb; /* 0x10c */ + u32 lsb; /* 0x0108 */ + u32 msb; /* 0x010c */ }; } wr_engine_hshake_cnt; - u32 padding_2[2]; /* [0x110..0x114] */ + u32 padding_2[2]; /* 0x0110..0x0114 */ union { - u64 reg; /* 0x120..0x124 */ + u64 reg; /* 0x0120..0x0124 */ struct { - u32 lsb; /* 0x120 */ - u32 msb; /* 0x124 */ + u32 lsb; /* 0x0120 */ + u32 msb; /* 0x0124 */ }; } rd_engine_hshake_cnt; - u32 padding_3[2]; /* [0x120..0x124] */ - u32 wr_ch0_pwr_en; /* 0x128 */ - u32 wr_ch1_pwr_en; /* 0x12c */ - u32 wr_ch2_pwr_en; /* 0x130 */ - u32 wr_ch3_pwr_en; /* 0x134 */ - u32 wr_ch4_pwr_en; /* 0x138 */ - u32 wr_ch5_pwr_en; /* 0x13c */ - u32 wr_ch6_pwr_en; /* 0x140 */ - u32 wr_ch7_pwr_en; /* 0x144 */ - u32 padding_4[8]; /* [0x148..0x164] */ - u32 rd_ch0_pwr_en; /* 0x168 */ - u32 rd_ch1_pwr_en; /* 0x16c */ - u32 rd_ch2_pwr_en; /* 0x170 */ - u32 rd_ch3_pwr_en; /* 0x174 */ - u32 rd_ch4_pwr_en; /* 0x178 */ - u32 rd_ch5_pwr_en; /* 0x18c */ - u32 rd_ch6_pwr_en; /* 0x180 */ - u32 rd_ch7_pwr_en; /* 0x184 */ - u32 padding_5[30]; /* [0x188..0x1fc] */ - struct dw_edma_v0_ch ch[EDMA_V0_MAX_NR_CH]; /* [0x200..0x1120] */ + u32 padding_3[2]; /* 0x0120..0x0124 */ + u32 wr_ch0_pwr_en; /* 0x0128 */ + u32 wr_ch1_pwr_en; /* 0x012c */ + u32 wr_ch2_pwr_en; /* 0x0130 */ + u32 wr_ch3_pwr_en; /* 0x0134 */ + u32 wr_ch4_pwr_en; /* 0x0138 */ + u32 wr_ch5_pwr_en; /* 0x013c */ + u32 wr_ch6_pwr_en; /* 0x0140 */ + u32 wr_ch7_pwr_en; /* 0x0144 */ + u32 padding_4[8]; /* 0x0148..0x0164 */ + u32 rd_ch0_pwr_en; /* 0x0168 */ + u32 rd_ch1_pwr_en; /* 0x016c */ + u32 rd_ch2_pwr_en; /* 0x0170 */ + u32 rd_ch3_pwr_en; /* 0x0174 */ + u32 rd_ch4_pwr_en; /* 0x0178 */ + u32 rd_ch5_pwr_en; /* 0x018c */ + u32 rd_ch6_pwr_en; /* 0x0180 */ + u32 rd_ch7_pwr_en; /* 0x0184 */ + u32 padding_5[30]; /* 0x0188..0x01fc */ + struct dw_edma_v0_ch ch[EDMA_V0_MAX_NR_CH]; /* 0x0200..0x1120 */ } __packed; struct dw_edma_v0_legacy { - u32 viewport_sel; /* 0x0f8 */ - struct dw_edma_v0_ch_regs ch; /* [0x100..0x120] */ + u32 viewport_sel; /* 0x00f8 */ + struct dw_edma_v0_ch_regs ch; /* 0x0100..0x0120 */ } __packed; struct dw_edma_v0_regs { /* eDMA global registers */ - u32 ctrl_data_arb_prior; /* 0x000 */ - u32 padding_1; /* 0x004 */ - u32 ctrl; /* 0x008 */ - u32 wr_engine_en; /* 0x00c */ - u32 wr_doorbell; /* 0x010 */ - u32 padding_2; /* 0x014 */ + u32 ctrl_data_arb_prior; /* 0x0000 */ + u32 padding_1; /* 0x0004 */ + u32 ctrl; /* 0x0008 */ + u32 wr_engine_en; /* 0x000c */ + u32 wr_doorbell; /* 0x0010 */ + u32 padding_2; /* 0x0014 */ union { - u64 reg; /* 0x018..0x01c */ + u64 reg; /* 0x0018..0x001c */ struct { - u32 lsb; /* 0x018 */ - u32 msb; /* 0x01c */ + u32 lsb; /* 0x0018 */ + u32 msb; /* 0x001c */ }; } wr_ch_arb_weight; - u32 padding_3[3]; /* [0x020..0x028] */ - u32 rd_engine_en; /* 0x02c */ - u32 rd_doorbell; /* 0x030 */ - u32 padding_4; /* 0x034 */ + u32 padding_3[3]; /* 0x0020..0x0028 */ + u32 rd_engine_en; /* 0x002c */ + u32 rd_doorbell; /* 0x0030 */ + u32 padding_4; /* 0x0034 */ union { - u64 reg; /* 0x038..0x03c */ + u64 reg; /* 0x0038..0x003c */ struct { - u32 lsb; /* 0x038 */ - u32 msb; /* 0x03c */ + u32 lsb; /* 0x0038 */ + u32 msb; /* 0x003c */ }; } rd_ch_arb_weight; - u32 padding_5[3]; /* [0x040..0x048] */ + u32 padding_5[3]; /* 0x0040..0x0048 */ /* eDMA interrupts registers */ - u32 wr_int_status; /* 0x04c */ - u32 padding_6; /* 0x050 */ - u32 wr_int_mask; /* 0x054 */ - u32 wr_int_clear; /* 0x058 */ - u32 wr_err_status; /* 0x05c */ + u32 wr_int_status; /* 0x004c */ + u32 padding_6; /* 0x0050 */ + u32 wr_int_mask; /* 0x0054 */ + u32 wr_int_clear; /* 0x0058 */ + u32 wr_err_status; /* 0x005c */ union { - u64 reg; /* 0x060..0x064 */ + u64 reg; /* 0x0060..0x0064 */ struct { - u32 lsb; /* 0x060 */ - u32 msb; /* 0x064 */ + u32 lsb; /* 0x0060 */ + u32 msb; /* 0x0064 */ }; } wr_done_imwr; union { - u64 reg; /* 0x068..0x06c */ + u64 reg; /* 0x0068..0x006c */ struct { - u32 lsb; /* 0x068 */ - u32 msb; /* 0x06c */ + u32 lsb; /* 0x0068 */ + u32 msb; /* 0x006c */ }; } wr_abort_imwr; - u32 wr_ch01_imwr_data; /* 0x070 */ - u32 wr_ch23_imwr_data; /* 0x074 */ - u32 wr_ch45_imwr_data; /* 0x078 */ - u32 wr_ch67_imwr_data; /* 0x07c */ - u32 padding_7[4]; /* [0x080..0x08c] */ - u32 wr_linked_list_err_en; /* 0x090 */ - u32 padding_8[3]; /* [0x094..0x09c] */ - u32 rd_int_status; /* 0x0a0 */ - u32 padding_9; /* 0x0a4 */ - u32 rd_int_mask; /* 0x0a8 */ - u32 rd_int_clear; /* 0x0ac */ - u32 padding_10; /* 0x0b0 */ + u32 wr_ch01_imwr_data; /* 0x0070 */ + u32 wr_ch23_imwr_data; /* 0x0074 */ + u32 wr_ch45_imwr_data; /* 0x0078 */ + u32 wr_ch67_imwr_data; /* 0x007c */ + u32 padding_7[4]; /* 0x0080..0x008c */ + u32 wr_linked_list_err_en; /* 0x0090 */ + u32 padding_8[3]; /* 0x0094..0x009c */ + u32 rd_int_status; /* 0x00a0 */ + u32 padding_9; /* 0x00a4 */ + u32 rd_int_mask; /* 0x00a8 */ + u32 rd_int_clear; /* 0x00ac */ + u32 padding_10; /* 0x00b0 */ union { - u64 reg; /* 0x0b4..0x0b8 */ + u64 reg; /* 0x00b4..0x00b8 */ struct { - u32 lsb; /* 0x0b4 */ - u32 msb; /* 0x0b8 */ + u32 lsb; /* 0x00b4 */ + u32 msb; /* 0x00b8 */ }; } rd_err_status; - u32 padding_11[2]; /* [0x0bc..0x0c0] */ - u32 rd_linked_list_err_en; /* 0x0c4 */ - u32 padding_12; /* 0x0c8 */ + u32 padding_11[2]; /* 0x00bc..0x00c0 */ + u32 rd_linked_list_err_en; /* 0x00c4 */ + u32 padding_12; /* 0x00c8 */ union { - u64 reg; /* 0x0cc..0x0d0 */ + u64 reg; /* 0x00cc..0x00d0 */ struct { - u32 lsb; /* 0x0cc */ - u32 msb; /* 0x0d0 */ + u32 lsb; /* 0x00cc */ + u32 msb; /* 0x00d0 */ }; } rd_done_imwr; union { - u64 reg; /* 0x0d4..0x0d8 */ + u64 reg; /* 0x00d4..0x00d8 */ struct { - u32 lsb; /* 0x0d4 */ - u32 msb; /* 0x0d8 */ + u32 lsb; /* 0x00d4 */ + u32 msb; /* 0x00d8 */ }; } rd_abort_imwr; - u32 rd_ch01_imwr_data; /* 0x0dc */ - u32 rd_ch23_imwr_data; /* 0x0e0 */ - u32 rd_ch45_imwr_data; /* 0x0e4 */ - u32 rd_ch67_imwr_data; /* 0x0e8 */ - u32 padding_13[4]; /* [0x0ec..0x0f8] */ + u32 rd_ch01_imwr_data; /* 0x00dc */ + u32 rd_ch23_imwr_data; /* 0x00e0 */ + u32 rd_ch45_imwr_data; /* 0x00e4 */ + u32 rd_ch67_imwr_data; /* 0x00e8 */ + u32 padding_13[4]; /* 0x00ec..0x00f8 */ /* eDMA channel context grouping */ union dw_edma_v0_type { - struct dw_edma_v0_legacy legacy; /* [0x0f8..0x120] */ - struct dw_edma_v0_unroll unroll; /* [0x0f8..0x1120] */ + struct dw_edma_v0_legacy legacy; /* 0x00f8..0x0120 */ + struct dw_edma_v0_unroll unroll; /* 0x00f8..0x1120 */ } type; } __packed; From e0c1d53891c43a70c9fa85ddb3174ab5afd7e2ec Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:03:57 +0100 Subject: [PATCH 0091/1379] dmaengine: dw-edma: Add support for the HDMA feature Add support for the HDMA feature. This new feature enables the current eDMA IP to use a deeper prefetch of the linked list, which reduces the algorithm execution latency observed when loading the elements of the list, causing more stable and higher data transfer. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/5f40f89ef7d6255a12d5b23f34e6e59dcd28861e.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.h | 10 +++--- drivers/dma/dw-edma/dw-edma-pcie.c | 23 ++++++------- drivers/dma/dw-edma/dw-edma-v0-core.c | 42 ++++++++++++++++++++++-- drivers/dma/dw-edma/dw-edma-v0-debugfs.c | 9 +++-- 4 files changed, 60 insertions(+), 24 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h index 31fc50d31792..3f9593ede881 100644 --- a/drivers/dma/dw-edma/dw-edma-core.h +++ b/drivers/dma/dw-edma/dw-edma-core.h @@ -21,9 +21,10 @@ enum dw_edma_dir { EDMA_DIR_READ }; -enum dw_edma_mode { - EDMA_MODE_LEGACY = 0, - EDMA_MODE_UNROLL +enum dw_edma_map_format { + EDMA_MF_EDMA_LEGACY = 0x0, + EDMA_MF_EDMA_UNROLL = 0x1, + EDMA_MF_HDMA_COMPAT = 0x5 }; enum dw_edma_request { @@ -123,8 +124,7 @@ struct dw_edma { struct dw_edma_irq *irq; int nr_irqs; - u32 version; - enum dw_edma_mode mode; + enum dw_edma_map_format mf; struct dw_edma_chan *chan; const struct dw_edma_core_ops *ops; diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c index 1eafc602e17e..c130549d3262 100644 --- a/drivers/dma/dw-edma/dw-edma-pcie.c +++ b/drivers/dma/dw-edma/dw-edma-pcie.c @@ -30,8 +30,7 @@ struct dw_edma_pcie_data { off_t dt_off; size_t dt_sz; /* Other */ - u32 version; - enum dw_edma_mode mode; + enum dw_edma_map_format mf; u8 irqs; }; @@ -49,8 +48,7 @@ static const struct dw_edma_pcie_data snps_edda_data = { .dt_off = 0x00800000, /* 8 Mbytes */ .dt_sz = 0x03800000, /* 56 Mbytes */ /* Other */ - .version = 0, - .mode = EDMA_MODE_UNROLL, + .mf = EDMA_MF_EDMA_UNROLL, .irqs = 1, }; @@ -69,8 +67,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, const struct dw_edma_pcie_data *pdata = (void *)pid->driver_data; struct device *dev = &pdev->dev; struct dw_edma_chip *chip; - int err, nr_irqs; struct dw_edma *dw; + int err, nr_irqs; /* Enable PCI device */ err = pcim_enable_device(pdev); @@ -157,16 +155,19 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, dw->dt_region.paddr += pdata->dt_off; dw->dt_region.sz = pdata->dt_sz; - dw->version = pdata->version; - dw->mode = pdata->mode; + dw->mf = pdata->mf; dw->nr_irqs = nr_irqs; dw->ops = &dw_edma_pcie_core_ops; /* Debug info */ - pci_dbg(pdev, "Version:\t%u\n", dw->version); - - pci_dbg(pdev, "Mode:\t%s\n", - dw->mode == EDMA_MODE_LEGACY ? "Legacy" : "Unroll"); + if (dw->mf == EDMA_MF_EDMA_LEGACY) + pci_dbg(pdev, "Version:\teDMA Port Logic (0x%x)\n", dw->mf); + else if (dw->mf == EDMA_MF_EDMA_UNROLL) + pci_dbg(pdev, "Version:\teDMA Unroll (0x%x)\n", dw->mf); + else if (dw->mf == EDMA_MF_HDMA_COMPAT) + pci_dbg(pdev, "Version:\tHDMA Compatible (0x%x)\n", dw->mf); + else + pci_dbg(pdev, "Version:\tUnknown (0x%x)\n", dw->mf); pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", pdata->rg_bar, pdata->rg_off, pdata->rg_sz, diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c index 7888eda52845..5b0541a3a094 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-core.c +++ b/drivers/dma/dw-edma/dw-edma-v0-core.c @@ -96,7 +96,7 @@ static inline struct dw_edma_v0_regs __iomem *__dw_regs(struct dw_edma *dw) static inline struct dw_edma_v0_ch_regs __iomem * __dw_ch_regs(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch) { - if (dw->mode == EDMA_MODE_LEGACY) + if (dw->mf == EDMA_MF_EDMA_LEGACY) return &(__dw_regs(dw)->type.legacy.ch); if (dir == EDMA_DIR_WRITE) @@ -108,7 +108,7 @@ __dw_ch_regs(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch) static inline void writel_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch, u32 value, void __iomem *addr) { - if (dw->mode == EDMA_MODE_LEGACY) { + if (dw->mf == EDMA_MF_EDMA_LEGACY) { u32 viewport_sel; unsigned long flags; @@ -133,7 +133,7 @@ static inline u32 readl_ch(struct dw_edma *dw, enum dw_edma_dir dir, u16 ch, { u32 value; - if (dw->mode == EDMA_MODE_LEGACY) { + if (dw->mf == EDMA_MF_EDMA_LEGACY) { u32 viewport_sel; unsigned long flags; @@ -365,6 +365,42 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first) if (first) { /* Enable engine */ SET_RW_32(dw, chan->dir, engine_en, BIT(0)); + if (dw->mf == EDMA_MF_HDMA_COMPAT) { + switch (chan->id) { + case 0: + SET_RW_COMPAT(dw, chan->dir, ch0_pwr_en, + BIT(0)); + break; + case 1: + SET_RW_COMPAT(dw, chan->dir, ch1_pwr_en, + BIT(0)); + break; + case 2: + SET_RW_COMPAT(dw, chan->dir, ch2_pwr_en, + BIT(0)); + break; + case 3: + SET_RW_COMPAT(dw, chan->dir, ch3_pwr_en, + BIT(0)); + break; + case 4: + SET_RW_COMPAT(dw, chan->dir, ch4_pwr_en, + BIT(0)); + break; + case 5: + SET_RW_COMPAT(dw, chan->dir, ch5_pwr_en, + BIT(0)); + break; + case 6: + SET_RW_COMPAT(dw, chan->dir, ch6_pwr_en, + BIT(0)); + break; + case 7: + SET_RW_COMPAT(dw, chan->dir, ch7_pwr_en, + BIT(0)); + break; + } + } /* Interrupt unmask - done, abort */ tmp = GET_RW_32(dw, chan->dir, int_mask); tmp &= ~FIELD_PREP(EDMA_V0_DONE_INT_MASK, BIT(chan->id)); diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c index a5e2783a26f5..157dfc2e2430 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c +++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c @@ -55,7 +55,7 @@ struct debugfs_entries { static int dw_edma_debugfs_u32_get(void *data, u64 *val) { void __iomem *reg = (void __force __iomem *)data; - if (dw->mode == EDMA_MODE_LEGACY && + if (dw->mf == EDMA_MF_EDMA_LEGACY && reg >= (void __iomem *)®s->type.legacy.ch) { void __iomem *ptr = ®s->type.legacy.ch; u32 viewport_sel = 0; @@ -174,7 +174,7 @@ static void dw_edma_debugfs_regs_wr(struct dentry *dir) nr_entries = ARRAY_SIZE(debugfs_regs); dw_edma_debugfs_create_x32(debugfs_regs, nr_entries, regs_dir); - if (dw->mode == EDMA_MODE_UNROLL) { + if (dw->mf == EDMA_MF_HDMA_COMPAT) { nr_entries = ARRAY_SIZE(debugfs_unroll_regs); dw_edma_debugfs_create_x32(debugfs_unroll_regs, nr_entries, regs_dir); @@ -243,7 +243,7 @@ static void dw_edma_debugfs_regs_rd(struct dentry *dir) nr_entries = ARRAY_SIZE(debugfs_regs); dw_edma_debugfs_create_x32(debugfs_regs, nr_entries, regs_dir); - if (dw->mode == EDMA_MODE_UNROLL) { + if (dw->mf == EDMA_MF_HDMA_COMPAT) { nr_entries = ARRAY_SIZE(debugfs_unroll_regs); dw_edma_debugfs_create_x32(debugfs_unroll_regs, nr_entries, regs_dir); @@ -297,8 +297,7 @@ void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip) if (!base_dir) return; - debugfs_create_u32("version", 0444, base_dir, &dw->version); - debugfs_create_u32("mode", 0444, base_dir, &dw->mode); + debugfs_create_u32("mf", 0444, base_dir, &dw->mf); debugfs_create_u16("wr_ch_cnt", 0444, base_dir, &dw->wr_ch_cnt); debugfs_create_u16("rd_ch_cnt", 0444, base_dir, &dw->rd_ch_cnt); From c124fd9a969acaa83f6dfa5e160a99a500af9e4b Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:03:58 +0100 Subject: [PATCH 0092/1379] PCI: Add pci_find_vsec_capability() to find a specific VSEC Add pci_find_vsec_capability() to locate a Vendor-Specific Extended Capability with the specified VSEC ID. The Vendor-Specific Extended Capability (VSEC) allows one or more proprietary capabilities defined by the vendor which aren't standard or shared between vendors. Signed-off-by: Gustavo Pimentel Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/d89506834fb11c6fa0bd5d515c0dd55b13ac6958.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/pci/pci.c | 30 ++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 31 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 16a17215f633..97c0f2ec922b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -692,6 +692,36 @@ u8 pci_find_ht_capability(struct pci_dev *dev, int ht_cap) } EXPORT_SYMBOL_GPL(pci_find_ht_capability); +/** + * pci_find_vsec_capability - Find a vendor-specific extended capability + * @dev: PCI device to query + * @vendor: Vendor ID for which capability is defined + * @cap: Vendor-specific capability ID + * + * If @dev has Vendor ID @vendor, search for a VSEC capability with + * VSEC ID @cap. If found, return the capability offset in + * config space; otherwise return 0. + */ +u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap) +{ + u16 vsec = 0; + u32 header; + + if (vendor != dev->vendor) + return 0; + + while ((vsec = pci_find_next_ext_capability(dev, vsec, + PCI_EXT_CAP_ID_VNDR))) { + if (pci_read_config_dword(dev, vsec + PCI_VNDR_HEADER, + &header) == PCIBIOS_SUCCESSFUL && + PCI_VNDR_HEADER_ID(header) == cap) + return vsec; + } + + return 0; +} +EXPORT_SYMBOL_GPL(pci_find_vsec_capability); + /** * pci_find_parent_resource - return resource region of parent bus of given * region diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..59e731a2d1f8 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1077,6 +1077,7 @@ u8 pci_find_next_ht_capability(struct pci_dev *dev, u8 pos, int ht_cap); u16 pci_find_ext_capability(struct pci_dev *dev, int cap); u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 pos, int cap); struct pci_bus *pci_find_next_bus(const struct pci_bus *from); +u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap); u64 pci_get_dsn(struct pci_dev *dev); From 1aef6ffe999eec7b7fdcfad7ffef9c157727ffcb Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:03:59 +0100 Subject: [PATCH 0093/1379] dmaengine: dw-edma: Add PCIe VSEC data retrieval support The latest eDMA IP development implements a Vendor-Specific Extended Capability that contains the eDMA BAR, offset, map format, and the number of read/write channels available. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/0b880b8893ff457ffc1b5071a1c7f47e61ceea1c.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 20 +++--- drivers/dma/dw-edma/dw-edma-pcie.c | 110 ++++++++++++++++++++++------- 2 files changed, 98 insertions(+), 32 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index 08d71dafa001..18a2878d2c50 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -863,15 +863,19 @@ int dw_edma_probe(struct dw_edma_chip *chip) raw_spin_lock_init(&dw->lock); - /* Find out how many write channels are supported by hardware */ - dw->wr_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE); - if (!dw->wr_ch_cnt) - return -EINVAL; + if (!dw->wr_ch_cnt) { + /* Find out how many write channels are supported by hardware */ + dw->wr_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE); + if (!dw->wr_ch_cnt) + return -EINVAL; + } - /* Find out how many read channels are supported by hardware */ - dw->rd_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ); - if (!dw->rd_ch_cnt) - return -EINVAL; + if (!dw->rd_ch_cnt) { + /* Find out how many read channels are supported by hardware */ + dw->rd_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ); + if (!dw->rd_ch_cnt) + return -EINVAL; + } dev_vdbg(dev, "Channels:\twrite=%d, read=%d\n", dw->wr_ch_cnt, dw->rd_ch_cnt); diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c index c130549d3262..eb6f8b399cc7 100644 --- a/drivers/dma/dw-edma/dw-edma-pcie.c +++ b/drivers/dma/dw-edma/dw-edma-pcie.c @@ -13,9 +13,16 @@ #include #include #include +#include #include "dw-edma-core.h" +#define DW_PCIE_VSEC_DMA_ID 0x6 +#define DW_PCIE_VSEC_DMA_BAR GENMASK(10, 8) +#define DW_PCIE_VSEC_DMA_MAP GENMASK(2, 0) +#define DW_PCIE_VSEC_DMA_RD_CH GENMASK(25, 16) +#define DW_PCIE_VSEC_DMA_WR_CH GENMASK(9, 0) + struct dw_edma_pcie_data { /* eDMA registers location */ enum pci_barno rg_bar; @@ -32,6 +39,8 @@ struct dw_edma_pcie_data { /* Other */ enum dw_edma_map_format mf; u8 irqs; + u16 rd_ch_cnt; + u16 wr_ch_cnt; }; static const struct dw_edma_pcie_data snps_edda_data = { @@ -50,6 +59,8 @@ static const struct dw_edma_pcie_data snps_edda_data = { /* Other */ .mf = EDMA_MF_EDMA_UNROLL, .irqs = 1, + .rd_ch_cnt = 0, + .wr_ch_cnt = 0, }; static int dw_edma_pcie_irq_vector(struct device *dev, unsigned int nr) @@ -61,10 +72,51 @@ static const struct dw_edma_core_ops dw_edma_pcie_core_ops = { .irq_vector = dw_edma_pcie_irq_vector, }; +static void dw_edma_pcie_get_vsec_dma_data(struct pci_dev *pdev, + struct dw_edma_pcie_data *pdata) +{ + u32 val, map; + u16 vsec; + u64 off; + + vsec = pci_find_vsec_capability(pdev, PCI_VENDOR_ID_SYNOPSYS, + DW_PCIE_VSEC_DMA_ID); + if (!vsec) + return; + + pci_read_config_dword(pdev, vsec + PCI_VNDR_HEADER, &val); + if (PCI_VNDR_HEADER_REV(val) != 0x00 || + PCI_VNDR_HEADER_LEN(val) != 0x18) + return; + + pci_dbg(pdev, "Detected PCIe Vendor-Specific Extended Capability DMA\n"); + pci_read_config_dword(pdev, vsec + 0x8, &val); + map = FIELD_GET(DW_PCIE_VSEC_DMA_MAP, val); + if (map != EDMA_MF_EDMA_LEGACY && + map != EDMA_MF_EDMA_UNROLL && + map != EDMA_MF_HDMA_COMPAT) + return; + + pdata->mf = map; + pdata->rg_bar = FIELD_GET(DW_PCIE_VSEC_DMA_BAR, val); + + pci_read_config_dword(pdev, vsec + 0xc, &val); + pdata->rd_ch_cnt = FIELD_GET(DW_PCIE_VSEC_DMA_RD_CH, val); + pdata->wr_ch_cnt = FIELD_GET(DW_PCIE_VSEC_DMA_WR_CH, val); + + pci_read_config_dword(pdev, vsec + 0x14, &val); + off = val; + pci_read_config_dword(pdev, vsec + 0x10, &val); + off <<= 32; + off |= val; + pdata->rg_off = off; +} + static int dw_edma_pcie_probe(struct pci_dev *pdev, const struct pci_device_id *pid) { - const struct dw_edma_pcie_data *pdata = (void *)pid->driver_data; + struct dw_edma_pcie_data *pdata = (void *)pid->driver_data; + struct dw_edma_pcie_data vsec_data; struct device *dev = &pdev->dev; struct dw_edma_chip *chip; struct dw_edma *dw; @@ -77,10 +129,18 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return err; } + memcpy(&vsec_data, pdata, sizeof(struct dw_edma_pcie_data)); + + /* + * Tries to find if exists a PCIe Vendor-Specific Extended Capability + * for the DMA, if one exists, then reconfigures it. + */ + dw_edma_pcie_get_vsec_dma_data(pdev, &vsec_data); + /* Mapping PCI BAR regions */ - err = pcim_iomap_regions(pdev, BIT(pdata->rg_bar) | - BIT(pdata->ll_bar) | - BIT(pdata->dt_bar), + err = pcim_iomap_regions(pdev, BIT(vsec_data.rg_bar) | + BIT(vsec_data.ll_bar) | + BIT(vsec_data.dt_bar), pci_name(pdev)); if (err) { pci_err(pdev, "eDMA BAR I/O remapping failed\n"); @@ -123,7 +183,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return -ENOMEM; /* IRQs allocation */ - nr_irqs = pci_alloc_irq_vectors(pdev, 1, pdata->irqs, + nr_irqs = pci_alloc_irq_vectors(pdev, 1, vsec_data.irqs, PCI_IRQ_MSI | PCI_IRQ_MSIX); if (nr_irqs < 1) { pci_err(pdev, "fail to alloc IRQ vector (number of IRQs=%u)\n", @@ -137,27 +197,29 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, chip->id = pdev->devfn; chip->irq = pdev->irq; - dw->rg_region.vaddr = pcim_iomap_table(pdev)[pdata->rg_bar]; - dw->rg_region.vaddr += pdata->rg_off; - dw->rg_region.paddr = pdev->resource[pdata->rg_bar].start; - dw->rg_region.paddr += pdata->rg_off; - dw->rg_region.sz = pdata->rg_sz; + dw->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg_bar]; + dw->rg_region.vaddr += vsec_data.rg_off; + dw->rg_region.paddr = pdev->resource[vsec_data.rg_bar].start; + dw->rg_region.paddr += vsec_data.rg_off; + dw->rg_region.sz = vsec_data.rg_sz; - dw->ll_region.vaddr = pcim_iomap_table(pdev)[pdata->ll_bar]; - dw->ll_region.vaddr += pdata->ll_off; - dw->ll_region.paddr = pdev->resource[pdata->ll_bar].start; - dw->ll_region.paddr += pdata->ll_off; - dw->ll_region.sz = pdata->ll_sz; + dw->ll_region.vaddr = pcim_iomap_table(pdev)[vsec_data.ll_bar]; + dw->ll_region.vaddr += vsec_data.ll_off; + dw->ll_region.paddr = pdev->resource[vsec_data.ll_bar].start; + dw->ll_region.paddr += vsec_data.ll_off; + dw->ll_region.sz = vsec_data.ll_sz; - dw->dt_region.vaddr = pcim_iomap_table(pdev)[pdata->dt_bar]; - dw->dt_region.vaddr += pdata->dt_off; - dw->dt_region.paddr = pdev->resource[pdata->dt_bar].start; - dw->dt_region.paddr += pdata->dt_off; - dw->dt_region.sz = pdata->dt_sz; + dw->dt_region.vaddr = pcim_iomap_table(pdev)[vsec_data.dt_bar]; + dw->dt_region.vaddr += vsec_data.dt_off; + dw->dt_region.paddr = pdev->resource[vsec_data.dt_bar].start; + dw->dt_region.paddr += vsec_data.dt_off; + dw->dt_region.sz = vsec_data.dt_sz; - dw->mf = pdata->mf; + dw->mf = vsec_data.mf; dw->nr_irqs = nr_irqs; dw->ops = &dw_edma_pcie_core_ops; + dw->rd_ch_cnt = vsec_data.rd_ch_cnt; + dw->wr_ch_cnt = vsec_data.wr_ch_cnt; /* Debug info */ if (dw->mf == EDMA_MF_EDMA_LEGACY) @@ -170,15 +232,15 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, pci_dbg(pdev, "Version:\tUnknown (0x%x)\n", dw->mf); pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", - pdata->rg_bar, pdata->rg_off, pdata->rg_sz, + vsec_data.rg_bar, vsec_data.rg_off, vsec_data.rg_sz, dw->rg_region.vaddr, &dw->rg_region.paddr); pci_dbg(pdev, "L. List:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", - pdata->ll_bar, pdata->ll_off, pdata->ll_sz, + vsec_data.ll_bar, vsec_data.ll_off, vsec_data.ll_sz, dw->ll_region.vaddr, &dw->ll_region.paddr); pci_dbg(pdev, "Data:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", - pdata->dt_bar, pdata->dt_off, pdata->dt_sz, + vsec_data.dt_bar, vsec_data.dt_off, vsec_data.dt_sz, dw->dt_region.vaddr, &dw->dt_region.paddr); pci_dbg(pdev, "Nr. IRQs:\t%u\n", dw->nr_irqs); From 85e7518f42c85d339fac0af9f9d025d7e6717f2d Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:00 +0100 Subject: [PATCH 0094/1379] dmaengine: dw-edma: Add device_prep_interleave_dma() support Add device_prep_interleave_dma() support to Synopsys DMA driver. This feature implements a similar data transfer mechanism to the scatter-gather implementation. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/73dc36264910654e266ae25814d892a0476e4427.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 87 +++++++++++++++++++++++------- drivers/dma/dw-edma/dw-edma-core.h | 13 +++-- 2 files changed, 79 insertions(+), 21 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index 18a2878d2c50..1227a3ed45aa 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -329,7 +329,7 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) struct dw_edma_chunk *chunk; struct dw_edma_burst *burst; struct dw_edma_desc *desc; - u32 cnt; + u32 cnt = 0; int i; if (!chan->configured) @@ -352,12 +352,19 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) return NULL; } - if (xfer->cyclic) { + if (xfer->type == EDMA_XFER_CYCLIC) { if (!xfer->xfer.cyclic.len || !xfer->xfer.cyclic.cnt) return NULL; - } else { + } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) { if (xfer->xfer.sg.len < 1) return NULL; + } else if (xfer->type == EDMA_XFER_INTERLEAVED) { + if (!xfer->xfer.il->numf) + return NULL; + if (xfer->xfer.il->numf > 0 && xfer->xfer.il->frame_size > 0) + return NULL; + } else { + return NULL; } desc = dw_edma_alloc_desc(chan); @@ -368,18 +375,28 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) if (unlikely(!chunk)) goto err_alloc; - src_addr = chan->config.src_addr; - dst_addr = chan->config.dst_addr; - - if (xfer->cyclic) { - cnt = xfer->xfer.cyclic.cnt; + if (xfer->type == EDMA_XFER_INTERLEAVED) { + src_addr = xfer->xfer.il->src_start; + dst_addr = xfer->xfer.il->dst_start; } else { + src_addr = chan->config.src_addr; + dst_addr = chan->config.dst_addr; + } + + if (xfer->type == EDMA_XFER_CYCLIC) { + cnt = xfer->xfer.cyclic.cnt; + } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) { cnt = xfer->xfer.sg.len; sg = xfer->xfer.sg.sgl; + } else if (xfer->type == EDMA_XFER_INTERLEAVED) { + if (xfer->xfer.il->numf > 0) + cnt = xfer->xfer.il->numf; + else + cnt = xfer->xfer.il->frame_size; } for (i = 0; i < cnt; i++) { - if (!xfer->cyclic && !sg) + if (xfer->type == EDMA_XFER_SCATTER_GATHER && !sg) break; if (chunk->bursts_alloc == chan->ll_max) { @@ -392,19 +409,21 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) if (unlikely(!burst)) goto err_alloc; - if (xfer->cyclic) + if (xfer->type == EDMA_XFER_CYCLIC) burst->sz = xfer->xfer.cyclic.len; - else + else if (xfer->type == EDMA_XFER_SCATTER_GATHER) burst->sz = sg_dma_len(sg); + else if (xfer->type == EDMA_XFER_INTERLEAVED) + burst->sz = xfer->xfer.il->sgl[i].size; chunk->ll_region.sz += burst->sz; desc->alloc_sz += burst->sz; if (chan->dir == EDMA_DIR_WRITE) { burst->sar = src_addr; - if (xfer->cyclic) { + if (xfer->type == EDMA_XFER_CYCLIC) { burst->dar = xfer->xfer.cyclic.paddr; - } else { + } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) { burst->dar = dst_addr; /* Unlike the typical assumption by other * drivers/IPs the peripheral memory isn't @@ -416,9 +435,9 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) } } else { burst->dar = dst_addr; - if (xfer->cyclic) { + if (xfer->type == EDMA_XFER_CYCLIC) { burst->sar = xfer->xfer.cyclic.paddr; - } else { + } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) { burst->sar = src_addr; /* Unlike the typical assumption by other * drivers/IPs the peripheral memory isn't @@ -430,10 +449,24 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) } } - if (!xfer->cyclic) { + if (xfer->type == EDMA_XFER_SCATTER_GATHER) { src_addr += sg_dma_len(sg); dst_addr += sg_dma_len(sg); sg = sg_next(sg); + } else if (xfer->type == EDMA_XFER_INTERLEAVED && + xfer->xfer.il->frame_size > 0) { + struct dma_interleaved_template *il = xfer->xfer.il; + struct data_chunk *dc = &il->sgl[i]; + + if (il->src_sgl) { + src_addr += burst->sz; + src_addr += dmaengine_get_src_icg(il, dc); + } + + if (il->dst_sgl) { + dst_addr += burst->sz; + dst_addr += dmaengine_get_dst_icg(il, dc); + } } } @@ -459,7 +492,7 @@ dw_edma_device_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl, xfer.xfer.sg.sgl = sgl; xfer.xfer.sg.len = len; xfer.flags = flags; - xfer.cyclic = false; + xfer.type = EDMA_XFER_SCATTER_GATHER; return dw_edma_device_transfer(&xfer); } @@ -478,7 +511,23 @@ dw_edma_device_prep_dma_cyclic(struct dma_chan *dchan, dma_addr_t paddr, xfer.xfer.cyclic.len = len; xfer.xfer.cyclic.cnt = count; xfer.flags = flags; - xfer.cyclic = true; + xfer.type = EDMA_XFER_CYCLIC; + + return dw_edma_device_transfer(&xfer); +} + +static struct dma_async_tx_descriptor * +dw_edma_device_prep_interleaved_dma(struct dma_chan *dchan, + struct dma_interleaved_template *ilt, + unsigned long flags) +{ + struct dw_edma_transfer xfer; + + xfer.dchan = dchan; + xfer.direction = ilt->dir; + xfer.xfer.il = ilt; + xfer.flags = flags; + xfer.type = EDMA_XFER_INTERLEAVED; return dw_edma_device_transfer(&xfer); } @@ -738,6 +787,7 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write, dma_cap_set(DMA_SLAVE, dma->cap_mask); dma_cap_set(DMA_CYCLIC, dma->cap_mask); dma_cap_set(DMA_PRIVATE, dma->cap_mask); + dma_cap_set(DMA_INTERLEAVE, dma->cap_mask); dma->directions = BIT(write ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV); dma->src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); dma->dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); @@ -756,6 +806,7 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write, dma->device_tx_status = dw_edma_device_tx_status; dma->device_prep_slave_sg = dw_edma_device_prep_slave_sg; dma->device_prep_dma_cyclic = dw_edma_device_prep_dma_cyclic; + dma->device_prep_interleaved_dma = dw_edma_device_prep_interleaved_dma; dma_set_max_seg_size(dma->dev, U32_MAX); diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h index 3f9593ede881..f72ebaa118af 100644 --- a/drivers/dma/dw-edma/dw-edma-core.h +++ b/drivers/dma/dw-edma/dw-edma-core.h @@ -39,6 +39,12 @@ enum dw_edma_status { EDMA_ST_BUSY }; +enum dw_edma_xfer_type { + EDMA_XFER_SCATTER_GATHER = 0, + EDMA_XFER_CYCLIC, + EDMA_XFER_INTERLEAVED +}; + struct dw_edma_chan; struct dw_edma_chunk; @@ -146,12 +152,13 @@ struct dw_edma_cyclic { struct dw_edma_transfer { struct dma_chan *dchan; union dw_edma_xfer { - struct dw_edma_sg sg; - struct dw_edma_cyclic cyclic; + struct dw_edma_sg sg; + struct dw_edma_cyclic cyclic; + struct dma_interleaved_template *il; } xfer; enum dma_transfer_direction direction; unsigned long flags; - bool cyclic; + enum dw_edma_xfer_type type; }; static inline From 16b90dd94d3f88b9f43cc06228d2b64d32225e5d Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:01 +0100 Subject: [PATCH 0095/1379] dmaengine: dw-edma: Improve number of channels check It was added some extra checks to ensure that the driver doesn't try to use more DMA channels than actually are available in hardware. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/cfb2b0a4f97ae9dc83ebe5ea59d6a51d69ea3654.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 21 +++++++++------------ drivers/dma/dw-edma/dw-edma-core.h | 2 ++ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index 1227a3ed45aa..cc391070c1bf 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -914,19 +914,16 @@ int dw_edma_probe(struct dw_edma_chip *chip) raw_spin_lock_init(&dw->lock); - if (!dw->wr_ch_cnt) { - /* Find out how many write channels are supported by hardware */ - dw->wr_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE); - if (!dw->wr_ch_cnt) - return -EINVAL; - } + dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt, + dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE)); + dw->wr_ch_cnt = min_t(u16, dw->wr_ch_cnt, EDMA_MAX_WR_CH); - if (!dw->rd_ch_cnt) { - /* Find out how many read channels are supported by hardware */ - dw->rd_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ); - if (!dw->rd_ch_cnt) - return -EINVAL; - } + dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt, + dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ)); + dw->rd_ch_cnt = min_t(u16, dw->rd_ch_cnt, EDMA_MAX_RD_CH); + + if (!dw->wr_ch_cnt && !dw->rd_ch_cnt) + return -EINVAL; dev_vdbg(dev, "Channels:\twrite=%d, read=%d\n", dw->wr_ch_cnt, dw->rd_ch_cnt); diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h index f72ebaa118af..650b1c7d30b7 100644 --- a/drivers/dma/dw-edma/dw-edma-core.h +++ b/drivers/dma/dw-edma/dw-edma-core.h @@ -15,6 +15,8 @@ #include "../virt-dma.h" #define EDMA_LL_SZ 24 +#define EDMA_MAX_WR_CH 8 +#define EDMA_MAX_RD_CH 8 enum dw_edma_dir { EDMA_DIR_WRITE = 0, From f3167dc16378da4abd4ca19d6700170fcdfd5be7 Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:02 +0100 Subject: [PATCH 0096/1379] dmaengine: dw-edma: Reorder variables to keep consistency In the driver code structure, I tried to keep the code style consistency by writing the write channels instructions first, and then follow by the read channels instructions, mimicking the hardware implementation. However, this code style failed in some cases. This patch fixes that and no functional changes are expected. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/9bd1f86f19df8bb5de502fb85a0c5dc07978a9ba.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-pcie.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c index eb6f8b399cc7..63c62e16e20a 100644 --- a/drivers/dma/dw-edma/dw-edma-pcie.c +++ b/drivers/dma/dw-edma/dw-edma-pcie.c @@ -20,8 +20,8 @@ #define DW_PCIE_VSEC_DMA_ID 0x6 #define DW_PCIE_VSEC_DMA_BAR GENMASK(10, 8) #define DW_PCIE_VSEC_DMA_MAP GENMASK(2, 0) -#define DW_PCIE_VSEC_DMA_RD_CH GENMASK(25, 16) #define DW_PCIE_VSEC_DMA_WR_CH GENMASK(9, 0) +#define DW_PCIE_VSEC_DMA_RD_CH GENMASK(25, 16) struct dw_edma_pcie_data { /* eDMA registers location */ @@ -39,8 +39,8 @@ struct dw_edma_pcie_data { /* Other */ enum dw_edma_map_format mf; u8 irqs; - u16 rd_ch_cnt; u16 wr_ch_cnt; + u16 rd_ch_cnt; }; static const struct dw_edma_pcie_data snps_edda_data = { @@ -59,8 +59,8 @@ static const struct dw_edma_pcie_data snps_edda_data = { /* Other */ .mf = EDMA_MF_EDMA_UNROLL, .irqs = 1, - .rd_ch_cnt = 0, .wr_ch_cnt = 0, + .rd_ch_cnt = 0, }; static int dw_edma_pcie_irq_vector(struct device *dev, unsigned int nr) @@ -101,8 +101,8 @@ static void dw_edma_pcie_get_vsec_dma_data(struct pci_dev *pdev, pdata->rg_bar = FIELD_GET(DW_PCIE_VSEC_DMA_BAR, val); pci_read_config_dword(pdev, vsec + 0xc, &val); - pdata->rd_ch_cnt = FIELD_GET(DW_PCIE_VSEC_DMA_RD_CH, val); pdata->wr_ch_cnt = FIELD_GET(DW_PCIE_VSEC_DMA_WR_CH, val); + pdata->rd_ch_cnt = FIELD_GET(DW_PCIE_VSEC_DMA_RD_CH, val); pci_read_config_dword(pdev, vsec + 0x14, &val); off = val; @@ -218,8 +218,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, dw->mf = vsec_data.mf; dw->nr_irqs = nr_irqs; dw->ops = &dw_edma_pcie_core_ops; - dw->rd_ch_cnt = vsec_data.rd_ch_cnt; dw->wr_ch_cnt = vsec_data.wr_ch_cnt; + dw->rd_ch_cnt = vsec_data.rd_ch_cnt; /* Debug info */ if (dw->mf == EDMA_MF_EDMA_LEGACY) From 31fb8c1ff962d93ed5025f39a6a186207c9805eb Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:03 +0100 Subject: [PATCH 0097/1379] dmaengine: dw-edma: Improve the linked list and data blocks definition In the previous implementation, the driver assumed that there existed only two memory spaces that would equally distribute the amount of read/write channels. This might not be the case on some other implementations, therefore this patch change this requirement so that each write/read channel has its own linked list and data space well defined, which allows different sizes and locations. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/2e316cb983f8a1e09ce929029f87619dc92a52de.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 51 ++++---- drivers/dma/dw-edma/dw-edma-core.h | 9 +- drivers/dma/dw-edma/dw-edma-pcie.c | 185 ++++++++++++++++++++--------- 3 files changed, 160 insertions(+), 85 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index cc391070c1bf..552b5f998dbf 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -81,8 +81,13 @@ static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc) * - Even chunks originate CB equal to 1 */ chunk->cb = !(desc->chunks_alloc % 2); - chunk->ll_region.paddr = dw->ll_region.paddr + chan->ll_off; - chunk->ll_region.vaddr = dw->ll_region.vaddr + chan->ll_off; + if (chan->dir == EDMA_DIR_WRITE) { + chunk->ll_region.paddr = dw->ll_region_wr[chan->id].paddr; + chunk->ll_region.vaddr = dw->ll_region_wr[chan->id].vaddr; + } else { + chunk->ll_region.paddr = dw->ll_region_rd[chan->id].paddr; + chunk->ll_region.vaddr = dw->ll_region_rd[chan->id].vaddr; + } if (desc->chunk) { /* Create and add new element into the linked list */ @@ -691,24 +696,13 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write, struct device *dev = chip->dev; struct dw_edma *dw = chip->dw; struct dw_edma_chan *chan; - size_t ll_chunk, dt_chunk; struct dw_edma_irq *irq; struct dma_device *dma; - u32 i, j, cnt, ch_cnt; u32 alloc, off_alloc; + u32 i, j, cnt; int err = 0; u32 pos; - ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt; - ll_chunk = dw->ll_region.sz; - dt_chunk = dw->dt_region.sz; - - /* Calculate linked list chunk for each channel */ - ll_chunk /= roundup_pow_of_two(ch_cnt); - - /* Calculate linked list chunk for each channel */ - dt_chunk /= roundup_pow_of_two(ch_cnt); - if (write) { i = 0; cnt = dw->wr_ch_cnt; @@ -740,14 +734,14 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write, chan->request = EDMA_REQ_NONE; chan->status = EDMA_ST_IDLE; - chan->ll_off = (ll_chunk * i); - chan->ll_max = (ll_chunk / EDMA_LL_SZ) - 1; + if (write) + chan->ll_max = (dw->ll_region_wr[j].sz / EDMA_LL_SZ); + else + chan->ll_max = (dw->ll_region_rd[j].sz / EDMA_LL_SZ); + chan->ll_max -= 1; - chan->dt_off = (dt_chunk * i); - - dev_vdbg(dev, "L. List:\tChannel %s[%u] off=0x%.8lx, max_cnt=%u\n", - write ? "write" : "read", j, - chan->ll_off, chan->ll_max); + dev_vdbg(dev, "L. List:\tChannel %s[%u] max_cnt=%u\n", + write ? "write" : "read", j, chan->ll_max); if (dw->nr_irqs == 1) pos = 0; @@ -772,12 +766,15 @@ static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write, chan->vc.desc_free = vchan_free_desc; vchan_init(&chan->vc, dma); - dt_region->paddr = dw->dt_region.paddr + chan->dt_off; - dt_region->vaddr = dw->dt_region.vaddr + chan->dt_off; - dt_region->sz = dt_chunk; - - dev_vdbg(dev, "Data:\tChannel %s[%u] off=0x%.8lx\n", - write ? "write" : "read", j, chan->dt_off); + if (write) { + dt_region->paddr = dw->dt_region_wr[j].paddr; + dt_region->vaddr = dw->dt_region_wr[j].vaddr; + dt_region->sz = dw->dt_region_wr[j].sz; + } else { + dt_region->paddr = dw->dt_region_rd[j].paddr; + dt_region->vaddr = dw->dt_region_rd[j].vaddr; + dt_region->sz = dw->dt_region_rd[j].sz; + } dw_edma_v0_core_device_config(chan); } diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h index 650b1c7d30b7..cba543655e33 100644 --- a/drivers/dma/dw-edma/dw-edma-core.h +++ b/drivers/dma/dw-edma/dw-edma-core.h @@ -91,11 +91,8 @@ struct dw_edma_chan { int id; enum dw_edma_dir dir; - off_t ll_off; u32 ll_max; - off_t dt_off; - struct msi_msg msi; enum dw_edma_request request; @@ -126,8 +123,10 @@ struct dw_edma { u16 rd_ch_cnt; struct dw_edma_region rg_region; /* Registers */ - struct dw_edma_region ll_region; /* Linked list */ - struct dw_edma_region dt_region; /* Data */ + struct dw_edma_region ll_region_wr[EDMA_MAX_WR_CH]; + struct dw_edma_region ll_region_rd[EDMA_MAX_RD_CH]; + struct dw_edma_region dt_region_wr[EDMA_MAX_WR_CH]; + struct dw_edma_region dt_region_rd[EDMA_MAX_RD_CH]; struct dw_edma_irq *irq; int nr_irqs; diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c index 63c62e16e20a..1055fdbba390 100644 --- a/drivers/dma/dw-edma/dw-edma-pcie.c +++ b/drivers/dma/dw-edma/dw-edma-pcie.c @@ -23,19 +23,28 @@ #define DW_PCIE_VSEC_DMA_WR_CH GENMASK(9, 0) #define DW_PCIE_VSEC_DMA_RD_CH GENMASK(25, 16) +#define DW_BLOCK(a, b, c) \ + { \ + .bar = a, \ + .off = b, \ + .sz = c, \ + }, + +struct dw_edma_block { + enum pci_barno bar; + off_t off; + size_t sz; +}; + struct dw_edma_pcie_data { /* eDMA registers location */ - enum pci_barno rg_bar; - off_t rg_off; - size_t rg_sz; + struct dw_edma_block rg; /* eDMA memory linked list location */ - enum pci_barno ll_bar; - off_t ll_off; - size_t ll_sz; + struct dw_edma_block ll_wr[EDMA_MAX_WR_CH]; + struct dw_edma_block ll_rd[EDMA_MAX_RD_CH]; /* eDMA memory data location */ - enum pci_barno dt_bar; - off_t dt_off; - size_t dt_sz; + struct dw_edma_block dt_wr[EDMA_MAX_WR_CH]; + struct dw_edma_block dt_rd[EDMA_MAX_RD_CH]; /* Other */ enum dw_edma_map_format mf; u8 irqs; @@ -45,22 +54,40 @@ struct dw_edma_pcie_data { static const struct dw_edma_pcie_data snps_edda_data = { /* eDMA registers location */ - .rg_bar = BAR_0, - .rg_off = 0x00001000, /* 4 Kbytes */ - .rg_sz = 0x00002000, /* 8 Kbytes */ + .rg.bar = BAR_0, + .rg.off = 0x00001000, /* 4 Kbytes */ + .rg.sz = 0x00002000, /* 8 Kbytes */ /* eDMA memory linked list location */ - .ll_bar = BAR_2, - .ll_off = 0x00000000, /* 0 Kbytes */ - .ll_sz = 0x00800000, /* 8 Mbytes */ + .ll_wr = { + /* Channel 0 - BAR 2, offset 0 Mbytes, size 2 Mbytes */ + DW_BLOCK(BAR_2, 0x00000000, 0x00200000) + /* Channel 1 - BAR 2, offset 2 Mbytes, size 2 Mbytes */ + DW_BLOCK(BAR_2, 0x00200000, 0x00200000) + }, + .ll_rd = { + /* Channel 0 - BAR 2, offset 4 Mbytes, size 2 Mbytes */ + DW_BLOCK(BAR_2, 0x00400000, 0x00200000) + /* Channel 1 - BAR 2, offset 6 Mbytes, size 2 Mbytes */ + DW_BLOCK(BAR_2, 0x00600000, 0x00200000) + }, /* eDMA memory data location */ - .dt_bar = BAR_2, - .dt_off = 0x00800000, /* 8 Mbytes */ - .dt_sz = 0x03800000, /* 56 Mbytes */ + .dt_wr = { + /* Channel 0 - BAR 2, offset 8 Mbytes, size 14 Mbytes */ + DW_BLOCK(BAR_2, 0x00800000, 0x00e00000) + /* Channel 1 - BAR 2, offset 22 Mbytes, size 14 Mbytes */ + DW_BLOCK(BAR_2, 0x01600000, 0x00e00000) + }, + .dt_rd = { + /* Channel 0 - BAR 2, offset 36 Mbytes, size 14 Mbytes */ + DW_BLOCK(BAR_2, 0x02400000, 0x00e00000) + /* Channel 1 - BAR 2, offset 50 Mbytes, size 14 Mbytes */ + DW_BLOCK(BAR_2, 0x03200000, 0x00e00000) + }, /* Other */ .mf = EDMA_MF_EDMA_UNROLL, .irqs = 1, - .wr_ch_cnt = 0, - .rd_ch_cnt = 0, + .wr_ch_cnt = 2, + .rd_ch_cnt = 2, }; static int dw_edma_pcie_irq_vector(struct device *dev, unsigned int nr) @@ -98,18 +125,20 @@ static void dw_edma_pcie_get_vsec_dma_data(struct pci_dev *pdev, return; pdata->mf = map; - pdata->rg_bar = FIELD_GET(DW_PCIE_VSEC_DMA_BAR, val); + pdata->rg.bar = FIELD_GET(DW_PCIE_VSEC_DMA_BAR, val); pci_read_config_dword(pdev, vsec + 0xc, &val); - pdata->wr_ch_cnt = FIELD_GET(DW_PCIE_VSEC_DMA_WR_CH, val); - pdata->rd_ch_cnt = FIELD_GET(DW_PCIE_VSEC_DMA_RD_CH, val); + pdata->wr_ch_cnt = min_t(u16, pdata->wr_ch_cnt, + FIELD_GET(DW_PCIE_VSEC_DMA_WR_CH, val)); + pdata->rd_ch_cnt = min_t(u16, pdata->rd_ch_cnt, + FIELD_GET(DW_PCIE_VSEC_DMA_RD_CH, val)); pci_read_config_dword(pdev, vsec + 0x14, &val); off = val; pci_read_config_dword(pdev, vsec + 0x10, &val); off <<= 32; off |= val; - pdata->rg_off = off; + pdata->rg.off = off; } static int dw_edma_pcie_probe(struct pci_dev *pdev, @@ -121,6 +150,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, struct dw_edma_chip *chip; struct dw_edma *dw; int err, nr_irqs; + int i, mask; /* Enable PCI device */ err = pcim_enable_device(pdev); @@ -138,10 +168,16 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, dw_edma_pcie_get_vsec_dma_data(pdev, &vsec_data); /* Mapping PCI BAR regions */ - err = pcim_iomap_regions(pdev, BIT(vsec_data.rg_bar) | - BIT(vsec_data.ll_bar) | - BIT(vsec_data.dt_bar), - pci_name(pdev)); + mask = BIT(vsec_data.rg.bar); + for (i = 0; i < vsec_data.wr_ch_cnt; i++) { + mask |= BIT(vsec_data.ll_wr[i].bar); + mask |= BIT(vsec_data.dt_wr[i].bar); + } + for (i = 0; i < vsec_data.rd_ch_cnt; i++) { + mask |= BIT(vsec_data.ll_rd[i].bar); + mask |= BIT(vsec_data.dt_rd[i].bar); + } + err = pcim_iomap_regions(pdev, mask, pci_name(pdev)); if (err) { pci_err(pdev, "eDMA BAR I/O remapping failed\n"); return err; @@ -197,30 +233,56 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, chip->id = pdev->devfn; chip->irq = pdev->irq; - dw->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg_bar]; - dw->rg_region.vaddr += vsec_data.rg_off; - dw->rg_region.paddr = pdev->resource[vsec_data.rg_bar].start; - dw->rg_region.paddr += vsec_data.rg_off; - dw->rg_region.sz = vsec_data.rg_sz; - - dw->ll_region.vaddr = pcim_iomap_table(pdev)[vsec_data.ll_bar]; - dw->ll_region.vaddr += vsec_data.ll_off; - dw->ll_region.paddr = pdev->resource[vsec_data.ll_bar].start; - dw->ll_region.paddr += vsec_data.ll_off; - dw->ll_region.sz = vsec_data.ll_sz; - - dw->dt_region.vaddr = pcim_iomap_table(pdev)[vsec_data.dt_bar]; - dw->dt_region.vaddr += vsec_data.dt_off; - dw->dt_region.paddr = pdev->resource[vsec_data.dt_bar].start; - dw->dt_region.paddr += vsec_data.dt_off; - dw->dt_region.sz = vsec_data.dt_sz; - dw->mf = vsec_data.mf; dw->nr_irqs = nr_irqs; dw->ops = &dw_edma_pcie_core_ops; dw->wr_ch_cnt = vsec_data.wr_ch_cnt; dw->rd_ch_cnt = vsec_data.rd_ch_cnt; + dw->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg.bar]; + dw->rg_region.vaddr += vsec_data.rg.off; + dw->rg_region.paddr = pdev->resource[vsec_data.rg.bar].start; + dw->rg_region.paddr += vsec_data.rg.off; + dw->rg_region.sz = vsec_data.rg.sz; + + for (i = 0; i < dw->wr_ch_cnt; i++) { + struct dw_edma_region *ll_region = &dw->ll_region_wr[i]; + struct dw_edma_region *dt_region = &dw->dt_region_wr[i]; + struct dw_edma_block *ll_block = &vsec_data.ll_wr[i]; + struct dw_edma_block *dt_block = &vsec_data.dt_wr[i]; + + ll_region->vaddr = pcim_iomap_table(pdev)[ll_block->bar]; + ll_region->vaddr += ll_block->off; + ll_region->paddr = pdev->resource[ll_block->bar].start; + ll_region->paddr += ll_block->off; + ll_region->sz = ll_block->sz; + + dt_region->vaddr = pcim_iomap_table(pdev)[dt_block->bar]; + dt_region->vaddr += dt_block->off; + dt_region->paddr = pdev->resource[dt_block->bar].start; + dt_region->paddr += dt_block->off; + dt_region->sz = dt_block->sz; + } + + for (i = 0; i < dw->rd_ch_cnt; i++) { + struct dw_edma_region *ll_region = &dw->ll_region_rd[i]; + struct dw_edma_region *dt_region = &dw->dt_region_rd[i]; + struct dw_edma_block *ll_block = &vsec_data.ll_rd[i]; + struct dw_edma_block *dt_block = &vsec_data.dt_rd[i]; + + ll_region->vaddr = pcim_iomap_table(pdev)[ll_block->bar]; + ll_region->vaddr += ll_block->off; + ll_region->paddr = pdev->resource[ll_block->bar].start; + ll_region->paddr += ll_block->off; + ll_region->sz = ll_block->sz; + + dt_region->vaddr = pcim_iomap_table(pdev)[dt_block->bar]; + dt_region->vaddr += dt_block->off; + dt_region->paddr = pdev->resource[dt_block->bar].start; + dt_region->paddr += dt_block->off; + dt_region->sz = dt_block->sz; + } + /* Debug info */ if (dw->mf == EDMA_MF_EDMA_LEGACY) pci_dbg(pdev, "Version:\teDMA Port Logic (0x%x)\n", dw->mf); @@ -232,16 +294,33 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, pci_dbg(pdev, "Version:\tUnknown (0x%x)\n", dw->mf); pci_dbg(pdev, "Registers:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", - vsec_data.rg_bar, vsec_data.rg_off, vsec_data.rg_sz, + vsec_data.rg.bar, vsec_data.rg.off, vsec_data.rg.sz, dw->rg_region.vaddr, &dw->rg_region.paddr); - pci_dbg(pdev, "L. List:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", - vsec_data.ll_bar, vsec_data.ll_off, vsec_data.ll_sz, - dw->ll_region.vaddr, &dw->ll_region.paddr); - pci_dbg(pdev, "Data:\tBAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", - vsec_data.dt_bar, vsec_data.dt_off, vsec_data.dt_sz, - dw->dt_region.vaddr, &dw->dt_region.paddr); + for (i = 0; i < dw->wr_ch_cnt; i++) { + pci_dbg(pdev, "L. List:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", + i, vsec_data.ll_wr[i].bar, + vsec_data.ll_wr[i].off, dw->ll_region_wr[i].sz, + dw->ll_region_wr[i].vaddr, &dw->ll_region_wr[i].paddr); + + pci_dbg(pdev, "Data:\tWRITE CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", + i, vsec_data.dt_wr[i].bar, + vsec_data.dt_wr[i].off, dw->dt_region_wr[i].sz, + dw->dt_region_wr[i].vaddr, &dw->dt_region_wr[i].paddr); + } + + for (i = 0; i < dw->rd_ch_cnt; i++) { + pci_dbg(pdev, "L. List:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", + i, vsec_data.ll_rd[i].bar, + vsec_data.ll_rd[i].off, dw->ll_region_rd[i].sz, + dw->ll_region_rd[i].vaddr, &dw->ll_region_rd[i].paddr); + + pci_dbg(pdev, "Data:\tREAD CH%.2u, BAR=%u, off=0x%.8lx, sz=0x%zx bytes, addr(v=%p, p=%pa)\n", + i, vsec_data.dt_rd[i].bar, + vsec_data.dt_rd[i].off, dw->dt_region_rd[i].sz, + dw->dt_region_rd[i].vaddr, &dw->dt_region_rd[i].paddr); + } pci_dbg(pdev, "Nr. IRQs:\t%u\n", dw->nr_irqs); From da6e0dd54135e51ca858ee231674ba93ca4ba89f Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:04 +0100 Subject: [PATCH 0098/1379] dmaengine: dw-edma: Change linked list and data blocks offset and sizes Changes the linked list and data blocks offset and sizes to follow the recommendation given by the hardware team for the IPK solution. Although the previous data blocks offset and sizes are still valid and functional, using them that might present some issues related to the IPK solution, since this solution is based on FPGA and might be subjected to timmings constrains. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/f682e7f7f06dc6b2efdd431481d6fb4d762c2c05.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-pcie.c | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c index 1055fdbba390..fa66e0380e6d 100644 --- a/drivers/dma/dw-edma/dw-edma-pcie.c +++ b/drivers/dma/dw-edma/dw-edma-pcie.c @@ -59,29 +59,29 @@ static const struct dw_edma_pcie_data snps_edda_data = { .rg.sz = 0x00002000, /* 8 Kbytes */ /* eDMA memory linked list location */ .ll_wr = { - /* Channel 0 - BAR 2, offset 0 Mbytes, size 2 Mbytes */ - DW_BLOCK(BAR_2, 0x00000000, 0x00200000) - /* Channel 1 - BAR 2, offset 2 Mbytes, size 2 Mbytes */ - DW_BLOCK(BAR_2, 0x00200000, 0x00200000) + /* Channel 0 - BAR 2, offset 0 Mbytes, size 2 Kbytes */ + DW_BLOCK(BAR_2, 0x00000000, 0x00000800) + /* Channel 1 - BAR 2, offset 2 Mbytes, size 2 Kbytes */ + DW_BLOCK(BAR_2, 0x00200000, 0x00000800) }, .ll_rd = { - /* Channel 0 - BAR 2, offset 4 Mbytes, size 2 Mbytes */ - DW_BLOCK(BAR_2, 0x00400000, 0x00200000) - /* Channel 1 - BAR 2, offset 6 Mbytes, size 2 Mbytes */ - DW_BLOCK(BAR_2, 0x00600000, 0x00200000) + /* Channel 0 - BAR 2, offset 4 Mbytes, size 2 Kbytes */ + DW_BLOCK(BAR_2, 0x00400000, 0x00000800) + /* Channel 1 - BAR 2, offset 6 Mbytes, size 2 Kbytes */ + DW_BLOCK(BAR_2, 0x00600000, 0x00000800) }, /* eDMA memory data location */ .dt_wr = { - /* Channel 0 - BAR 2, offset 8 Mbytes, size 14 Mbytes */ - DW_BLOCK(BAR_2, 0x00800000, 0x00e00000) - /* Channel 1 - BAR 2, offset 22 Mbytes, size 14 Mbytes */ - DW_BLOCK(BAR_2, 0x01600000, 0x00e00000) + /* Channel 0 - BAR 2, offset 8 Mbytes, size 2 Kbytes */ + DW_BLOCK(BAR_2, 0x00800000, 0x00000800) + /* Channel 1 - BAR 2, offset 9 Mbytes, size 2 Kbytes */ + DW_BLOCK(BAR_2, 0x00900000, 0x00000800) }, .dt_rd = { - /* Channel 0 - BAR 2, offset 36 Mbytes, size 14 Mbytes */ - DW_BLOCK(BAR_2, 0x02400000, 0x00e00000) - /* Channel 1 - BAR 2, offset 50 Mbytes, size 14 Mbytes */ - DW_BLOCK(BAR_2, 0x03200000, 0x00e00000) + /* Channel 0 - BAR 2, offset 10 Mbytes, size 2 Kbytes */ + DW_BLOCK(BAR_2, 0x00a00000, 0x00000800) + /* Channel 1 - BAR 2, offset 11 Mbytes, size 2 Kbytes */ + DW_BLOCK(BAR_2, 0x00b00000, 0x00000800) }, /* Other */ .mf = EDMA_MF_EDMA_UNROLL, From 5244ac2e2e34d1f558ae690f10882ebe42365ea2 Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:05 +0100 Subject: [PATCH 0099/1379] dmaengine: dw-edma: Move struct dentry variable from static definition into dw_edma struct Move struct dentry variable from static definition (dw-edma-v0-debugfs.c) into dw_edma struct (dw-edma-core.h) Also the variable was renamed from base_dir to debugfs. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/07c1167b671e7b175700e2e7061cf0b3dd8c6adb.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 2 +- drivers/dma/dw-edma/dw-edma-core.h | 3 +++ drivers/dma/dw-edma/dw-edma-v0-core.c | 4 ++-- drivers/dma/dw-edma/dw-edma-v0-core.h | 2 +- drivers/dma/dw-edma/dw-edma-v0-debugfs.c | 22 +++++++++++++--------- drivers/dma/dw-edma/dw-edma-v0-debugfs.h | 4 ++-- 6 files changed, 22 insertions(+), 15 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index 552b5f998dbf..bc94c8dfd9fd 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -1003,7 +1003,7 @@ int dw_edma_remove(struct dw_edma_chip *chip) dma_async_device_unregister(&dw->rd_edma); /* Turn debugfs off */ - dw_edma_v0_core_debugfs_off(); + dw_edma_v0_core_debugfs_off(chip); return 0; } diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h index cba543655e33..60316d408c3e 100644 --- a/drivers/dma/dw-edma/dw-edma-core.h +++ b/drivers/dma/dw-edma/dw-edma-core.h @@ -137,6 +137,9 @@ struct dw_edma { const struct dw_edma_core_ops *ops; raw_spinlock_t lock; /* Only for legacy */ +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs; +#endif /* CONFIG_DEBUG_FS */ }; struct dw_edma_sg { diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c index 5b0541a3a094..329fc2e57b70 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-core.c +++ b/drivers/dma/dw-edma/dw-edma-v0-core.c @@ -506,7 +506,7 @@ void dw_edma_v0_core_debugfs_on(struct dw_edma_chip *chip) dw_edma_v0_debugfs_on(chip); } -void dw_edma_v0_core_debugfs_off(void) +void dw_edma_v0_core_debugfs_off(struct dw_edma_chip *chip) { - dw_edma_v0_debugfs_off(); + dw_edma_v0_debugfs_off(chip); } diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.h b/drivers/dma/dw-edma/dw-edma-v0-core.h index abae1527f1f9..2afa626b8300 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-core.h +++ b/drivers/dma/dw-edma/dw-edma-v0-core.h @@ -23,6 +23,6 @@ void dw_edma_v0_core_start(struct dw_edma_chunk *chunk, bool first); int dw_edma_v0_core_device_config(struct dw_edma_chan *chan); /* eDMA debug fs callbacks */ void dw_edma_v0_core_debugfs_on(struct dw_edma_chip *chip); -void dw_edma_v0_core_debugfs_off(void); +void dw_edma_v0_core_debugfs_off(struct dw_edma_chip *chip); #endif /* _DW_EDMA_V0_CORE_H */ diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c index 157dfc2e2430..4b3bcffd15ef 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.c +++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.c @@ -38,7 +38,6 @@ #define CHANNEL_STR "channel" #define REGISTERS_STR "registers" -static struct dentry *base_dir; static struct dw_edma *dw; static struct dw_edma_v0_regs __iomem *regs; @@ -272,7 +271,7 @@ static void dw_edma_debugfs_regs(void) struct dentry *regs_dir; int nr_entries; - regs_dir = debugfs_create_dir(REGISTERS_STR, base_dir); + regs_dir = debugfs_create_dir(REGISTERS_STR, dw->debugfs); if (!regs_dir) return; @@ -293,18 +292,23 @@ void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip) if (!regs) return; - base_dir = debugfs_create_dir(dw->name, NULL); - if (!base_dir) + dw->debugfs = debugfs_create_dir(dw->name, NULL); + if (!dw->debugfs) return; - debugfs_create_u32("mf", 0444, base_dir, &dw->mf); - debugfs_create_u16("wr_ch_cnt", 0444, base_dir, &dw->wr_ch_cnt); - debugfs_create_u16("rd_ch_cnt", 0444, base_dir, &dw->rd_ch_cnt); + debugfs_create_u32("mf", 0444, dw->debugfs, &dw->mf); + debugfs_create_u16("wr_ch_cnt", 0444, dw->debugfs, &dw->wr_ch_cnt); + debugfs_create_u16("rd_ch_cnt", 0444, dw->debugfs, &dw->rd_ch_cnt); dw_edma_debugfs_regs(); } -void dw_edma_v0_debugfs_off(void) +void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip) { - debugfs_remove_recursive(base_dir); + dw = chip->dw; + if (!dw) + return; + + debugfs_remove_recursive(dw->debugfs); + dw->debugfs = NULL; } diff --git a/drivers/dma/dw-edma/dw-edma-v0-debugfs.h b/drivers/dma/dw-edma/dw-edma-v0-debugfs.h index 5450a0a94193..d0ff25a9ea5c 100644 --- a/drivers/dma/dw-edma/dw-edma-v0-debugfs.h +++ b/drivers/dma/dw-edma/dw-edma-v0-debugfs.h @@ -13,13 +13,13 @@ #ifdef CONFIG_DEBUG_FS void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip); -void dw_edma_v0_debugfs_off(void); +void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip); #else static inline void dw_edma_v0_debugfs_on(struct dw_edma_chip *chip) { } -static inline void dw_edma_v0_debugfs_off(void) +static inline void dw_edma_v0_debugfs_off(struct dw_edma_chip *chip) { } #endif /* CONFIG_DEBUG_FS */ From e970dcc4bd8e0a1376e794fc81d41d0fc98262dd Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:06 +0100 Subject: [PATCH 0100/1379] dmaengine: dw-edma: Fix crash on loading/unloading driver When the driver is compiled as a module and loaded if we try to unload it, the Kernel shows a crash log. This Kernel crash is due to the dma_async_device_unregister() call done after deleting the channels, this patch fixes this issue. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/4aa850c035cf7ee488f1d3fb6dee0e37be0dce0a.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index bc94c8dfd9fd..a26d0d578a27 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -986,21 +986,20 @@ int dw_edma_remove(struct dw_edma_chip *chip) /* Power management */ pm_runtime_disable(dev); - list_for_each_entry_safe(chan, _chan, &dw->wr_edma.channels, - vc.chan.device_node) { - list_del(&chan->vc.chan.device_node); - tasklet_kill(&chan->vc.task); - } - - list_for_each_entry_safe(chan, _chan, &dw->rd_edma.channels, - vc.chan.device_node) { - list_del(&chan->vc.chan.device_node); - tasklet_kill(&chan->vc.task); - } - /* Deregister eDMA device */ dma_async_device_unregister(&dw->wr_edma); + list_for_each_entry_safe(chan, _chan, &dw->wr_edma.channels, + vc.chan.device_node) { + tasklet_kill(&chan->vc.task); + list_del(&chan->vc.chan.device_node); + } + dma_async_device_unregister(&dw->rd_edma); + list_for_each_entry_safe(chan, _chan, &dw->rd_edma.channels, + vc.chan.device_node) { + tasklet_kill(&chan->vc.task); + list_del(&chan->vc.chan.device_node); + } /* Turn debugfs off */ dw_edma_v0_core_debugfs_off(chip); From cb498d7f3b086bcfab642e38aa80600dcebe0e0a Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:07 +0100 Subject: [PATCH 0101/1379] dmaengine: dw-edma: Change DMA abbreviation from lower into upper case To keep code consistent, some comments with dma keyword written in lower case are now in upper case. Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/8c4b3db90767972a2b4cbb6fa818cf0e9c3d6fe3.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index a26d0d578a27..0793df12401f 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -341,15 +341,15 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) return NULL; switch (chan->config.direction) { - case DMA_DEV_TO_MEM: /* local dma */ + case DMA_DEV_TO_MEM: /* local DMA */ if (dir == DMA_DEV_TO_MEM && chan->dir == EDMA_DIR_READ) break; return NULL; - case DMA_MEM_TO_DEV: /* local dma */ + case DMA_MEM_TO_DEV: /* local DMA */ if (dir == DMA_MEM_TO_DEV && chan->dir == EDMA_DIR_WRITE) break; return NULL; - default: /* remote dma */ + default: /* remote DMA */ if (dir == DMA_MEM_TO_DEV && chan->dir == EDMA_DIR_READ) break; if (dir == DMA_DEV_TO_MEM && chan->dir == EDMA_DIR_WRITE) From b671d098a97f59d6056b34e0a759ec976325c216 Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:08 +0100 Subject: [PATCH 0102/1379] dmaengine: dw-edma: Revert fix scatter-gather address calculation Reverting the applied patch because it caused a regression on ARC700 platform (32 bits). Fixes: 05655541c950 ("dmaengine: dw-edma: Fix scatter-gather address calculation") Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/1778422e389fe40032e216b59b1b992c61ec9887.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index 0793df12401f..53289927dd0d 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -429,7 +429,8 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) if (xfer->type == EDMA_XFER_CYCLIC) { burst->dar = xfer->xfer.cyclic.paddr; } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) { - burst->dar = dst_addr; + src_addr += sg_dma_len(sg); + burst->dar = sg_dma_address(sg); /* Unlike the typical assumption by other * drivers/IPs the peripheral memory isn't * a FIFO memory, in this case, it's a @@ -443,7 +444,8 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) if (xfer->type == EDMA_XFER_CYCLIC) { burst->sar = xfer->xfer.cyclic.paddr; } else if (xfer->type == EDMA_XFER_SCATTER_GATHER) { - burst->sar = src_addr; + dst_addr += sg_dma_len(sg); + burst->sar = sg_dma_address(sg); /* Unlike the typical assumption by other * drivers/IPs the peripheral memory isn't * a FIFO memory, in this case, it's a @@ -455,8 +457,6 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) } if (xfer->type == EDMA_XFER_SCATTER_GATHER) { - src_addr += sg_dma_len(sg); - dst_addr += sg_dma_len(sg); sg = sg_next(sg); } else if (xfer->type == EDMA_XFER_INTERLEAVED && xfer->xfer.il->frame_size > 0) { From 84b0aa2e0d91d7974c6cfcb3a1ce230e7366293e Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Thu, 18 Feb 2021 20:04:09 +0100 Subject: [PATCH 0103/1379] dmaengine: dw-edma: Add pcim_iomap_table return check Currently, is missing a null check on a pcim_iomap_table() return value and this can lead to a null pointer dereference if the desired BAR wasn't mapped previously. Fix this by adding a null check and returning -ENOMEM. Addresses-Coverity: ("Dereference null return") Signed-off-by: Gustavo Pimentel Link: https://lore.kernel.org/r/bc5e6b8632c84660bb6dae454980e9419992ed14.1613674948.git.gustavo.pimentel@synopsys.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-pcie.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c index fa66e0380e6d..44f6e09bdb53 100644 --- a/drivers/dma/dw-edma/dw-edma-pcie.c +++ b/drivers/dma/dw-edma/dw-edma-pcie.c @@ -240,6 +240,9 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, dw->rd_ch_cnt = vsec_data.rd_ch_cnt; dw->rg_region.vaddr = pcim_iomap_table(pdev)[vsec_data.rg.bar]; + if (!dw->rg_region.vaddr) + return -ENOMEM; + dw->rg_region.vaddr += vsec_data.rg.off; dw->rg_region.paddr = pdev->resource[vsec_data.rg.bar].start; dw->rg_region.paddr += vsec_data.rg.off; @@ -252,12 +255,18 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, struct dw_edma_block *dt_block = &vsec_data.dt_wr[i]; ll_region->vaddr = pcim_iomap_table(pdev)[ll_block->bar]; + if (!ll_region->vaddr) + return -ENOMEM; + ll_region->vaddr += ll_block->off; ll_region->paddr = pdev->resource[ll_block->bar].start; ll_region->paddr += ll_block->off; ll_region->sz = ll_block->sz; dt_region->vaddr = pcim_iomap_table(pdev)[dt_block->bar]; + if (!dt_region->vaddr) + return -ENOMEM; + dt_region->vaddr += dt_block->off; dt_region->paddr = pdev->resource[dt_block->bar].start; dt_region->paddr += dt_block->off; @@ -271,12 +280,18 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, struct dw_edma_block *dt_block = &vsec_data.dt_rd[i]; ll_region->vaddr = pcim_iomap_table(pdev)[ll_block->bar]; + if (!ll_region->vaddr) + return -ENOMEM; + ll_region->vaddr += ll_block->off; ll_region->paddr = pdev->resource[ll_block->bar].start; ll_region->paddr += ll_block->off; ll_region->sz = ll_block->sz; dt_region->vaddr = pcim_iomap_table(pdev)[dt_block->bar]; + if (!dt_region->vaddr) + return -ENOMEM; + dt_region->vaddr += dt_block->off; dt_region->paddr = pdev->resource[dt_block->bar].start; dt_region->paddr += dt_block->off; From 9906aa5bd6f5a13c9c5488d5426893a7b38b550f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:23 +0100 Subject: [PATCH 0104/1379] powerpc/svm: stop using io_tlb_start Use the local variable that is passed to swiotlb_init_with_tbl for freeing the memory in the failure case to isolate the code a little better from swiotlb internals. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- arch/powerpc/platforms/pseries/svm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 7b739cc7a8a9..1d829e257996 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -55,9 +55,9 @@ void __init svm_swiotlb_init(void) if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false)) return; - if (io_tlb_start) - memblock_free_early(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + + memblock_free_early(__pa(vstart), + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); panic("SVM: Cannot allocate SWIOTLB buffer"); } From 2973073a80b46daebc352c31d09d95d16cf6876e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:24 +0100 Subject: [PATCH 0105/1379] swiotlb: remove the alloc_size parameter to swiotlb_tbl_unmap_single Now that swiotlb remembers the allocation size there is no need to pass it back to swiotlb_tbl_unmap_single. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/iommu/dma-iommu.c | 11 +++------- drivers/xen/swiotlb-xen.c | 4 ++-- include/linux/swiotlb.h | 1 - kernel/dma/direct.h | 2 +- kernel/dma/swiotlb.c | 45 ++++++++++++++++++++------------------- 5 files changed, 29 insertions(+), 34 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index af765c813cc8..9149597410e2 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -499,8 +499,6 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, unsigned long attrs) { struct iommu_domain *domain = iommu_get_dma_domain(dev); - struct iommu_dma_cookie *cookie = domain->iova_cookie; - struct iova_domain *iovad = &cookie->iovad; phys_addr_t phys; phys = iommu_iova_to_phys(domain, dma_addr); @@ -510,8 +508,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr, __iommu_dma_unmap(dev, dma_addr, size); if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, - iova_align(iovad, size), dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, @@ -581,10 +578,8 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, } iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask); - if ((iova == DMA_MAPPING_ERROR) && is_swiotlb_buffer(phys)) - swiotlb_tbl_unmap_single(dev, phys, org_size, - aligned_size, dir, attrs); - + if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys)) + swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs); return iova; } diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 2b385c1b4a99..d47f1b311caa 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -406,7 +406,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, * Ensure that the address returned is DMA'ble */ if (unlikely(!dma_capable(dev, dev_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, map, size, size, dir, + swiotlb_tbl_unmap_single(dev, map, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return DMA_MAPPING_ERROR; } @@ -445,7 +445,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, /* NOTE: We use dev_addr here, not paddr! */ if (is_xen_swiotlb_buffer(hwdev, dev_addr)) - swiotlb_tbl_unmap_single(hwdev, paddr, size, size, dir, attrs); + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); } static void diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 5857a937c637..59f421d041ed 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -57,7 +57,6 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, size_t mapping_size, - size_t alloc_size, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index b98615578737..e1bf721591c0 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -114,6 +114,6 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(phys))) - swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } #endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index c10e855a03bc..03aa614565e4 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -102,7 +102,7 @@ static phys_addr_t *io_tlb_orig_addr; /* * The mapped buffer's size should be validated during a sync operation. */ -static size_t *io_tlb_orig_size; +static size_t *io_tlb_alloc_size; /* * Protect the above data structures in the map and unmap calls @@ -253,15 +253,15 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) __func__, alloc_size, PAGE_SIZE); alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); - io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_orig_size) + io_tlb_alloc_size = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_alloc_size) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + io_tlb_alloc_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -393,18 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!io_tlb_orig_addr) goto cleanup4; - io_tlb_orig_size = (size_t *) + io_tlb_alloc_size = (size_t *) __get_free_pages(GFP_KERNEL, get_order(io_tlb_nslabs * sizeof(size_t))); - if (!io_tlb_orig_size) + if (!io_tlb_alloc_size) goto cleanup5; for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + io_tlb_alloc_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -436,7 +436,7 @@ void __init swiotlb_exit(void) return; if (late_alloc) { - free_pages((unsigned long)io_tlb_orig_size, + free_pages((unsigned long)io_tlb_alloc_size, get_order(io_tlb_nslabs * sizeof(size_t))); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); @@ -447,7 +447,7 @@ void __init swiotlb_exit(void) } else { memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - memblock_free_late(__pa(io_tlb_orig_size), + memblock_free_late(__pa(io_tlb_alloc_size), PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); memblock_free_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); @@ -639,7 +639,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, */ for (i = 0; i < nr_slots(alloc_size + offset); i++) { io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + io_tlb_alloc_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); } tlb_addr = slot_addr(io_tlb_start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && @@ -648,14 +648,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, return tlb_addr; } -static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size) +static void validate_sync_size_and_truncate(struct device *hwdev, size_t alloc_size, size_t *size) { - if (*size > orig_size) { + if (*size > alloc_size) { /* Warn and truncate mapping_size */ dev_WARN_ONCE(hwdev, 1, "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", - orig_size, *size); - *size = orig_size; + alloc_size, *size); + *size = alloc_size; } } @@ -663,16 +663,17 @@ static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_si * tlb_addr is the physical address of the bounce buffer to unmap. */ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t mapping_size, size_t alloc_size, - enum dma_data_direction dir, unsigned long attrs) + size_t mapping_size, enum dma_data_direction dir, + unsigned long attrs) { unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); - int i, count, nslots = nr_slots(alloc_size + offset); int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; + size_t alloc_size = io_tlb_alloc_size[index]; + int i, count, nslots = nr_slots(alloc_size + offset); - validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size); + validate_sync_size_and_truncate(hwdev, alloc_size, &mapping_size); /* * First, sync the memory before unmapping the entry @@ -701,7 +702,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, for (i = index + nslots - 1; i >= index; i--) { io_tlb_list[i] = ++count; io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_orig_size[i] = 0; + io_tlb_alloc_size[i] = 0; } /* @@ -721,13 +722,13 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_sync_target target) { int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - size_t orig_size = io_tlb_orig_size[index]; + size_t alloc_size = io_tlb_alloc_size[index]; phys_addr_t orig_addr = io_tlb_orig_addr[index]; if (orig_addr == INVALID_PHYS_ADDR) return; - validate_sync_size_and_truncate(hwdev, orig_size, &size); + validate_sync_size_and_truncate(hwdev, alloc_size, &size); switch (target) { case SYNC_FOR_CPU: @@ -770,7 +771,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, /* Ensure that the address returned is DMA'ble */ dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr); if (unlikely(!dma_capable(dev, dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); dev_WARN_ONCE(dev, 1, "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", From 2bdba622c351259317b0294c6e9fe243b2404316 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:25 +0100 Subject: [PATCH 0106/1379] swiotlb: move orig addr and size validation into swiotlb_bounce Move the code to find and validate the original buffer address and size from the callers into swiotlb_bounce. This means a tiny bit of extra work in the swiotlb_map path, but avoids code duplication and a leads to a better code structure. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 59 +++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 03aa614565e4..a431c6b64e82 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -460,12 +460,25 @@ void __init swiotlb_exit(void) /* * Bounce: copy the swiotlb buffer from or back to the original dma location */ -static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) +static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, + enum dma_data_direction dir) { + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + size_t alloc_size = io_tlb_alloc_size[index]; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); + if (orig_addr == INVALID_PHYS_ADDR) + return; + + if (size > alloc_size) { + dev_WARN_ONCE(dev, 1, + "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", + alloc_size, size); + size = alloc_size; + } + if (PageHighMem(pfn_to_page(pfn))) { /* The buffer does not have a mapping. Map it in and copy */ unsigned int offset = orig_addr & ~PAGE_MASK; @@ -644,21 +657,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, tlb_addr = slot_addr(io_tlb_start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -static void validate_sync_size_and_truncate(struct device *hwdev, size_t alloc_size, size_t *size) -{ - if (*size > alloc_size) { - /* Warn and truncate mapping_size */ - dev_WARN_ONCE(hwdev, 1, - "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", - alloc_size, *size); - *size = alloc_size; - } -} - /* * tlb_addr is the physical address of the bounce buffer to unmap. */ @@ -669,19 +671,15 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - size_t alloc_size = io_tlb_alloc_size[index]; - int i, count, nslots = nr_slots(alloc_size + offset); - - validate_sync_size_and_truncate(hwdev, alloc_size, &mapping_size); + int nslots = nr_slots(io_tlb_alloc_size[index] + offset); + int count, i; /* * First, sync the memory before unmapping the entry */ - if (orig_addr != INVALID_PHYS_ADDR && - !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE); /* * Return the buffer to the free list by setting the corresponding @@ -721,27 +719,16 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir, enum dma_sync_target target) { - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - size_t alloc_size = io_tlb_alloc_size[index]; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - if (orig_addr == INVALID_PHYS_ADDR) - return; - - validate_sync_size_and_truncate(hwdev, alloc_size, &size); - switch (target) { case SYNC_FOR_CPU: if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_FROM_DEVICE); + swiotlb_bounce(hwdev, tlb_addr, size, DMA_FROM_DEVICE); else BUG_ON(dir != DMA_TO_DEVICE); break; case SYNC_FOR_DEVICE: if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); + swiotlb_bounce(hwdev, tlb_addr, size, DMA_TO_DEVICE); else BUG_ON(dir != DMA_FROM_DEVICE); break; From 80808d273a3f075196d1a26463f65d4c9d2891c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:26 +0100 Subject: [PATCH 0107/1379] swiotlb: split swiotlb_tbl_sync_single Split swiotlb_tbl_sync_single into two separate funtions for the to device and to cpu synchronization. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/iommu/dma-iommu.c | 12 ++++++------ drivers/xen/swiotlb-xen.c | 4 ++-- include/linux/swiotlb.h | 17 ++++------------- kernel/dma/direct.c | 8 ++++---- kernel/dma/direct.h | 4 ++-- kernel/dma/swiotlb.c | 34 +++++++++++++++------------------- 6 files changed, 33 insertions(+), 46 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 9149597410e2..3087d9fa6065 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -750,7 +750,7 @@ static void iommu_dma_sync_single_for_cpu(struct device *dev, arch_sync_dma_for_cpu(phys, size, dir); if (is_swiotlb_buffer(phys)) - swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, phys, size, dir); } static void iommu_dma_sync_single_for_device(struct device *dev, @@ -763,7 +763,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev, phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); if (is_swiotlb_buffer(phys)) - swiotlb_tbl_sync_single(dev, phys, size, dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, phys, size, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(phys, size, dir); @@ -784,8 +784,8 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); if (is_swiotlb_buffer(sg_phys(sg))) - swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, - dir, SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, sg_phys(sg), + sg->length, dir); } } @@ -801,8 +801,8 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, for_each_sg(sgl, sg, nelems, i) { if (is_swiotlb_buffer(sg_phys(sg))) - swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, - dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, sg_phys(sg), + sg->length, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index d47f1b311caa..4e8a4e14942a 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -462,7 +462,7 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr, } if (is_xen_swiotlb_buffer(dev, dma_addr)) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, size, dir); } static void @@ -472,7 +472,7 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr, phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr); if (is_xen_swiotlb_buffer(dev, dma_addr)) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, size, dir); if (!dev_is_dma_coherent(dev)) { if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 59f421d041ed..0696bdc8072e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -42,14 +42,6 @@ extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); extern int swiotlb_late_init_with_default_size(size_t default_size); extern void __init swiotlb_update_mem_attributes(void); -/* - * Enumeration for sync targets - */ -enum dma_sync_target { - SYNC_FOR_CPU = 0, - SYNC_FOR_DEVICE = 1, -}; - phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs); @@ -60,11 +52,10 @@ extern void swiotlb_tbl_unmap_single(struct device *hwdev, enum dma_data_direction dir, unsigned long attrs); -extern void swiotlb_tbl_sync_single(struct device *hwdev, - phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target); - +void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir); +void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir); dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 002268262c9a..f737e3347059 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -344,8 +344,8 @@ void dma_direct_sync_sg_for_device(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, - dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, sg->length, + dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, sg->length, @@ -370,8 +370,8 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, - SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, sg->length, + dir); if (dir == DMA_FROM_DEVICE) arch_dma_mark_clean(paddr, sg->length); diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index e1bf721591c0..50afc05b6f1d 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -57,7 +57,7 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, addr); if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); + swiotlb_sync_single_for_device(dev, paddr, size, dir); if (!dev_is_dma_coherent(dev)) arch_sync_dma_for_device(paddr, size, dir); @@ -74,7 +74,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, } if (unlikely(is_swiotlb_buffer(paddr))) - swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_CPU); + swiotlb_sync_single_for_cpu(dev, paddr, size, dir); if (dir == DMA_FROM_DEVICE) arch_dma_mark_clean(paddr, size); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index a431c6b64e82..5fe8781be6f2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -715,26 +715,22 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, spin_unlock_irqrestore(&io_tlb_lock, flags); } -void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) +void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) { - switch (target) { - case SYNC_FOR_CPU: - if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(hwdev, tlb_addr, size, DMA_FROM_DEVICE); - else - BUG_ON(dir != DMA_TO_DEVICE); - break; - case SYNC_FOR_DEVICE: - if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(hwdev, tlb_addr, size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); - break; - default: - BUG(); - } + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); +} + +void swiotlb_sync_single_for_cpu(struct device *dev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) +{ + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + swiotlb_bounce(dev, tlb_addr, size, DMA_FROM_DEVICE); + else + BUG_ON(dir != DMA_TO_DEVICE); } /* From 16bc75f3aa3963588b13b01f6dad589f85d9f733 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:27 +0100 Subject: [PATCH 0108/1379] xen-swiotlb: use is_swiotlb_buffer in is_xen_swiotlb_buffer Use the is_swiotlb_buffer to check if a physical address is a swiotlb buffer. This works because xen-swiotlb does use the same buffer as the main swiotlb code, and xen_io_tlb_{start,end} are just the addresses for it that went through phys_to_virt. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 4e8a4e14942a..bffb35993c9d 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -111,10 +111,8 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr) * have the same virtual address as another address * in our domain. Therefore _only_ check address within our domain. */ - if (pfn_valid(PFN_DOWN(paddr))) { - return paddr >= virt_to_phys(xen_io_tlb_start) && - paddr < virt_to_phys(xen_io_tlb_end); - } + if (pfn_valid(PFN_DOWN(paddr))) + return is_swiotlb_buffer(paddr); return 0; } From 6223d1cef7b462de9aeede5846e6dcdbbd1c4d60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:28 +0100 Subject: [PATCH 0109/1379] xen-swiotlb: use io_tlb_end in xen_swiotlb_dma_supported Use the existing variable that holds the physical address for xen_io_tlb_end to simplify xen_swiotlb_dma_supported a bit, and remove the otherwise unused xen_io_tlb_end variable and the xen_virt_to_bus helper. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index bffb35993c9d..e99f0614dcb9 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -46,7 +46,7 @@ * API. */ -static char *xen_io_tlb_start, *xen_io_tlb_end; +static char *xen_io_tlb_start; static unsigned long xen_io_tlb_nslabs; /* * Quick lookup value of the bus address of the IOTLB. @@ -82,11 +82,6 @@ static inline phys_addr_t xen_dma_to_phys(struct device *dev, return xen_bus_to_phys(dev, dma_to_phys(dev, dma_addr)); } -static inline dma_addr_t xen_virt_to_bus(struct device *dev, void *address) -{ - return xen_phys_to_dma(dev, virt_to_phys(address)); -} - static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); @@ -250,7 +245,6 @@ retry: rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); end: - xen_io_tlb_end = xen_io_tlb_start + bytes; if (!rc) swiotlb_set_max_segment(PAGE_SIZE); @@ -558,7 +552,7 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, static int xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return xen_virt_to_bus(hwdev, xen_io_tlb_end - 1) <= mask; + return xen_phys_to_dma(hwdev, io_tlb_end - 1) <= mask; } const struct dma_map_ops xen_swiotlb_dma_ops = { From 4035b43da6daa51668830becfe8537841414946b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:29 +0100 Subject: [PATCH 0110/1379] xen-swiotlb: remove xen_set_nslabs The xen_set_nslabs function is a little weird, as it has just one caller, that caller passes a global variable as the argument, which is then overriden in the function and a derivative of it returned. Just add a cpp symbol for the default size using a readable constant and open code the remaining three lines in the caller. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index e99f0614dcb9..5352655432e7 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -138,16 +138,6 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) } while (i < nslabs); return 0; } -static unsigned long xen_set_nslabs(unsigned long nr_tbl) -{ - if (!nr_tbl) { - xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); - xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); - } else - xen_io_tlb_nslabs = nr_tbl; - - return xen_io_tlb_nslabs << IO_TLB_SHIFT; -} enum xen_swiotlb_err { XEN_SWIOTLB_UNKNOWN = 0, @@ -170,6 +160,9 @@ static const char *xen_swiotlb_error(enum xen_swiotlb_err err) } return ""; } + +#define DEFAULT_NSLABS ALIGN(SZ_64M >> IO_TLB_SHIFT, IO_TLB_SEGSIZE) + int __ref xen_swiotlb_init(int verbose, bool early) { unsigned long bytes, order; @@ -179,8 +172,10 @@ int __ref xen_swiotlb_init(int verbose, bool early) xen_io_tlb_nslabs = swiotlb_nr_tbl(); retry: - bytes = xen_set_nslabs(xen_io_tlb_nslabs); - order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT); + if (!xen_io_tlb_nslabs) + xen_io_tlb_nslabs = DEFAULT_NSLABS; + bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; + order = get_order(bytes); /* * IO TLB memory already allocated. Just use it. From cbce99527ca7c4e98db9890651d5e9dfc2fb89ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:30 +0100 Subject: [PATCH 0111/1379] xen-swiotlb: remove xen_io_tlb_start and xen_io_tlb_nslabs The xen_io_tlb_start and xen_io_tlb_nslabs variables are now only used in xen_swiotlb_init, so replace them with local variables. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 57 +++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 5352655432e7..1a31ddf71397 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -40,14 +40,7 @@ #include #define MAX_DMA_BITS 32 -/* - * Used to do a quick range check in swiotlb_tbl_unmap_single and - * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this - * API. - */ -static char *xen_io_tlb_start; -static unsigned long xen_io_tlb_nslabs; /* * Quick lookup value of the bus address of the IOTLB. */ @@ -169,75 +162,75 @@ int __ref xen_swiotlb_init(int verbose, bool early) int rc = -ENOMEM; enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; unsigned int repeat = 3; + char *start; + unsigned long nslabs; - xen_io_tlb_nslabs = swiotlb_nr_tbl(); + nslabs = swiotlb_nr_tbl(); retry: - if (!xen_io_tlb_nslabs) - xen_io_tlb_nslabs = DEFAULT_NSLABS; - bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; + if (!nslabs) + nslabs = DEFAULT_NSLABS; + bytes = nslabs << IO_TLB_SHIFT; order = get_order(bytes); /* * IO TLB memory already allocated. Just use it. */ - if (io_tlb_start != 0) { - xen_io_tlb_start = phys_to_virt(io_tlb_start); + if (io_tlb_start != 0) goto end; - } /* * Get IO TLB memory from any location. */ if (early) { - xen_io_tlb_start = memblock_alloc(PAGE_ALIGN(bytes), + start = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE); - if (!xen_io_tlb_start) + if (!start) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, PAGE_ALIGN(bytes), PAGE_SIZE); } else { #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - xen_io_tlb_start = (void *)xen_get_swiotlb_free_pages(order); - if (xen_io_tlb_start) + start = (void *)xen_get_swiotlb_free_pages(order); + if (start) break; order--; } if (order != get_order(bytes)) { pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", (PAGE_SIZE << order) >> 20); - xen_io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; + nslabs = SLABS_PER_PAGE << order; + bytes = nslabs << IO_TLB_SHIFT; } } - if (!xen_io_tlb_start) { + if (!start) { m_ret = XEN_SWIOTLB_ENOMEM; goto error; } /* * And replace that memory with pages under 4GB. */ - rc = xen_swiotlb_fixup(xen_io_tlb_start, + rc = xen_swiotlb_fixup(start, bytes, - xen_io_tlb_nslabs); + nslabs); if (rc) { if (early) - memblock_free(__pa(xen_io_tlb_start), + memblock_free(__pa(start), PAGE_ALIGN(bytes)); else { - free_pages((unsigned long)xen_io_tlb_start, order); - xen_io_tlb_start = NULL; + free_pages((unsigned long)start, order); + start = NULL; } m_ret = XEN_SWIOTLB_EFIXUP; goto error; } if (early) { - if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, + if (swiotlb_init_with_tbl(start, nslabs, verbose)) panic("Cannot allocate SWIOTLB buffer"); rc = 0; } else - rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); + rc = swiotlb_late_init_with_tbl(start, nslabs); end: if (!rc) @@ -246,17 +239,17 @@ end: return rc; error: if (repeat--) { - xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ - (xen_io_tlb_nslabs >> 1)); + nslabs = max(1024UL, /* Min is 2MB */ + (nslabs >> 1)); pr_info("Lowering to %luMB\n", - (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); + (nslabs << IO_TLB_SHIFT) >> 20); goto retry; } pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); if (early) panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); else - free_pages((unsigned long)xen_io_tlb_start, order); + free_pages((unsigned long)start, order); return rc; } From 5d0538b2b884f7fd239f6ab3b667148dc57123f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:31 +0100 Subject: [PATCH 0112/1379] swiotlb: lift the double initialization protection from xen-swiotlb Lift the double initialization protection from xen-swiotlb to the core code to avoid exposing too many swiotlb internals. Also upgrade the check to a warning as it should not happen. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 7 ------- kernel/dma/swiotlb.c | 8 ++++++++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 1a31ddf71397..060eeb056486 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -172,12 +172,6 @@ retry: bytes = nslabs << IO_TLB_SHIFT; order = get_order(bytes); - /* - * IO TLB memory already allocated. Just use it. - */ - if (io_tlb_start != 0) - goto end; - /* * Get IO TLB memory from any location. */ @@ -232,7 +226,6 @@ retry: } else rc = swiotlb_late_init_with_tbl(start, nslabs); -end: if (!rc) swiotlb_set_max_segment(PAGE_SIZE); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 5fe8781be6f2..35e24f0ff8b2 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -229,6 +229,10 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) unsigned long i, bytes; size_t alloc_size; + /* protect against double initialization */ + if (WARN_ON_ONCE(io_tlb_start)) + return -ENOMEM; + bytes = nslabs << IO_TLB_SHIFT; io_tlb_nslabs = nslabs; @@ -367,6 +371,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { unsigned long i, bytes; + /* protect against double initialization */ + if (WARN_ON_ONCE(io_tlb_start)) + return -ENOMEM; + bytes = nslabs << IO_TLB_SHIFT; io_tlb_nslabs = nslabs; From a98f565462f0fca9096e8f53933364dc2a74bc90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:32 +0100 Subject: [PATCH 0113/1379] xen-swiotlb: split xen_swiotlb_init Split xen_swiotlb_init into a normal an an early case. That makes both much simpler and more readable, and also allows marking the early code as __init and x86-only. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- arch/arm/xen/mm.c | 2 +- arch/x86/xen/pci-swiotlb-xen.c | 4 +- drivers/xen/swiotlb-xen.c | 124 +++++++++++++++++++-------------- include/xen/swiotlb-xen.h | 3 +- 4 files changed, 75 insertions(+), 58 deletions(-) diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 467fa225c3d0..aae950cd053f 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -140,7 +140,7 @@ static int __init xen_mm_init(void) struct gnttab_cache_flush cflush; if (!xen_initial_domain()) return 0; - xen_swiotlb_init(1, false); + xen_swiotlb_init(); cflush.op = 0; cflush.a.dev_bus_addr = 0; diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 19ae3e4fe4e9..54f9aa7e8457 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -59,7 +59,7 @@ int __init pci_xen_swiotlb_detect(void) void __init pci_xen_swiotlb_init(void) { if (xen_swiotlb) { - xen_swiotlb_init(1, true /* early */); + xen_swiotlb_init_early(); dma_ops = &xen_swiotlb_dma_ops; #ifdef CONFIG_PCI @@ -76,7 +76,7 @@ int pci_xen_swiotlb_init_late(void) if (xen_swiotlb) return 0; - rc = xen_swiotlb_init(1, false /* late */); + rc = xen_swiotlb_init(); if (rc) return rc; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 060eeb056486..00adeb95ebb9 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -156,96 +156,112 @@ static const char *xen_swiotlb_error(enum xen_swiotlb_err err) #define DEFAULT_NSLABS ALIGN(SZ_64M >> IO_TLB_SHIFT, IO_TLB_SEGSIZE) -int __ref xen_swiotlb_init(int verbose, bool early) +int __ref xen_swiotlb_init(void) { - unsigned long bytes, order; - int rc = -ENOMEM; enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; + unsigned long nslabs, bytes, order; unsigned int repeat = 3; + int rc = -ENOMEM; char *start; - unsigned long nslabs; nslabs = swiotlb_nr_tbl(); -retry: if (!nslabs) nslabs = DEFAULT_NSLABS; +retry: + m_ret = XEN_SWIOTLB_ENOMEM; bytes = nslabs << IO_TLB_SHIFT; order = get_order(bytes); /* * Get IO TLB memory from any location. */ - if (early) { - start = memblock_alloc(PAGE_ALIGN(bytes), - PAGE_SIZE); - if (!start) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_ALIGN(bytes), PAGE_SIZE); - } else { #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) - while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - start = (void *)xen_get_swiotlb_free_pages(order); - if (start) - break; - order--; - } - if (order != get_order(bytes)) { - pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", - (PAGE_SIZE << order) >> 20); - nslabs = SLABS_PER_PAGE << order; - bytes = nslabs << IO_TLB_SHIFT; - } + while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { + start = (void *)xen_get_swiotlb_free_pages(order); + if (start) + break; + order--; } - if (!start) { - m_ret = XEN_SWIOTLB_ENOMEM; + if (!start) goto error; + if (order != get_order(bytes)) { + pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", + (PAGE_SIZE << order) >> 20); + nslabs = SLABS_PER_PAGE << order; + bytes = nslabs << IO_TLB_SHIFT; } + /* * And replace that memory with pages under 4GB. */ - rc = xen_swiotlb_fixup(start, - bytes, - nslabs); + rc = xen_swiotlb_fixup(start, bytes, nslabs); if (rc) { - if (early) - memblock_free(__pa(start), - PAGE_ALIGN(bytes)); - else { - free_pages((unsigned long)start, order); - start = NULL; - } + free_pages((unsigned long)start, order); m_ret = XEN_SWIOTLB_EFIXUP; goto error; } - if (early) { - if (swiotlb_init_with_tbl(start, nslabs, - verbose)) - panic("Cannot allocate SWIOTLB buffer"); - rc = 0; - } else - rc = swiotlb_late_init_with_tbl(start, nslabs); - - if (!rc) - swiotlb_set_max_segment(PAGE_SIZE); - - return rc; + rc = swiotlb_late_init_with_tbl(start, nslabs); + if (rc) + return rc; + swiotlb_set_max_segment(PAGE_SIZE); + return 0; error: if (repeat--) { - nslabs = max(1024UL, /* Min is 2MB */ - (nslabs >> 1)); + /* Min is 2MB */ + nslabs = max(1024UL, (nslabs >> 1)); pr_info("Lowering to %luMB\n", (nslabs << IO_TLB_SHIFT) >> 20); goto retry; } pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); - if (early) - panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); - else - free_pages((unsigned long)start, order); + free_pages((unsigned long)start, order); return rc; } +#ifdef CONFIG_X86 +void __init xen_swiotlb_init_early(void) +{ + unsigned long nslabs, bytes; + unsigned int repeat = 3; + char *start; + int rc; + + nslabs = swiotlb_nr_tbl(); + if (!nslabs) + nslabs = DEFAULT_NSLABS; +retry: + /* + * Get IO TLB memory from any location. + */ + bytes = nslabs << IO_TLB_SHIFT; + start = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE); + if (!start) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, PAGE_ALIGN(bytes), PAGE_SIZE); + + /* + * And replace that memory with pages under 4GB. + */ + rc = xen_swiotlb_fixup(start, bytes, nslabs); + if (rc) { + memblock_free(__pa(start), PAGE_ALIGN(bytes)); + if (repeat--) { + /* Min is 2MB */ + nslabs = max(1024UL, (nslabs >> 1)); + pr_info("Lowering to %luMB\n", + (nslabs << IO_TLB_SHIFT) >> 20); + goto retry; + } + panic("%s (rc:%d)", xen_swiotlb_error(XEN_SWIOTLB_EFIXUP), rc); + } + + if (swiotlb_init_with_tbl(start, nslabs, false)) + panic("Cannot allocate SWIOTLB buffer"); + swiotlb_set_max_segment(PAGE_SIZE); +} +#endif /* CONFIG_X86 */ + static void * xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index d5eaf9d682b8..6206b1ec9916 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -9,7 +9,8 @@ void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle, void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle, size_t size, enum dma_data_direction dir); -extern int xen_swiotlb_init(int verbose, bool early); +int xen_swiotlb_init(void); +void __init xen_swiotlb_init_early(void); extern const struct dma_map_ops xen_swiotlb_dma_ops; #endif /* __LINUX_SWIOTLB_XEN_H */ From 6bcd4ea717f3d26edf3da397c82fc9c2236f4f27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Mar 2021 08:44:33 +0100 Subject: [PATCH 0114/1379] xen-swiotlb: remove the unused size argument from xen_swiotlb_fixup Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 00adeb95ebb9..4ecfce2c6f72 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -104,8 +104,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr) return 0; } -static int -xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) +static int xen_swiotlb_fixup(void *buf, unsigned long nslabs) { int i, rc; int dma_bits; @@ -195,7 +194,7 @@ retry: /* * And replace that memory with pages under 4GB. */ - rc = xen_swiotlb_fixup(start, bytes, nslabs); + rc = xen_swiotlb_fixup(start, nslabs); if (rc) { free_pages((unsigned long)start, order); m_ret = XEN_SWIOTLB_EFIXUP; @@ -243,7 +242,7 @@ retry: /* * And replace that memory with pages under 4GB. */ - rc = xen_swiotlb_fixup(start, bytes, nslabs); + rc = xen_swiotlb_fixup(start, nslabs); if (rc) { memblock_free(__pa(start), PAGE_ALIGN(bytes)); if (repeat--) { From 2f100585d04506004b8027ec9bbaee26940a769f Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sun, 7 Mar 2021 10:24:46 +0800 Subject: [PATCH 0115/1379] riscv: Enable generic clockevent broadcast When percpu-timers are stopped by deep power saving mode, we need system timer help to broadcast IPI_TIMER. This is first introduced by broken x86 hardware, where the local apic timer stops in C3 state. But many other architectures(powerpc, mips, arm, hexagon, openrisc, sh) have supported the infrastructure to deal with Power Management issues. Signed-off-by: Guo Ren Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 ++ arch/riscv/kernel/smp.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 6978b0739718..8ea60a0a19ae 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -29,6 +29,7 @@ config RISCV select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX if MMU + select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU @@ -40,6 +41,7 @@ config RISCV select EDAC_SUPPORT select GENERIC_ARCH_TOPOLOGY if SMP select GENERIC_ATOMIC64 if !64BIT + select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO select GENERIC_IOREMAP diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index ea028d9e0d24..8325d33411d8 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -27,6 +28,7 @@ enum ipi_message_type { IPI_CALL_FUNC, IPI_CPU_STOP, IPI_IRQ_WORK, + IPI_TIMER, IPI_MAX }; @@ -176,6 +178,12 @@ void handle_IPI(struct pt_regs *regs) irq_work_run(); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + if (ops & (1 << IPI_TIMER)) { + stats[IPI_TIMER]++; + tick_receive_broadcast(); + } +#endif BUG_ON((ops >> IPI_MAX) != 0); /* Order data access and bit testing. */ @@ -192,6 +200,7 @@ static const char * const ipi_names[] = { [IPI_CALL_FUNC] = "Function call interrupts", [IPI_CPU_STOP] = "CPU stop interrupts", [IPI_IRQ_WORK] = "IRQ work interrupts", + [IPI_TIMER] = "Timer broadcast interrupts", }; void show_ipi_stats(struct seq_file *p, int prec) @@ -217,6 +226,13 @@ void arch_send_call_function_single_ipi(int cpu) send_ipi_single(cpu, IPI_CALL_FUNC); } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +void tick_broadcast(const struct cpumask *mask) +{ + send_ipi_mask(mask, IPI_TIMER); +} +#endif + void smp_send_stop(void) { unsigned long timeout; From 007d81a4519f04fa5ced5e9e28bf70cd753c398d Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Thu, 25 Feb 2021 22:31:19 +0100 Subject: [PATCH 0116/1379] thermal/drivers/qcom/tsens_v1: Enable sensor 3 on MSM8976 The sensor *is* in fact used and does report temperature. Signed-off-by: Konrad Dybcio Acked-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210225213119.116550-1-konrad.dybcio@somainline.org --- drivers/thermal/qcom/tsens-v1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/qcom/tsens-v1.c b/drivers/thermal/qcom/tsens-v1.c index 3c19a3800c6d..573e261ccca7 100644 --- a/drivers/thermal/qcom/tsens-v1.c +++ b/drivers/thermal/qcom/tsens-v1.c @@ -380,11 +380,11 @@ static const struct tsens_ops ops_8976 = { .get_temp = get_temp_tsens_valid, }; -/* Valid for both MSM8956 and MSM8976. Sensor ID 3 is unused. */ +/* Valid for both MSM8956 and MSM8976. */ struct tsens_plat_data data_8976 = { .num_sensors = 11, .ops = &ops_8976, - .hw_ids = (unsigned int[]){0, 1, 2, 4, 5, 6, 7, 8, 9, 10}, + .hw_ids = (unsigned int[]){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, .feat = &tsens_v1_feat, .fields = tsens_v1_regfields, }; From 60d7b22d25930e5c3e03eca32047e3313fa76897 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 11 Mar 2021 15:04:08 +0100 Subject: [PATCH 0117/1379] rpmsg: char: Rename rpmsg_char_init to rpmsg_chrdev_init To be coherent with the other functions which are prefixed by rpmsg_chrdev, rename the rpmsg_char_init function. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210311140413.31725-2-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- drivers/rpmsg/rpmsg_char.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c index 4bbbacdbf3bb..9e33b53bbf56 100644 --- a/drivers/rpmsg/rpmsg_char.c +++ b/drivers/rpmsg/rpmsg_char.c @@ -543,7 +543,7 @@ static struct rpmsg_driver rpmsg_chrdev_driver = { }, }; -static int rpmsg_char_init(void) +static int rpmsg_chrdev_init(void) { int ret; @@ -569,7 +569,7 @@ static int rpmsg_char_init(void) return ret; } -postcore_initcall(rpmsg_char_init); +postcore_initcall(rpmsg_chrdev_init); static void rpmsg_chrdev_exit(void) { From 3093c3c7c136458af692d5c3d309a66c3c12d9f4 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 11 Mar 2021 15:04:09 +0100 Subject: [PATCH 0118/1379] rpmsg: Move RPMSG_ADDR_ANY in user API As the RPMSG_ADDR_ANY is a valid src or dst address that can be set by user applications, migrate its definition in user API. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210311140413.31725-3-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- include/linux/rpmsg.h | 3 +-- include/uapi/linux/rpmsg.h | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index a5db828b2420..d97dcd049f18 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -18,8 +18,7 @@ #include #include #include - -#define RPMSG_ADDR_ANY 0xFFFFFFFF +#include struct rpmsg_device; struct rpmsg_endpoint; diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h index e14c6dab4223..5e00748da319 100644 --- a/include/uapi/linux/rpmsg.h +++ b/include/uapi/linux/rpmsg.h @@ -9,6 +9,8 @@ #include #include +#define RPMSG_ADDR_ANY 0xFFFFFFFF + /** * struct rpmsg_endpoint_info - endpoint info representation * @name: name of service From 809328b40cfb152f75541aa3dcbbe4903098963b Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 11 Mar 2021 15:04:10 +0100 Subject: [PATCH 0119/1379] rpmsg: Add short description of the IOCTL defined in UAPI. Add a description of the IOCTLs and provide information on the default value of the source and destination addresses. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210311140413.31725-4-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- include/uapi/linux/rpmsg.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/rpmsg.h b/include/uapi/linux/rpmsg.h index 5e00748da319..f5ca8740f3fb 100644 --- a/include/uapi/linux/rpmsg.h +++ b/include/uapi/linux/rpmsg.h @@ -14,8 +14,8 @@ /** * struct rpmsg_endpoint_info - endpoint info representation * @name: name of service - * @src: local address - * @dst: destination address + * @src: local address. To set to RPMSG_ADDR_ANY if not used. + * @dst: destination address. To set to RPMSG_ADDR_ANY if not used. */ struct rpmsg_endpoint_info { char name[32]; @@ -23,7 +23,14 @@ struct rpmsg_endpoint_info { __u32 dst; }; +/** + * Instantiate a new rmpsg char device endpoint. + */ #define RPMSG_CREATE_EPT_IOCTL _IOW(0xb5, 0x1, struct rpmsg_endpoint_info) + +/** + * Destroy a rpmsg char device endpoint created by the RPMSG_CREATE_EPT_IOCTL. + */ #define RPMSG_DESTROY_EPT_IOCTL _IO(0xb5, 0x2) #endif From b4ce7e2ebcc52ff907c5a922bf19c3dfa39dddb1 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 11 Mar 2021 15:04:11 +0100 Subject: [PATCH 0120/1379] rpmsg: char: Use rpmsg_sendto to specify the message destination address When the endpoint device is created by the application, a destination address is specified in the rpmsg_channel_info structure. Since the rpmsg_endpoint structure does not store the destination address, this destination address must be specified when sending a message. Replaces rpmsg_send with rpmsg_sendto to allow to specify the destination address. This implementation is requested for compatibly with some rpmsg backends like the virtio backend. For this, the GLINK an SMD drivers have been updated to support the rpmsg_sendto, even if the destination address is ignored for these backends. For these drivers, the rpmsg_send and rpmsg_trysend ops are preserved to avoid breaking the legacy. Signed-off-by: Arnaud Pouliquen Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210311140413.31725-5-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- drivers/rpmsg/qcom_glink_native.c | 16 ++++++++++++++++ drivers/rpmsg/qcom_smd.c | 16 ++++++++++++++++ drivers/rpmsg/rpmsg_char.c | 4 ++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/rpmsg/qcom_glink_native.c b/drivers/rpmsg/qcom_glink_native.c index 27a05167c18c..edae0c9a8236 100644 --- a/drivers/rpmsg/qcom_glink_native.c +++ b/drivers/rpmsg/qcom_glink_native.c @@ -1332,6 +1332,20 @@ static int qcom_glink_trysend(struct rpmsg_endpoint *ept, void *data, int len) return __qcom_glink_send(channel, data, len, false); } +static int qcom_glink_sendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst) +{ + struct glink_channel *channel = to_glink_channel(ept); + + return __qcom_glink_send(channel, data, len, true); +} + +static int qcom_glink_trysendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst) +{ + struct glink_channel *channel = to_glink_channel(ept); + + return __qcom_glink_send(channel, data, len, false); +} + /* * Finds the device_node for the glink child interested in this channel. */ @@ -1364,7 +1378,9 @@ static const struct rpmsg_device_ops glink_device_ops = { static const struct rpmsg_endpoint_ops glink_endpoint_ops = { .destroy_ept = qcom_glink_destroy_ept, .send = qcom_glink_send, + .sendto = qcom_glink_sendto, .trysend = qcom_glink_trysend, + .trysendto = qcom_glink_trysendto, }; static void qcom_glink_rpdev_release(struct device *dev) diff --git a/drivers/rpmsg/qcom_smd.c b/drivers/rpmsg/qcom_smd.c index 19903de6268d..8da1b5cb31b3 100644 --- a/drivers/rpmsg/qcom_smd.c +++ b/drivers/rpmsg/qcom_smd.c @@ -974,6 +974,20 @@ static int qcom_smd_trysend(struct rpmsg_endpoint *ept, void *data, int len) return __qcom_smd_send(qsept->qsch, data, len, false); } +static int qcom_smd_sendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst) +{ + struct qcom_smd_endpoint *qsept = to_smd_endpoint(ept); + + return __qcom_smd_send(qsept->qsch, data, len, true); +} + +static int qcom_smd_trysendto(struct rpmsg_endpoint *ept, void *data, int len, u32 dst) +{ + struct qcom_smd_endpoint *qsept = to_smd_endpoint(ept); + + return __qcom_smd_send(qsept->qsch, data, len, false); +} + static __poll_t qcom_smd_poll(struct rpmsg_endpoint *ept, struct file *filp, poll_table *wait) { @@ -1038,7 +1052,9 @@ static const struct rpmsg_device_ops qcom_smd_device_ops = { static const struct rpmsg_endpoint_ops qcom_smd_endpoint_ops = { .destroy_ept = qcom_smd_destroy_ept, .send = qcom_smd_send, + .sendto = qcom_smd_sendto, .trysend = qcom_smd_trysend, + .trysendto = qcom_smd_trysendto, .poll = qcom_smd_poll, }; diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c index 9e33b53bbf56..95a65f7a9d8d 100644 --- a/drivers/rpmsg/rpmsg_char.c +++ b/drivers/rpmsg/rpmsg_char.c @@ -239,9 +239,9 @@ static ssize_t rpmsg_eptdev_write_iter(struct kiocb *iocb, } if (filp->f_flags & O_NONBLOCK) - ret = rpmsg_trysend(eptdev->ept, kbuf, len); + ret = rpmsg_trysendto(eptdev->ept, kbuf, len, eptdev->chinfo.dst); else - ret = rpmsg_send(eptdev->ept, kbuf, len); + ret = rpmsg_sendto(eptdev->ept, kbuf, len, eptdev->chinfo.dst); unlock_eptdev: mutex_unlock(&eptdev->ept_lock); From c486682ae1e2b149add22f44cf413b3103e3ef39 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 11 Mar 2021 15:04:12 +0100 Subject: [PATCH 0121/1379] rpmsg: virtio: Register the rpmsg_char device Instantiate the rpmsg_char device on virtio RPMsg bus creation. This provides the capability, with the RPMSG_CREATE_EPT_IOCTL ioctl, to create RPMsg char device endpoints relying on the rpmsg_chrdev_create_eptdev API. Notice that the created endpoints are attached to the rpmsg_ctldev device, but not associated to a channel. As consequence, the endpoint source and destination addresses have to been specified and there is no channel creation and no name service announcement to inform the remote side. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210311140413.31725-6-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- drivers/rpmsg/virtio_rpmsg_bus.c | 62 +++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index e87d4cf926eb..8e49a3bacfc7 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -813,14 +813,57 @@ static void rpmsg_xmit_done(struct virtqueue *svq) wake_up_interruptible(&vrp->sendq); } +/* + * Called to expose to user a /dev/rpmsg_ctrlX interface allowing to + * create endpoint-to-endpoint communication without associated RPMsg channel. + * The endpoints are rattached to the ctrldev RPMsg device. + */ +static struct rpmsg_device *rpmsg_virtio_add_ctrl_dev(struct virtio_device *vdev) +{ + struct virtproc_info *vrp = vdev->priv; + struct virtio_rpmsg_channel *vch; + struct rpmsg_device *rpdev_ctrl; + int err = 0; + + vch = kzalloc(sizeof(*vch), GFP_KERNEL); + if (!vch) + return ERR_PTR(-ENOMEM); + + /* Link the channel to the vrp */ + vch->vrp = vrp; + + /* Assign public information to the rpmsg_device */ + rpdev_ctrl = &vch->rpdev; + rpdev_ctrl->ops = &virtio_rpmsg_ops; + + rpdev_ctrl->dev.parent = &vrp->vdev->dev; + rpdev_ctrl->dev.release = virtio_rpmsg_release_device; + rpdev_ctrl->little_endian = virtio_is_little_endian(vrp->vdev); + + err = rpmsg_chrdev_register_device(rpdev_ctrl); + if (err) { + kfree(vch); + return ERR_PTR(err); + } + + return rpdev_ctrl; +} + +static void rpmsg_virtio_del_ctrl_dev(struct rpmsg_device *rpdev_ctrl) +{ + if (!rpdev_ctrl) + return; + kfree(to_virtio_rpmsg_channel(rpdev_ctrl)); +} + static int rpmsg_probe(struct virtio_device *vdev) { vq_callback_t *vq_cbs[] = { rpmsg_recv_done, rpmsg_xmit_done }; static const char * const names[] = { "input", "output" }; struct virtqueue *vqs[2]; struct virtproc_info *vrp; - struct virtio_rpmsg_channel *vch; - struct rpmsg_device *rpdev_ns; + struct virtio_rpmsg_channel *vch = NULL; + struct rpmsg_device *rpdev_ns, *rpdev_ctrl; void *bufs_va; int err = 0, i; size_t total_buf_space; @@ -894,12 +937,18 @@ static int rpmsg_probe(struct virtio_device *vdev) vdev->priv = vrp; + rpdev_ctrl = rpmsg_virtio_add_ctrl_dev(vdev); + if (IS_ERR(rpdev_ctrl)) { + err = PTR_ERR(rpdev_ctrl); + goto free_coherent; + } + /* if supported by the remote processor, enable the name service */ if (virtio_has_feature(vdev, VIRTIO_RPMSG_F_NS)) { vch = kzalloc(sizeof(*vch), GFP_KERNEL); if (!vch) { err = -ENOMEM; - goto free_coherent; + goto free_ctrldev; } /* Link the channel to our vrp */ @@ -915,7 +964,7 @@ static int rpmsg_probe(struct virtio_device *vdev) err = rpmsg_ns_register_device(rpdev_ns); if (err) - goto free_coherent; + goto free_vch; } /* @@ -939,8 +988,11 @@ static int rpmsg_probe(struct virtio_device *vdev) return 0; -free_coherent: +free_vch: kfree(vch); +free_ctrldev: + rpmsg_virtio_del_ctrl_dev(rpdev_ctrl); +free_coherent: dma_free_coherent(vdev->dev.parent, total_buf_space, bufs_va, vrp->bufs_dma); vqs_del: From 964e8bedd5a13a662e8e418ed763351c07d0dac7 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Thu, 11 Mar 2021 15:04:13 +0100 Subject: [PATCH 0122/1379] rpmsg: char: Return an error if device already open The rpmsg_create_ept function is invoked when the device is opened. As only one endpoint must be created per device. It is not possible to open the same device twice. But there is nothing to prevent multi open. Return -EBUSY when device is already opened to have a generic error instead of relying on the back-end to potentially detect the error. Without this patch for instance the GLINK driver return -EBUSY while the virtio bus return -ENOSPC. Reviewed-by: Bjorn Andersson Reviewed-by: Mathieu Poirier Signed-off-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210311140413.31725-7-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- drivers/rpmsg/rpmsg_char.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c index 95a65f7a9d8d..2bebc9b2d163 100644 --- a/drivers/rpmsg/rpmsg_char.c +++ b/drivers/rpmsg/rpmsg_char.c @@ -127,6 +127,9 @@ static int rpmsg_eptdev_open(struct inode *inode, struct file *filp) struct rpmsg_device *rpdev = eptdev->rpdev; struct device *dev = &eptdev->dev; + if (eptdev->ept) + return -EBUSY; + get_device(dev); ept = rpmsg_create_ept(rpdev, rpmsg_ept_cb, eptdev, eptdev->chinfo); From 1cb8f3e2d8fe7533c26df9925a83bd3d185b312e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 20 Jan 2021 14:25:37 +0100 Subject: [PATCH 0123/1379] hwspinlock: remove sirf driver The CSR SiRF prima2/atlas platforms are getting removed, so this driver is no longer needed. Cc: Barry Song Link: https://lore.kernel.org/linux-arm-kernel/20210120124812.2800027-1-arnd@kernel.org/T/ Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210120132537.2285157-1-arnd@kernel.org Signed-off-by: Bjorn Andersson --- .../bindings/hwlock/sirf,hwspinlock.txt | 28 ----- drivers/hwspinlock/Kconfig | 11 -- drivers/hwspinlock/Makefile | 1 - drivers/hwspinlock/sirf_hwspinlock.c | 105 ------------------ 4 files changed, 145 deletions(-) delete mode 100644 Documentation/devicetree/bindings/hwlock/sirf,hwspinlock.txt delete mode 100644 drivers/hwspinlock/sirf_hwspinlock.c diff --git a/Documentation/devicetree/bindings/hwlock/sirf,hwspinlock.txt b/Documentation/devicetree/bindings/hwlock/sirf,hwspinlock.txt deleted file mode 100644 index 9bb1240a68e0..000000000000 --- a/Documentation/devicetree/bindings/hwlock/sirf,hwspinlock.txt +++ /dev/null @@ -1,28 +0,0 @@ -SIRF Hardware spinlock device Binding ------------------------------------------------ - -Required properties : -- compatible : shall contain only one of the following: - "sirf,hwspinlock" - -- reg : the register address of hwspinlock - -- #hwlock-cells : hwlock users only use the hwlock id to represent a specific - hwlock, so the number of cells should be <1> here. - -Please look at the generic hwlock binding for usage information for consumers, -"Documentation/devicetree/bindings/hwlock/hwlock.txt" - -Example of hwlock provider: - hwlock { - compatible = "sirf,hwspinlock"; - reg = <0x13240000 0x00010000>; - #hwlock-cells = <1>; - }; - -Example of hwlock users: - node { - ... - hwlocks = <&hwlock 2>; - ... - }; diff --git a/drivers/hwspinlock/Kconfig b/drivers/hwspinlock/Kconfig index 32cd26352f38..53e13476e831 100644 --- a/drivers/hwspinlock/Kconfig +++ b/drivers/hwspinlock/Kconfig @@ -28,17 +28,6 @@ config HWSPINLOCK_QCOM If unsure, say N. -config HWSPINLOCK_SIRF - tristate "SIRF Hardware Spinlock device" - depends on ARCH_SIRF || COMPILE_TEST - help - Say y here to support the SIRF Hardware Spinlock device, which - provides a synchronisation mechanism for the various processors - on the SoC. - - It's safe to say n here if you're not interested in SIRF hardware - spinlock or just want a bare minimum kernel. - config HWSPINLOCK_SPRD tristate "SPRD Hardware Spinlock device" depends on ARCH_SPRD || COMPILE_TEST diff --git a/drivers/hwspinlock/Makefile b/drivers/hwspinlock/Makefile index ed053e3f02be..1f8dd6f5814f 100644 --- a/drivers/hwspinlock/Makefile +++ b/drivers/hwspinlock/Makefile @@ -6,7 +6,6 @@ obj-$(CONFIG_HWSPINLOCK) += hwspinlock_core.o obj-$(CONFIG_HWSPINLOCK_OMAP) += omap_hwspinlock.o obj-$(CONFIG_HWSPINLOCK_QCOM) += qcom_hwspinlock.o -obj-$(CONFIG_HWSPINLOCK_SIRF) += sirf_hwspinlock.o obj-$(CONFIG_HWSPINLOCK_SPRD) += sprd_hwspinlock.o obj-$(CONFIG_HWSPINLOCK_STM32) += stm32_hwspinlock.o obj-$(CONFIG_HSEM_U8500) += u8500_hsem.o diff --git a/drivers/hwspinlock/sirf_hwspinlock.c b/drivers/hwspinlock/sirf_hwspinlock.c deleted file mode 100644 index a3f77120bad7..000000000000 --- a/drivers/hwspinlock/sirf_hwspinlock.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * SIRF hardware spinlock driver - * - * Copyright (c) 2015 Cambridge Silicon Radio Limited, a CSR plc group company. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hwspinlock_internal.h" - -struct sirf_hwspinlock { - void __iomem *io_base; - struct hwspinlock_device bank; -}; - -/* Number of Hardware Spinlocks*/ -#define HW_SPINLOCK_NUMBER 30 - -/* Hardware spinlock register offsets */ -#define HW_SPINLOCK_BASE 0x404 -#define HW_SPINLOCK_OFFSET(x) (HW_SPINLOCK_BASE + 0x4 * (x)) - -static int sirf_hwspinlock_trylock(struct hwspinlock *lock) -{ - void __iomem *lock_addr = lock->priv; - - /* attempt to acquire the lock by reading value == 1 from it */ - return !!readl(lock_addr); -} - -static void sirf_hwspinlock_unlock(struct hwspinlock *lock) -{ - void __iomem *lock_addr = lock->priv; - - /* release the lock by writing 0 to it */ - writel(0, lock_addr); -} - -static const struct hwspinlock_ops sirf_hwspinlock_ops = { - .trylock = sirf_hwspinlock_trylock, - .unlock = sirf_hwspinlock_unlock, -}; - -static int sirf_hwspinlock_probe(struct platform_device *pdev) -{ - struct sirf_hwspinlock *hwspin; - struct hwspinlock *hwlock; - int idx; - - if (!pdev->dev.of_node) - return -ENODEV; - - hwspin = devm_kzalloc(&pdev->dev, - struct_size(hwspin, bank.lock, - HW_SPINLOCK_NUMBER), - GFP_KERNEL); - if (!hwspin) - return -ENOMEM; - - /* retrieve io base */ - hwspin->io_base = devm_platform_ioremap_resource(pdev, 0); - if (IS_ERR(hwspin->io_base)) - return PTR_ERR(hwspin->io_base); - - for (idx = 0; idx < HW_SPINLOCK_NUMBER; idx++) { - hwlock = &hwspin->bank.lock[idx]; - hwlock->priv = hwspin->io_base + HW_SPINLOCK_OFFSET(idx); - } - - platform_set_drvdata(pdev, hwspin); - - return devm_hwspin_lock_register(&pdev->dev, &hwspin->bank, - &sirf_hwspinlock_ops, 0, - HW_SPINLOCK_NUMBER); -} - -static const struct of_device_id sirf_hwpinlock_ids[] = { - { .compatible = "sirf,hwspinlock", }, - {}, -}; -MODULE_DEVICE_TABLE(of, sirf_hwpinlock_ids); - -static struct platform_driver sirf_hwspinlock_driver = { - .probe = sirf_hwspinlock_probe, - .driver = { - .name = "atlas7_hwspinlock", - .of_match_table = sirf_hwpinlock_ids, - }, -}; - -module_platform_driver(sirf_hwspinlock_driver); - -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("SIRF Hardware spinlock driver"); -MODULE_AUTHOR("Wei Chen "); From 9af2a2a9c64ee68a5dc8271d54235609191f1cd1 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 11 Mar 2021 16:26:05 -0800 Subject: [PATCH 0124/1379] remoteproc: qcom_q6v5_mss: Provide errors for firmware-name parsing Failing to read the "firmware-name" DT property without informing the developer is annoying, add some helpful debug prints. Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210312002605.3273255-1-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_mss.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c index 66106ba25ba3..15abfbba78d2 100644 --- a/drivers/remoteproc/qcom_q6v5_mss.c +++ b/drivers/remoteproc/qcom_q6v5_mss.c @@ -1661,8 +1661,10 @@ static int q6v5_probe(struct platform_device *pdev) mba_image = desc->hexagon_mba_image; ret = of_property_read_string_index(pdev->dev.of_node, "firmware-name", 0, &mba_image); - if (ret < 0 && ret != -EINVAL) + if (ret < 0 && ret != -EINVAL) { + dev_err(&pdev->dev, "unable to read mba firmware-name\n"); return ret; + } rproc = rproc_alloc(&pdev->dev, pdev->name, &q6v5_ops, mba_image, sizeof(*qproc)); @@ -1680,8 +1682,10 @@ static int q6v5_probe(struct platform_device *pdev) qproc->hexagon_mdt_image = "modem.mdt"; ret = of_property_read_string_index(pdev->dev.of_node, "firmware-name", 1, &qproc->hexagon_mdt_image); - if (ret < 0 && ret != -EINVAL) + if (ret < 0 && ret != -EINVAL) { + dev_err(&pdev->dev, "unable to read mpss firmware-name\n"); goto free_rproc; + } platform_set_drvdata(pdev, qproc); From 3d2ee78906af5f08d499d6aa3aa504406fa38106 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 12 Mar 2021 15:20:02 -0800 Subject: [PATCH 0125/1379] remoteproc: qcom_q6v5_mss: Validate p_filesz in ELF loader Analog to the issue in the common mdt_loader code the MSS ELF loader does not validate that p_filesz bytes will fit in the memory region and that the loaded segments are not truncated. Fix this in the same way as proposed for the mdt_loader. Reviewed-by: Mathieu Poirier Fixes: 135b9e8d1cd8 ("remoteproc: qcom_q6v5_mss: Validate modem blob firmware size before load") Link: https://lore.kernel.org/r/20210312232002.3466791-1-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_mss.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/remoteproc/qcom_q6v5_mss.c b/drivers/remoteproc/qcom_q6v5_mss.c index 15abfbba78d2..423b31dfa574 100644 --- a/drivers/remoteproc/qcom_q6v5_mss.c +++ b/drivers/remoteproc/qcom_q6v5_mss.c @@ -1210,6 +1210,14 @@ static int q6v5_mpss_load(struct q6v5 *qproc) goto release_firmware; } + if (phdr->p_filesz > phdr->p_memsz) { + dev_err(qproc->dev, + "refusing to load segment %d with p_filesz > p_memsz\n", + i); + ret = -EINVAL; + goto release_firmware; + } + ptr = memremap(qproc->mpss_phys + offset, phdr->p_memsz, MEMREMAP_WC); if (!ptr) { dev_err(qproc->dev, @@ -1241,6 +1249,16 @@ static int q6v5_mpss_load(struct q6v5 *qproc) goto release_firmware; } + if (seg_fw->size != phdr->p_filesz) { + dev_err(qproc->dev, + "failed to load segment %d from truncated file %s\n", + i, fw_name); + ret = -EINVAL; + release_firmware(seg_fw); + memunmap(ptr); + goto release_firmware; + } + release_firmware(seg_fw); } From 16324fc8def1c08a92261089aaf503aca3381aa6 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:37 -0700 Subject: [PATCH 0126/1379] remoteproc: Remove useless check in rproc_del() Whether started at probe() time or thereafter from the command line, a remote processor needs to be shut down before the final cleanup phases can happen. Otherwise the system may be left in an unpredictable state where the remote processor is expecting the remoteproc core to be providing services when in fact it no longer exist. Invariably calling rproc_shutdown() is fine since it will return immediately if the remote processor has already been switched off. Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-2-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 5071cdbfc926..4c2e678732f5 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2353,10 +2353,8 @@ int rproc_del(struct rproc *rproc) if (!rproc) return -EINVAL; - /* if rproc is marked always-on, rproc_add() booted it */ /* TODO: make sure this works with rproc->power > 1 */ - if (rproc->auto_boot) - rproc_shutdown(rproc); + rproc_shutdown(rproc); mutex_lock(&rproc->lock); rproc->state = RPROC_DELETED; From 6a6c4dc0e5de5dc4fec0ccda417c26f5814be380 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:38 -0700 Subject: [PATCH 0127/1379] remoteproc: Rename function rproc_actuate() Rename function rproc_actuate() to rproc_attach(). That way it is easy to understand that it does the opposite of rproc_detach(). Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-3-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 4c2e678732f5..9127b2aa5424 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1422,7 +1422,7 @@ reset_table_ptr: return ret; } -static int rproc_attach(struct rproc *rproc) +static int __rproc_attach(struct rproc *rproc) { struct device *dev = &rproc->dev; int ret; @@ -1547,7 +1547,7 @@ disable_iommu: * Attach to remote processor - similar to rproc_fw_boot() but without * the steps that deal with the firmware image. */ -static int rproc_actuate(struct rproc *rproc) +static int rproc_attach(struct rproc *rproc) { struct device *dev = &rproc->dev; int ret; @@ -1587,7 +1587,7 @@ static int rproc_actuate(struct rproc *rproc) goto clean_up_resources; } - ret = rproc_attach(rproc); + ret = __rproc_attach(rproc); if (ret) goto clean_up_resources; @@ -1808,7 +1808,7 @@ int rproc_boot(struct rproc *rproc) if (rproc->state == RPROC_DETACHED) { dev_info(dev, "attaching to %s\n", rproc->name); - ret = rproc_actuate(rproc); + ret = rproc_attach(rproc); } else { dev_info(dev, "powering up %s\n", rproc->name); From 4196d18903f94090f0a223d65de25e3bf50a3d13 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:39 -0700 Subject: [PATCH 0128/1379] remoteproc: Add new RPROC_ATTACHED state Add a new RPROC_ATTACHED state to take into account scenarios where the remoteproc core needs to attach to a remote processor that is booted by another entity. Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-4-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_sysfs.c | 1 + include/linux/remoteproc.h | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index 6840dad931d5..26fd9ceecdb5 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -172,6 +172,7 @@ static const char * const rproc_state_string[] = { [RPROC_RUNNING] = "running", [RPROC_CRASHED] = "crashed", [RPROC_DELETED] = "deleted", + [RPROC_ATTACHED] = "attached", [RPROC_DETACHED] = "detached", [RPROC_LAST] = "invalid", }; diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 1b7d56c7a453..9193a8fb5b68 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -407,6 +407,8 @@ struct rproc_ops { * @RPROC_RUNNING: device is up and running * @RPROC_CRASHED: device has crashed; need to start recovery * @RPROC_DELETED: device is deleted + * @RPROC_ATTACHED: device has been booted by another entity and the core + * has attached to it * @RPROC_DETACHED: device has been booted by another entity and waiting * for the core to attach to it * @RPROC_LAST: just keep this one at the end @@ -423,8 +425,9 @@ enum rproc_state { RPROC_RUNNING = 2, RPROC_CRASHED = 3, RPROC_DELETED = 4, - RPROC_DETACHED = 5, - RPROC_LAST = 6, + RPROC_ATTACHED = 5, + RPROC_DETACHED = 6, + RPROC_LAST = 7, }; /** From 76f4c87587e2ff41e9b9867ffde2137f27ba39b9 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:40 -0700 Subject: [PATCH 0129/1379] remoteproc: Properly represent the attached state There is a need to know when a remote processor has been attached to rather than booted by the remoteproc core. In order to avoid manipulating two variables, i.e rproc::autonomous and rproc::state, get rid of the former and simply use the newly introduced RPROC_ATTACHED state. Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-5-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 20 +------------------- drivers/remoteproc/remoteproc_sysfs.c | 5 +---- include/linux/remoteproc.h | 2 -- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 9127b2aa5424..d828c01f4f8a 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1450,7 +1450,7 @@ static int __rproc_attach(struct rproc *rproc) goto stop_rproc; } - rproc->state = RPROC_RUNNING; + rproc->state = RPROC_ATTACHED; dev_info(dev, "remote processor %s is now attached\n", rproc->name); @@ -1665,14 +1665,6 @@ static int rproc_stop(struct rproc *rproc, bool crashed) rproc->state = RPROC_OFFLINE; - /* - * The remote processor has been stopped and is now offline, which means - * that the next time it is brought back online the remoteproc core will - * be responsible to load its firmware. As such it is no longer - * autonomous. - */ - rproc->autonomous = false; - dev_info(dev, "stopped remote processor %s\n", rproc->name); return 0; @@ -2083,16 +2075,6 @@ int rproc_add(struct rproc *rproc) if (ret < 0) return ret; - /* - * Remind ourselves the remote processor has been attached to rather - * than booted by the remoteproc core. This is important because the - * RPROC_DETACHED state will be lost as soon as the remote processor - * has been attached to. Used in firmware_show() and reset in - * rproc_stop(). - */ - if (rproc->state == RPROC_DETACHED) - rproc->autonomous = true; - /* if rproc is marked always-on, request it to boot */ if (rproc->auto_boot) { ret = rproc_trigger_auto_boot(rproc); diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index 26fd9ceecdb5..4f58be1e13c1 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -138,11 +138,8 @@ static ssize_t firmware_show(struct device *dev, struct device_attribute *attr, * If the remote processor has been started by an external * entity we have no idea of what image it is running. As such * simply display a generic string rather then rproc->firmware. - * - * Here we rely on the autonomous flag because a remote processor - * may have been attached to and currently in a running state. */ - if (rproc->autonomous) + if (rproc->state == RPROC_ATTACHED) firmware = "unknown"; return sprintf(buf, "%s\n", firmware); diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 9193a8fb5b68..9e42e90cd9da 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -514,7 +514,6 @@ struct rproc_dump_segment { * @table_sz: size of @cached_table * @has_iommu: flag to indicate if remote processor is behind an MMU * @auto_boot: flag to indicate if remote processor should be auto-started - * @autonomous: true if an external entity has booted the remote processor * @dump_segments: list of segments in the firmware * @nb_vdev: number of vdev currently handled by rproc * @char_dev: character device of the rproc @@ -551,7 +550,6 @@ struct rproc { size_t table_sz; bool has_iommu; bool auto_boot; - bool autonomous; struct list_head dump_segments; int nb_vdev; u8 elf_class; From 1a631382be1d22ddab0582dae3498b3d28e2e44a Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:41 -0700 Subject: [PATCH 0130/1379] remoteproc: Add new get_loaded_rsc_table() to rproc_ops Add a new get_loaded_rsc_table() operation in order to support scenarios where the remoteproc core has booted a remote processor and detaches from it. When re-attaching to the remote processor, the core needs to know where the resource table has been placed in memory. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-6-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 32 ++++++++++++++++++++++++ drivers/remoteproc/remoteproc_internal.h | 10 ++++++++ include/linux/remoteproc.h | 6 ++++- 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index d828c01f4f8a..c7e1cf1b6f66 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1543,6 +1543,32 @@ disable_iommu: return ret; } +static int rproc_set_rsc_table(struct rproc *rproc) +{ + struct resource_table *table_ptr; + struct device *dev = &rproc->dev; + size_t table_sz; + int ret; + + table_ptr = rproc_get_loaded_rsc_table(rproc, &table_sz); + if (!table_ptr) { + /* Not having a resource table is acceptable */ + return 0; + } + + if (IS_ERR(table_ptr)) { + ret = PTR_ERR(table_ptr); + dev_err(dev, "can't load resource table: %d\n", ret); + return ret; + } + + rproc->cached_table = NULL; + rproc->table_ptr = table_ptr; + rproc->table_sz = table_sz; + + return 0; +} + /* * Attach to remote processor - similar to rproc_fw_boot() but without * the steps that deal with the firmware image. @@ -1562,6 +1588,12 @@ static int rproc_attach(struct rproc *rproc) return ret; } + ret = rproc_set_rsc_table(rproc); + if (ret) { + dev_err(dev, "can't load resource table: %d\n", ret); + goto disable_iommu; + } + /* reset max_notifyid */ rproc->max_notifyid = -1; diff --git a/drivers/remoteproc/remoteproc_internal.h b/drivers/remoteproc/remoteproc_internal.h index 9ea37aa687d2..a328e634b1de 100644 --- a/drivers/remoteproc/remoteproc_internal.h +++ b/drivers/remoteproc/remoteproc_internal.h @@ -177,6 +177,16 @@ struct resource_table *rproc_find_loaded_rsc_table(struct rproc *rproc, return NULL; } +static inline +struct resource_table *rproc_get_loaded_rsc_table(struct rproc *rproc, + size_t *size) +{ + if (rproc->ops->get_loaded_rsc_table) + return rproc->ops->get_loaded_rsc_table(rproc, size); + + return NULL; +} + static inline bool rproc_u64_fit_in_size_t(u64 val) { diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 9e42e90cd9da..eee338177a3d 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -370,7 +370,9 @@ enum rsc_handling_status { * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled and a * negative value on error * @load_rsc_table: load resource table from firmware image - * @find_loaded_rsc_table: find the loaded resouce table + * @find_loaded_rsc_table: find the loaded resource table from firmware image + * @get_loaded_rsc_table: get resource table installed in memory + * by external entity * @load: load firmware to memory, where the remote processor * expects to find it * @sanity_check: sanity check the fw image @@ -392,6 +394,8 @@ struct rproc_ops { int offset, int avail); struct resource_table *(*find_loaded_rsc_table)( struct rproc *rproc, const struct firmware *fw); + struct resource_table *(*get_loaded_rsc_table)( + struct rproc *rproc, size_t *size); int (*load)(struct rproc *rproc, const struct firmware *fw); int (*sanity_check)(struct rproc *rproc, const struct firmware *fw); u64 (*get_boot_addr)(struct rproc *rproc, const struct firmware *fw); From 8a471396d21ca499d89d4071b2b670258f009ffa Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:42 -0700 Subject: [PATCH 0131/1379] remoteproc: stm32: Move resource table setup to rproc_ops Move the setting of the resource table installed by an external entity to rproc_ops::get_loaded_rsc_table(). This is to support scenarios where a remote processor has been attached to but is detached at a later stage. To re-attach the remote processor, the address of the resource table needs to be available at a later time than the platform driver's probe() function. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-7-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/stm32_rproc.c | 141 +++++++++++++++---------------- 1 file changed, 68 insertions(+), 73 deletions(-) diff --git a/drivers/remoteproc/stm32_rproc.c b/drivers/remoteproc/stm32_rproc.c index ccb3c14a0023..f647e565014b 100644 --- a/drivers/remoteproc/stm32_rproc.c +++ b/drivers/remoteproc/stm32_rproc.c @@ -546,6 +546,73 @@ static void stm32_rproc_kick(struct rproc *rproc, int vqid) } } +static int stm32_rproc_da_to_pa(struct rproc *rproc, + u64 da, phys_addr_t *pa) +{ + struct stm32_rproc *ddata = rproc->priv; + struct device *dev = rproc->dev.parent; + struct stm32_rproc_mem *p_mem; + unsigned int i; + + for (i = 0; i < ddata->nb_rmems; i++) { + p_mem = &ddata->rmems[i]; + + if (da < p_mem->dev_addr || + da >= p_mem->dev_addr + p_mem->size) + continue; + + *pa = da - p_mem->dev_addr + p_mem->bus_addr; + dev_dbg(dev, "da %llx to pa %#x\n", da, *pa); + + return 0; + } + + dev_err(dev, "can't translate da %llx\n", da); + + return -EINVAL; +} + +static struct resource_table * +stm32_rproc_get_loaded_rsc_table(struct rproc *rproc, size_t *table_sz) +{ + struct stm32_rproc *ddata = rproc->priv; + struct device *dev = rproc->dev.parent; + phys_addr_t rsc_pa; + u32 rsc_da; + int err; + + /* The resource table has already been mapped, nothing to do */ + if (ddata->rsc_va) + goto done; + + err = regmap_read(ddata->rsctbl.map, ddata->rsctbl.reg, &rsc_da); + if (err) { + dev_err(dev, "failed to read rsc tbl addr\n"); + return ERR_PTR(-EINVAL); + } + + if (!rsc_da) + /* no rsc table */ + return ERR_PTR(-ENOENT); + + err = stm32_rproc_da_to_pa(rproc, rsc_da, &rsc_pa); + if (err) + return ERR_PTR(err); + + ddata->rsc_va = devm_ioremap_wc(dev, rsc_pa, RSC_TBL_SIZE); + if (IS_ERR_OR_NULL(ddata->rsc_va)) { + dev_err(dev, "Unable to map memory region: %pa+%zx\n", + &rsc_pa, RSC_TBL_SIZE); + ddata->rsc_va = NULL; + return ERR_PTR(-ENOMEM); + } + +done: + /* Assuming the resource table fits in 1kB is fair */ + *table_sz = RSC_TBL_SIZE; + return (struct resource_table *)ddata->rsc_va; +} + static const struct rproc_ops st_rproc_ops = { .start = stm32_rproc_start, .stop = stm32_rproc_stop, @@ -554,6 +621,7 @@ static const struct rproc_ops st_rproc_ops = { .load = rproc_elf_load_segments, .parse_fw = stm32_rproc_parse_fw, .find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table, + .get_loaded_rsc_table = stm32_rproc_get_loaded_rsc_table, .sanity_check = rproc_elf_sanity_check, .get_boot_addr = rproc_elf_get_boot_addr, }; @@ -695,75 +763,6 @@ static int stm32_rproc_get_m4_status(struct stm32_rproc *ddata, return regmap_read(ddata->m4_state.map, ddata->m4_state.reg, state); } -static int stm32_rproc_da_to_pa(struct platform_device *pdev, - struct stm32_rproc *ddata, - u64 da, phys_addr_t *pa) -{ - struct device *dev = &pdev->dev; - struct stm32_rproc_mem *p_mem; - unsigned int i; - - for (i = 0; i < ddata->nb_rmems; i++) { - p_mem = &ddata->rmems[i]; - - if (da < p_mem->dev_addr || - da >= p_mem->dev_addr + p_mem->size) - continue; - - *pa = da - p_mem->dev_addr + p_mem->bus_addr; - dev_dbg(dev, "da %llx to pa %#x\n", da, *pa); - - return 0; - } - - dev_err(dev, "can't translate da %llx\n", da); - - return -EINVAL; -} - -static int stm32_rproc_get_loaded_rsc_table(struct platform_device *pdev, - struct rproc *rproc, - struct stm32_rproc *ddata) -{ - struct device *dev = &pdev->dev; - phys_addr_t rsc_pa; - u32 rsc_da; - int err; - - err = regmap_read(ddata->rsctbl.map, ddata->rsctbl.reg, &rsc_da); - if (err) { - dev_err(dev, "failed to read rsc tbl addr\n"); - return err; - } - - if (!rsc_da) - /* no rsc table */ - return 0; - - err = stm32_rproc_da_to_pa(pdev, ddata, rsc_da, &rsc_pa); - if (err) - return err; - - ddata->rsc_va = devm_ioremap_wc(dev, rsc_pa, RSC_TBL_SIZE); - if (IS_ERR_OR_NULL(ddata->rsc_va)) { - dev_err(dev, "Unable to map memory region: %pa+%zx\n", - &rsc_pa, RSC_TBL_SIZE); - ddata->rsc_va = NULL; - return -ENOMEM; - } - - /* - * The resource table is already loaded in device memory, no need - * to work with a cached table. - */ - rproc->cached_table = NULL; - /* Assuming the resource table fits in 1kB is fair */ - rproc->table_sz = RSC_TBL_SIZE; - rproc->table_ptr = (struct resource_table *)ddata->rsc_va; - - return 0; -} - static int stm32_rproc_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -803,10 +802,6 @@ static int stm32_rproc_probe(struct platform_device *pdev) ret = stm32_rproc_parse_memory_regions(rproc); if (ret) goto free_resources; - - ret = stm32_rproc_get_loaded_rsc_table(pdev, rproc, ddata); - if (ret) - goto free_resources; } rproc->has_iommu = false; From 6e20a05104e55dc0e4899db8110013d521d20a6e Mon Sep 17 00:00:00 2001 From: Arnaud POULIQUEN Date: Fri, 12 Mar 2021 09:24:43 -0700 Subject: [PATCH 0132/1379] remoteproc: stm32: Move memory parsing to rproc_ops Some actions such as memory resources reallocation are needed when trying to reattach a co-processor. Use the prepare() operation for these actions. Co-developed-by: Mathieu Poirier Signed-off-by: Mathieu Poirier Signed-off-by: Arnaud POULIQUEN Link: https://lore.kernel.org/r/20210312162453.1234145-8-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 14 ++++++++++++-- drivers/remoteproc/stm32_rproc.c | 27 ++++++--------------------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index c7e1cf1b6f66..de670f4f592c 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1588,10 +1588,17 @@ static int rproc_attach(struct rproc *rproc) return ret; } + /* Do anything that is needed to boot the remote processor */ + ret = rproc_prepare_device(rproc); + if (ret) { + dev_err(dev, "can't prepare rproc %s: %d\n", rproc->name, ret); + goto disable_iommu; + } + ret = rproc_set_rsc_table(rproc); if (ret) { dev_err(dev, "can't load resource table: %d\n", ret); - goto disable_iommu; + goto unprepare_device; } /* reset max_notifyid */ @@ -1608,7 +1615,7 @@ static int rproc_attach(struct rproc *rproc) ret = rproc_handle_resources(rproc, rproc_loading_handlers); if (ret) { dev_err(dev, "Failed to process resources: %d\n", ret); - goto disable_iommu; + goto unprepare_device; } /* Allocate carveout resources associated to rproc */ @@ -1627,6 +1634,9 @@ static int rproc_attach(struct rproc *rproc) clean_up_resources: rproc_resource_cleanup(rproc); +unprepare_device: + /* release HW resources if needed */ + rproc_unprepare_device(rproc); disable_iommu: rproc_disable_iommu(rproc); return ret; diff --git a/drivers/remoteproc/stm32_rproc.c b/drivers/remoteproc/stm32_rproc.c index f647e565014b..3d45f51de4d0 100644 --- a/drivers/remoteproc/stm32_rproc.c +++ b/drivers/remoteproc/stm32_rproc.c @@ -207,16 +207,7 @@ static int stm32_rproc_mbox_idx(struct rproc *rproc, const unsigned char *name) return -EINVAL; } -static int stm32_rproc_elf_load_rsc_table(struct rproc *rproc, - const struct firmware *fw) -{ - if (rproc_elf_load_rsc_table(rproc, fw)) - dev_warn(&rproc->dev, "no resource table found for this firmware\n"); - - return 0; -} - -static int stm32_rproc_parse_memory_regions(struct rproc *rproc) +static int stm32_rproc_prepare(struct rproc *rproc) { struct device *dev = rproc->dev.parent; struct device_node *np = dev->of_node; @@ -274,12 +265,10 @@ static int stm32_rproc_parse_memory_regions(struct rproc *rproc) static int stm32_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw) { - int ret = stm32_rproc_parse_memory_regions(rproc); + if (rproc_elf_load_rsc_table(rproc, fw)) + dev_warn(&rproc->dev, "no resource table found for this firmware\n"); - if (ret) - return ret; - - return stm32_rproc_elf_load_rsc_table(rproc, fw); + return 0; } static irqreturn_t stm32_rproc_wdg(int irq, void *data) @@ -614,6 +603,7 @@ done: } static const struct rproc_ops st_rproc_ops = { + .prepare = stm32_rproc_prepare, .start = stm32_rproc_start, .stop = stm32_rproc_stop, .attach = stm32_rproc_attach, @@ -796,14 +786,9 @@ static int stm32_rproc_probe(struct platform_device *pdev) if (ret) goto free_rproc; - if (state == M4_STATE_CRUN) { + if (state == M4_STATE_CRUN) rproc->state = RPROC_DETACHED; - ret = stm32_rproc_parse_memory_regions(rproc); - if (ret) - goto free_resources; - } - rproc->has_iommu = false; ddata->workqueue = create_workqueue(dev_name(dev)); if (!ddata->workqueue) { From 7f3bd0c019cb813448d867c17c9b9dad205a13eb Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:44 -0700 Subject: [PATCH 0133/1379] remoteproc: Add new detach() remoteproc operation Add an new detach() operation in order to support scenarios where the remoteproc core is going away but the remote processor is kept operating. This could be the case when the system is rebooted or when the platform driver is removed. Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-9-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index eee338177a3d..2f1f0fbc3994 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -363,6 +363,7 @@ enum rsc_handling_status { * @start: power on the device and boot it * @stop: power off the device * @attach: attach to a device that his already powered up + * @detach: detach from a device, leaving it powered up * @kick: kick a virtqueue (virtqueue id given as a parameter) * @da_to_va: optional platform hook to perform address translations * @parse_fw: parse firmware to extract information (e.g. resource table) @@ -387,6 +388,7 @@ struct rproc_ops { int (*start)(struct rproc *rproc); int (*stop)(struct rproc *rproc); int (*attach)(struct rproc *rproc); + int (*detach)(struct rproc *rproc); void (*kick)(struct rproc *rproc, int vqid); void * (*da_to_va)(struct rproc *rproc, u64 da, size_t len, bool *is_iomem); int (*parse_fw)(struct rproc *rproc, const struct firmware *fw); From 6070203fe43335a02b7fd103bae582095411adc5 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:45 -0700 Subject: [PATCH 0134/1379] remoteproc: Introduce function __rproc_detach() Introduce function __rproc_detach() to perform the same kind of operation as rproc_stop(), but instead of switching off the remote processor using rproc->ops->stop(), it uses rproc->ops->detach(). That way it is possible for the core to release the resources associated with a remote processor while the latter is kept operating. Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-10-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index de670f4f592c..23e9040386c2 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1712,6 +1712,36 @@ static int rproc_stop(struct rproc *rproc, bool crashed) return 0; } +/* + * __rproc_detach(): Does the opposite of __rproc_attach() + */ +static int __maybe_unused __rproc_detach(struct rproc *rproc) +{ + struct device *dev = &rproc->dev; + int ret; + + /* No need to continue if a detach() operation has not been provided */ + if (!rproc->ops->detach) + return -EINVAL; + + /* Stop any subdevices for the remote processor */ + rproc_stop_subdevices(rproc, false); + + /* Tell the remote processor the core isn't available anymore */ + ret = rproc->ops->detach(rproc); + if (ret) { + dev_err(dev, "can't detach from rproc: %d\n", ret); + return ret; + } + + rproc_unprepare_subdevices(rproc); + + rproc->state = RPROC_DETACHED; + + dev_info(dev, "detached remote processor %s\n", rproc->name); + + return 0; +} /** * rproc_trigger_recovery() - recover a remoteproc From d3962a397885518a85d2dc6b0c51e6594f71c30f Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:46 -0700 Subject: [PATCH 0135/1379] remoteproc: Introduce function rproc_detach() Introduce function rproc_detach() to enable the remoteproc core to release the resources associated with a remote processor without stopping its operation. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-11-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 58 +++++++++++++++++++++++++++- include/linux/remoteproc.h | 1 + 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 23e9040386c2..78a36a3723ec 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1715,7 +1715,7 @@ static int rproc_stop(struct rproc *rproc, bool crashed) /* * __rproc_detach(): Does the opposite of __rproc_attach() */ -static int __maybe_unused __rproc_detach(struct rproc *rproc) +static int __rproc_detach(struct rproc *rproc) { struct device *dev = &rproc->dev; int ret; @@ -1954,6 +1954,62 @@ out: } EXPORT_SYMBOL(rproc_shutdown); +/** + * rproc_detach() - Detach the remote processor from the + * remoteproc core + * + * @rproc: the remote processor + * + * Detach a remote processor (previously attached to with rproc_attach()). + * + * In case @rproc is still being used by an additional user(s), then + * this function will just decrement the power refcount and exit, + * without disconnecting the device. + * + * Function rproc_detach() calls __rproc_detach() in order to let a remote + * processor know that services provided by the application processor are + * no longer available. From there it should be possible to remove the + * platform driver and even power cycle the application processor (if the HW + * supports it) without needing to switch off the remote processor. + */ +int rproc_detach(struct rproc *rproc) +{ + struct device *dev = &rproc->dev; + int ret; + + ret = mutex_lock_interruptible(&rproc->lock); + if (ret) { + dev_err(dev, "can't lock rproc %s: %d\n", rproc->name, ret); + return ret; + } + + /* if the remote proc is still needed, bail out */ + if (!atomic_dec_and_test(&rproc->power)) { + ret = 0; + goto out; + } + + ret = __rproc_detach(rproc); + if (ret) { + atomic_inc(&rproc->power); + goto out; + } + + /* clean up all acquired resources */ + rproc_resource_cleanup(rproc); + + /* release HW resources if needed */ + rproc_unprepare_device(rproc); + + rproc_disable_iommu(rproc); + + rproc->table_ptr = NULL; +out: + mutex_unlock(&rproc->lock); + return ret; +} +EXPORT_SYMBOL(rproc_detach); + /** * rproc_get_by_phandle() - find a remote processor by phandle * @phandle: phandle to the rproc diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 2f1f0fbc3994..fc2cca600423 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -664,6 +664,7 @@ rproc_of_resm_mem_entry_init(struct device *dev, u32 of_resm_idx, size_t len, int rproc_boot(struct rproc *rproc); void rproc_shutdown(struct rproc *rproc); +int rproc_detach(struct rproc *rproc); int rproc_set_firmware(struct rproc *rproc, const char *fw_name); void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type); void rproc_coredump_using_sections(struct rproc *rproc); From 9dc9507f1880fb6225e3e058cb5219b152cbf198 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:47 -0700 Subject: [PATCH 0136/1379] remoteproc: Properly deal with the resource table when detaching If it is possible to detach the remote processor, keep an untouched copy of the resource table. That way we can start from the same resource table without having to worry about original values or what elements the startup code has changed when re-attaching to the remote processor. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-12-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 81 ++++++++++++++++++++++++++++ include/linux/remoteproc.h | 3 ++ 2 files changed, 84 insertions(+) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 78a36a3723ec..57c851d22ba6 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1562,6 +1562,24 @@ static int rproc_set_rsc_table(struct rproc *rproc) return ret; } + /* + * If it is possible to detach the remote processor, keep an untouched + * copy of the resource table. That way we can start fresh again when + * the remote processor is re-attached, that is: + * + * DETACHED -> ATTACHED -> DETACHED -> ATTACHED + * + * Free'd in rproc_reset_rsc_table_on_detach() and + * rproc_reset_rsc_table_on_stop(). + */ + if (rproc->ops->detach) { + rproc->clean_table = kmemdup(table_ptr, table_sz, GFP_KERNEL); + if (!rproc->clean_table) + return -ENOMEM; + } else { + rproc->clean_table = NULL; + } + rproc->cached_table = NULL; rproc->table_ptr = table_ptr; rproc->table_sz = table_sz; @@ -1569,6 +1587,59 @@ static int rproc_set_rsc_table(struct rproc *rproc) return 0; } +static int rproc_reset_rsc_table_on_detach(struct rproc *rproc) +{ + struct resource_table *table_ptr; + + /* A resource table was never retrieved, nothing to do here */ + if (!rproc->table_ptr) + return 0; + + /* + * If we made it to this point a clean_table _must_ have been + * allocated in rproc_set_rsc_table(). If one isn't present + * something went really wrong and we must complain. + */ + if (WARN_ON(!rproc->clean_table)) + return -EINVAL; + + /* Remember where the external entity installed the resource table */ + table_ptr = rproc->table_ptr; + + /* + * If we made it here the remote processor was started by another + * entity and a cache table doesn't exist. As such make a copy of + * the resource table currently used by the remote processor and + * use that for the rest of the shutdown process. The memory + * allocated here is free'd in rproc_detach(). + */ + rproc->cached_table = kmemdup(rproc->table_ptr, + rproc->table_sz, GFP_KERNEL); + if (!rproc->cached_table) + return -ENOMEM; + + /* + * Use a copy of the resource table for the remainder of the + * shutdown process. + */ + rproc->table_ptr = rproc->cached_table; + + /* + * Reset the memory area where the firmware loaded the resource table + * to its original value. That way when we re-attach the remote + * processor the resource table is clean and ready to be used again. + */ + memcpy(table_ptr, rproc->clean_table, rproc->table_sz); + + /* + * The clean resource table is no longer needed. Allocated in + * rproc_set_rsc_table(). + */ + kfree(rproc->clean_table); + + return 0; +} + /* * Attach to remote processor - similar to rproc_fw_boot() but without * the steps that deal with the firmware image. @@ -1727,6 +1798,13 @@ static int __rproc_detach(struct rproc *rproc) /* Stop any subdevices for the remote processor */ rproc_stop_subdevices(rproc, false); + /* the installed resource table is no longer accessible */ + ret = rproc_reset_rsc_table_on_detach(rproc); + if (ret) { + dev_err(dev, "can't reset resource table: %d\n", ret); + return ret; + } + /* Tell the remote processor the core isn't available anymore */ ret = rproc->ops->detach(rproc); if (ret) { @@ -2003,6 +2081,9 @@ int rproc_detach(struct rproc *rproc) rproc_disable_iommu(rproc); + /* Free the copy of the resource table */ + kfree(rproc->cached_table); + rproc->cached_table = NULL; rproc->table_ptr = NULL; out: mutex_unlock(&rproc->lock); diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index fc2cca600423..8b795b544f75 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -516,6 +516,8 @@ struct rproc_dump_segment { * @recovery_disabled: flag that state if recovery was disabled * @max_notifyid: largest allocated notify id. * @table_ptr: pointer to the resource table in effect + * @clean_table: copy of the resource table without modifications. Used + * when a remote processor is attached or detached from the core * @cached_table: copy of the resource table * @table_sz: size of @cached_table * @has_iommu: flag to indicate if remote processor is behind an MMU @@ -552,6 +554,7 @@ struct rproc { bool recovery_disabled; int max_notifyid; struct resource_table *table_ptr; + struct resource_table *clean_table; struct resource_table *cached_table; size_t table_sz; bool has_iommu; From 8088dd4d9316964901b13df09a20ee0f917f414d Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:48 -0700 Subject: [PATCH 0137/1379] remoteproc: Properly deal with the resource table when stopping When a remote processor that was attached to is stopped, special care must be taken to make sure the shutdown process is similar to what it would be had it been started by the remoteproc core. This patch takes care of that by making a copy of the resource table currently used by the remote processor. From that point on the copy is used, as if the remote processor had been started by the remoteproc core. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Reported-by: kernel test robot Link: https://lore.kernel.org/r/20210312162453.1234145-13-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 48 +++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 57c851d22ba6..163fad5b95a1 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1640,6 +1640,47 @@ static int rproc_reset_rsc_table_on_detach(struct rproc *rproc) return 0; } +static int rproc_reset_rsc_table_on_stop(struct rproc *rproc) +{ + /* A resource table was never retrieved, nothing to do here */ + if (!rproc->table_ptr) + return 0; + + /* + * If a cache table exists the remote processor was started by + * the remoteproc core. That cache table should be used for + * the rest of the shutdown process. + */ + if (rproc->cached_table) + goto out; + + /* + * If we made it here the remote processor was started by another + * entity and a cache table doesn't exist. As such make a copy of + * the resource table currently used by the remote processor and + * use that for the rest of the shutdown process. The memory + * allocated here is free'd in rproc_shutdown(). + */ + rproc->cached_table = kmemdup(rproc->table_ptr, + rproc->table_sz, GFP_KERNEL); + if (!rproc->cached_table) + return -ENOMEM; + + /* + * Since the remote processor is being switched off the clean table + * won't be needed. Allocated in rproc_set_rsc_table(). + */ + kfree(rproc->clean_table); + +out: + /* + * Use a copy of the resource table for the remainder of the + * shutdown process. + */ + rproc->table_ptr = rproc->cached_table; + return 0; +} + /* * Attach to remote processor - similar to rproc_fw_boot() but without * the steps that deal with the firmware image. @@ -1765,7 +1806,12 @@ static int rproc_stop(struct rproc *rproc, bool crashed) rproc_stop_subdevices(rproc, crashed); /* the installed resource table is no longer accessible */ - rproc->table_ptr = rproc->cached_table; + ret = rproc_reset_rsc_table_on_stop(rproc); + if (ret) { + dev_err(dev, "can't reset resource table: %d\n", ret); + return ret; + } + /* power off the remote processor */ ret = rproc->ops->stop(rproc); From 800dad0025ecb9ca8c885414cab070f8cc40e81e Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:49 -0700 Subject: [PATCH 0138/1379] remoteproc: Properly deal with a kernel panic when attached The panic handler operation of registered remote processors should also be called when remote processors have been attached to. Signed-off-by: Mathieu Poirier Reviewed-by: Peng Fan Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-14-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 163fad5b95a1..4cf6ef7e15b5 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2733,7 +2733,11 @@ static int rproc_panic_handler(struct notifier_block *nb, unsigned long event, rcu_read_lock(); list_for_each_entry_rcu(rproc, &rproc_list, node) { - if (!rproc->ops->panic || rproc->state != RPROC_RUNNING) + if (!rproc->ops->panic) + continue; + + if (rproc->state != RPROC_RUNNING && + rproc->state != RPROC_ATTACHED) continue; d = rproc->ops->panic(rproc); From 83d4e6712c3b1a7dd5b43251737ea3d7d0a460f4 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:50 -0700 Subject: [PATCH 0139/1379] remoteproc: Properly deal with a start request when attached This patch takes into account scenarios where a remote processor has been attached to when receiving a "start" command from sysfs. As with the case with the running state, the command can't be carried out if the remote processor is already in operation. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-15-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_cdev.c | 3 ++- drivers/remoteproc/remoteproc_sysfs.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/remoteproc_cdev.c b/drivers/remoteproc/remoteproc_cdev.c index b19ea3057bde..b2cee9afb41b 100644 --- a/drivers/remoteproc/remoteproc_cdev.c +++ b/drivers/remoteproc/remoteproc_cdev.c @@ -32,7 +32,8 @@ static ssize_t rproc_cdev_write(struct file *filp, const char __user *buf, size_ return -EFAULT; if (!strncmp(cmd, "start", len)) { - if (rproc->state == RPROC_RUNNING) + if (rproc->state == RPROC_RUNNING || + rproc->state == RPROC_ATTACHED) return -EBUSY; ret = rproc_boot(rproc); diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index 4f58be1e13c1..bbfa238e4707 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -194,7 +194,8 @@ static ssize_t state_store(struct device *dev, int ret = 0; if (sysfs_streq(buf, "start")) { - if (rproc->state == RPROC_RUNNING) + if (rproc->state == RPROC_RUNNING || + rproc->state == RPROC_ATTACHED) return -EBUSY; ret = rproc_boot(rproc); From d2008a96833082713094ba8a545141be1b01b266 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:51 -0700 Subject: [PATCH 0140/1379] remoteproc: Properly deal with a stop request when attached Allow a remote processor that was started by another entity to be switched off by the remoteproc core. For that to happen a rproc::ops::stop() operation needs to be available. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-16-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_cdev.c | 3 ++- drivers/remoteproc/remoteproc_core.c | 4 ++++ drivers/remoteproc/remoteproc_sysfs.c | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/remoteproc_cdev.c b/drivers/remoteproc/remoteproc_cdev.c index b2cee9afb41b..0249d8f6c3f8 100644 --- a/drivers/remoteproc/remoteproc_cdev.c +++ b/drivers/remoteproc/remoteproc_cdev.c @@ -38,7 +38,8 @@ static ssize_t rproc_cdev_write(struct file *filp, const char __user *buf, size_ ret = rproc_boot(rproc); } else if (!strncmp(cmd, "stop", len)) { - if (rproc->state != RPROC_RUNNING) + if (rproc->state != RPROC_RUNNING && + rproc->state != RPROC_ATTACHED) return -EINVAL; rproc_shutdown(rproc); diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 4cf6ef7e15b5..626a6b90fba2 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1802,6 +1802,10 @@ static int rproc_stop(struct rproc *rproc, bool crashed) struct device *dev = &rproc->dev; int ret; + /* No need to continue if a stop() operation has not been provided */ + if (!rproc->ops->stop) + return -EINVAL; + /* Stop any subdevices for the remote processor */ rproc_stop_subdevices(rproc, crashed); diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index bbfa238e4707..23c0dc1ddc34 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -202,7 +202,8 @@ static ssize_t state_store(struct device *dev, if (ret) dev_err(&rproc->dev, "Boot failed: %d\n", ret); } else if (sysfs_streq(buf, "stop")) { - if (rproc->state != RPROC_RUNNING) + if (rproc->state != RPROC_RUNNING && + rproc->state != RPROC_ATTACHED) return -EINVAL; rproc_shutdown(rproc); From 5daaeb5f07ed0681b734864dde58dcadab115963 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:52 -0700 Subject: [PATCH 0141/1379] remoteproc: Properly deal with a detach request when attached This patch introduces the capability to detach a remote processor that has been attached to by the remoteproc core. For that to happen a rproc::ops::detach() operation needs to be available. Signed-off-by: Mathieu Poirier Reviewed-by: Arnaud Pouliquen Link: https://lore.kernel.org/r/20210312162453.1234145-17-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_cdev.c | 5 +++++ drivers/remoteproc/remoteproc_sysfs.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/drivers/remoteproc/remoteproc_cdev.c b/drivers/remoteproc/remoteproc_cdev.c index 0249d8f6c3f8..2db494816d5f 100644 --- a/drivers/remoteproc/remoteproc_cdev.c +++ b/drivers/remoteproc/remoteproc_cdev.c @@ -43,6 +43,11 @@ static ssize_t rproc_cdev_write(struct file *filp, const char __user *buf, size_ return -EINVAL; rproc_shutdown(rproc); + } else if (!strncmp(cmd, "detach", len)) { + if (rproc->state != RPROC_ATTACHED) + return -EINVAL; + + ret = rproc_detach(rproc); } else { dev_err(&rproc->dev, "Unrecognized option\n"); ret = -EINVAL; diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c index 23c0dc1ddc34..ea8b89f97d7b 100644 --- a/drivers/remoteproc/remoteproc_sysfs.c +++ b/drivers/remoteproc/remoteproc_sysfs.c @@ -207,6 +207,11 @@ static ssize_t state_store(struct device *dev, return -EINVAL; rproc_shutdown(rproc); + } else if (sysfs_streq(buf, "detach")) { + if (rproc->state != RPROC_ATTACHED) + return -EINVAL; + + ret = rproc_detach(rproc); } else { dev_err(&rproc->dev, "Unrecognised option: %s\n", buf); ret = -EINVAL; From 6e71d2b2a2b717c3bddbe72cdf48dd07d53f8364 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 12 Mar 2021 09:24:53 -0700 Subject: [PATCH 0142/1379] remoteproc: Refactor function rproc_cdev_release() Refactor function rproc_cdev_release() to take into account the current state of the remote processor when choosing the state to transition to. Signed-off-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210312162453.1234145-18-mathieu.poirier@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_cdev.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/remoteproc_cdev.c b/drivers/remoteproc/remoteproc_cdev.c index 2db494816d5f..0b8a84c04f76 100644 --- a/drivers/remoteproc/remoteproc_cdev.c +++ b/drivers/remoteproc/remoteproc_cdev.c @@ -86,11 +86,17 @@ static long rproc_device_ioctl(struct file *filp, unsigned int ioctl, unsigned l static int rproc_cdev_release(struct inode *inode, struct file *filp) { struct rproc *rproc = container_of(inode->i_cdev, struct rproc, cdev); + int ret = 0; - if (rproc->cdev_put_on_release && rproc->state == RPROC_RUNNING) + if (!rproc->cdev_put_on_release) + return 0; + + if (rproc->state == RPROC_RUNNING) rproc_shutdown(rproc); + else if (rproc->state == RPROC_ATTACHED) + ret = rproc_detach(rproc); - return 0; + return ret; } static const struct file_operations rproc_fops = { From 6549f42c3d179575cd1466c4fd65d76680e49fed Mon Sep 17 00:00:00 2001 From: Govind Singh Date: Fri, 29 Jan 2021 00:18:12 +0530 Subject: [PATCH 0143/1379] remoteproc: qcom: wcss: populate hardcoded param using driver data Q6 based WiFi fw loading is supported across different targets, ex: IPQ8074/QCS404. In order to support different fw names/pas id etc, populate hardcoded param using driver data. Signed-off-by: Govind Singh Signed-off-by: Gokul Sriram Palanisamy Link: https://lore.kernel.org/r/1611859695-11824-2-git-send-email-gokulsri@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_wcss.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index 704cd63c9af4..0a37c6e665df 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -71,6 +71,11 @@ #define TCSR_WCSS_CLK_MASK 0x1F #define TCSR_WCSS_CLK_ENABLE 0x14 +struct wcss_data { + const char *firmware_name; + unsigned int crash_reason_smem; +}; + struct q6v5_wcss { struct device *dev; @@ -93,6 +98,8 @@ struct q6v5_wcss { void *mem_region; size_t mem_size; + unsigned int crash_reason_smem; + struct qcom_rproc_glink glink_subdev; struct qcom_rproc_ssr ssr_subdev; }; @@ -438,7 +445,7 @@ static int q6v5_wcss_load(struct rproc *rproc, const struct firmware *fw) return ret; } -static const struct rproc_ops q6v5_wcss_ops = { +static const struct rproc_ops q6v5_wcss_ipq8074_ops = { .start = q6v5_wcss_start, .stop = q6v5_wcss_stop, .da_to_va = q6v5_wcss_da_to_va, @@ -538,12 +545,17 @@ static int q6v5_alloc_memory_region(struct q6v5_wcss *wcss) static int q6v5_wcss_probe(struct platform_device *pdev) { + const struct wcss_data *desc; struct q6v5_wcss *wcss; struct rproc *rproc; int ret; - rproc = rproc_alloc(&pdev->dev, pdev->name, &q6v5_wcss_ops, - "IPQ8074/q6_fw.mdt", sizeof(*wcss)); + desc = device_get_match_data(&pdev->dev); + if (!desc) + return -EINVAL; + + rproc = rproc_alloc(&pdev->dev, pdev->name, &q6v5_wcss_ipq8074_ops, + desc->firmware_name, sizeof(*wcss)); if (!rproc) { dev_err(&pdev->dev, "failed to allocate rproc\n"); return -ENOMEM; @@ -551,6 +563,7 @@ static int q6v5_wcss_probe(struct platform_device *pdev) wcss = rproc->priv; wcss->dev = &pdev->dev; + wcss->crash_reason_smem = desc->crash_reason_smem; ret = q6v5_wcss_init_mmio(wcss, pdev); if (ret) @@ -564,7 +577,8 @@ static int q6v5_wcss_probe(struct platform_device *pdev) if (ret) goto free_rproc; - ret = qcom_q6v5_init(&wcss->q6v5, pdev, rproc, WCSS_CRASH_REASON, NULL); + ret = qcom_q6v5_init(&wcss->q6v5, pdev, rproc, desc->crash_reason_smem, + NULL); if (ret) goto free_rproc; @@ -595,8 +609,13 @@ static int q6v5_wcss_remove(struct platform_device *pdev) return 0; } +static const struct wcss_data wcss_ipq8074_res_init = { + .firmware_name = "IPQ8074/q6_fw.mdt", + .crash_reason_smem = WCSS_CRASH_REASON, +}; + static const struct of_device_id q6v5_wcss_of_match[] = { - { .compatible = "qcom,ipq8074-wcss-pil" }, + { .compatible = "qcom,ipq8074-wcss-pil", .data = &wcss_ipq8074_res_init }, { }, }; MODULE_DEVICE_TABLE(of, q6v5_wcss_of_match); From 34364712fcc48d589e88517395021a14f82fad2e Mon Sep 17 00:00:00 2001 From: Govind Singh Date: Fri, 29 Jan 2021 00:18:13 +0530 Subject: [PATCH 0144/1379] dt-bindings: remoteproc: qcom: Add Q6V5 Modem PIL binding for QCS404 Add a new modem compatible string for Qualcomm QCS404 SoCs Signed-off-by: Govind Singh Signed-off-by: Gokul Sriram Palanisamy Acked-by: Rob Herring Link: https://lore.kernel.org/r/1611859695-11824-3-git-send-email-gokulsri@codeaurora.org Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/remoteproc/qcom,q6v5.txt | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt index 7ccd5534b0ae..69c49c7b2cff 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5.txt @@ -9,6 +9,7 @@ on the Qualcomm Hexagon core. Definition: must be one of: "qcom,q6v5-pil", "qcom,ipq8074-wcss-pil" + "qcom,qcs404-wcss-pil" "qcom,msm8916-mss-pil", "qcom,msm8974-mss-pil" "qcom,msm8996-mss-pil" @@ -39,6 +40,7 @@ on the Qualcomm Hexagon core. string: qcom,q6v5-pil: qcom,ipq8074-wcss-pil: + qcom,qcs404-wcss-pil: qcom,msm8916-mss-pil: qcom,msm8974-mss-pil: must be "wdog", "fatal", "ready", "handover", "stop-ack" @@ -67,6 +69,11 @@ on the Qualcomm Hexagon core. Definition: The clocks needed depend on the compatible string: qcom,ipq8074-wcss-pil: no clock names required + qcom,qcs404-wcss-pil: + must be "xo", "gcc_abhs_cbcr", "gcc_abhs_cbcr", + "gcc_axim_cbcr", "lcc_ahbfabric_cbc", "tcsr_lcc_cbc", + "lcc_abhs_cbc", "lcc_tcm_slave_cbc", "lcc_abhm_cbc", + "lcc_axim_cbc", "lcc_bcr_sleep" qcom,q6v5-pil: qcom,msm8916-mss-pil: qcom,msm8974-mss-pil: @@ -132,6 +139,14 @@ For the compatible string below the following supplies are required: Definition: reference to the regulators to be held on behalf of the booting of the Hexagon core +For the compatible string below the following supplies are required: + "qcom,qcs404-wcss-pil" +- cx-supply: + Usage: required + Value type: + Definition: reference to the regulators to be held on behalf of the + booting of the Hexagon core + For the compatible string below the following supplies are required: "qcom,msm8996-mss-pil" - pll-supply: From 0af65b9b915e52019aee91db3e1f8b39a7ec8d08 Mon Sep 17 00:00:00 2001 From: Govind Singh Date: Fri, 29 Jan 2021 00:18:14 +0530 Subject: [PATCH 0145/1379] remoteproc: qcom: wcss: Add non pas wcss Q6 support for QCS404 Add non PAS WCSS remoteproc driver support for QCS404 SOC. Add WCSS q6 bootup and shutdown sequence handled from Application Processor SubSystem(APSS). Signed-off-by: Govind Singh Signed-off-by: Gokul Sriram Palanisamy Link: https://lore.kernel.org/r/1611859695-11824-4-git-send-email-gokulsri@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_wcss.c | 568 ++++++++++++++++++++++++++-- 1 file changed, 529 insertions(+), 39 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index 0a37c6e665df..d67264c93996 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -4,13 +4,18 @@ * Copyright (C) 2014 Sony Mobile Communications AB * Copyright (c) 2012-2018, The Linux Foundation. All rights reserved. */ +#include +#include +#include #include #include #include #include +#include #include #include #include +#include #include #include #include "qcom_common.h" @@ -24,6 +29,9 @@ #define Q6SS_GFMUX_CTL_REG 0x020 #define Q6SS_PWR_CTL_REG 0x030 #define Q6SS_MEM_PWR_CTL 0x0B0 +#define Q6SS_STRAP_ACC 0x110 +#define Q6SS_CGC_OVERRIDE 0x034 +#define Q6SS_BCR_REG 0x6000 /* AXI Halt Register Offsets */ #define AXI_HALTREQ_REG 0x0 @@ -37,14 +45,19 @@ #define Q6SS_CORE_ARES BIT(1) #define Q6SS_BUS_ARES_ENABLE BIT(2) +/* Q6SS_BRC_RESET */ +#define Q6SS_BRC_BLK_ARES BIT(0) + /* Q6SS_GFMUX_CTL */ #define Q6SS_CLK_ENABLE BIT(1) +#define Q6SS_SWITCH_CLK_SRC BIT(8) /* Q6SS_PWR_CTL */ #define Q6SS_L2DATA_STBY_N BIT(18) #define Q6SS_SLP_RET_N BIT(19) #define Q6SS_CLAMP_IO BIT(20) #define QDSS_BHS_ON BIT(21) +#define QDSS_Q6_MEMORIES GENMASK(15, 0) /* Q6SS parameters */ #define Q6SS_LDO_BYP BIT(25) @@ -53,6 +66,7 @@ #define Q6SS_CLAMP_QMC_MEM BIT(22) #define HALT_CHECK_MAX_LOOPS 200 #define Q6SS_XO_CBCR GENMASK(5, 3) +#define Q6SS_SLEEP_CBCR GENMASK(5, 2) /* Q6SS config/status registers */ #define TCSR_GLOBAL_CFG0 0x0 @@ -71,9 +85,23 @@ #define TCSR_WCSS_CLK_MASK 0x1F #define TCSR_WCSS_CLK_ENABLE 0x14 +#define MAX_HALT_REG 3 +enum { + WCSS_IPQ8074, + WCSS_QCS404, +}; + struct wcss_data { const char *firmware_name; unsigned int crash_reason_smem; + u32 version; + bool aon_reset_required; + bool wcss_q6_reset_required; + const char *ssr_name; + const char *sysmon_name; + int ssctl_id; + const struct rproc_ops *ops; + bool requires_force_stop; }; struct q6v5_wcss { @@ -87,9 +115,26 @@ struct q6v5_wcss { u32 halt_wcss; u32 halt_nc; + struct clk *xo; + struct clk *ahbfabric_cbcr_clk; + struct clk *gcc_abhs_cbcr; + struct clk *gcc_axim_cbcr; + struct clk *lcc_csr_cbcr; + struct clk *ahbs_cbcr; + struct clk *tcm_slave_cbcr; + struct clk *qdsp6ss_abhm_cbcr; + struct clk *qdsp6ss_sleep_cbcr; + struct clk *qdsp6ss_axim_cbcr; + struct clk *qdsp6ss_xo_cbcr; + struct clk *qdsp6ss_core_gfmux; + struct clk *lcc_bcr_sleep; + struct regulator *cx_supply; + struct qcom_sysmon *sysmon; + struct reset_control *wcss_aon_reset; struct reset_control *wcss_reset; struct reset_control *wcss_q6_reset; + struct reset_control *wcss_q6_bcr_reset; struct qcom_q6v5 q6v5; @@ -99,6 +144,8 @@ struct q6v5_wcss { size_t mem_size; unsigned int crash_reason_smem; + u32 version; + bool requires_force_stop; struct qcom_rproc_glink glink_subdev; struct qcom_rproc_ssr ssr_subdev; @@ -244,6 +291,207 @@ wcss_reset: return ret; } +static int q6v5_wcss_qcs404_power_on(struct q6v5_wcss *wcss) +{ + unsigned long val; + int ret, idx; + + /* Toggle the restart */ + reset_control_assert(wcss->wcss_reset); + usleep_range(200, 300); + reset_control_deassert(wcss->wcss_reset); + usleep_range(200, 300); + + /* Enable GCC_WDSP_Q6SS_AHBS_CBCR clock */ + ret = clk_prepare_enable(wcss->gcc_abhs_cbcr); + if (ret) + return ret; + + /* Remove reset to the WCNSS QDSP6SS */ + reset_control_deassert(wcss->wcss_q6_bcr_reset); + + /* Enable Q6SSTOP_AHBFABRIC_CBCR clock */ + ret = clk_prepare_enable(wcss->ahbfabric_cbcr_clk); + if (ret) + goto disable_gcc_abhs_cbcr_clk; + + /* Enable the LCCCSR CBC clock, Q6SSTOP_Q6SSTOP_LCC_CSR_CBCR clock */ + ret = clk_prepare_enable(wcss->lcc_csr_cbcr); + if (ret) + goto disable_ahbfabric_cbcr_clk; + + /* Enable the Q6AHBS CBC, Q6SSTOP_Q6SS_AHBS_CBCR clock */ + ret = clk_prepare_enable(wcss->ahbs_cbcr); + if (ret) + goto disable_csr_cbcr_clk; + + /* Enable the TCM slave CBC, Q6SSTOP_Q6SS_TCM_SLAVE_CBCR clock */ + ret = clk_prepare_enable(wcss->tcm_slave_cbcr); + if (ret) + goto disable_ahbs_cbcr_clk; + + /* Enable the Q6SS AHB master CBC, Q6SSTOP_Q6SS_AHBM_CBCR clock */ + ret = clk_prepare_enable(wcss->qdsp6ss_abhm_cbcr); + if (ret) + goto disable_tcm_slave_cbcr_clk; + + /* Enable the Q6SS AXI master CBC, Q6SSTOP_Q6SS_AXIM_CBCR clock */ + ret = clk_prepare_enable(wcss->qdsp6ss_axim_cbcr); + if (ret) + goto disable_abhm_cbcr_clk; + + /* Enable the Q6SS XO CBC */ + val = readl(wcss->reg_base + Q6SS_XO_CBCR); + val |= BIT(0); + writel(val, wcss->reg_base + Q6SS_XO_CBCR); + /* Read CLKOFF bit to go low indicating CLK is enabled */ + ret = readl_poll_timeout(wcss->reg_base + Q6SS_XO_CBCR, + val, !(val & BIT(31)), 1, + HALT_CHECK_MAX_LOOPS); + if (ret) { + dev_err(wcss->dev, + "xo cbcr enabling timed out (rc:%d)\n", ret); + return ret; + } + + writel(0, wcss->reg_base + Q6SS_CGC_OVERRIDE); + + /* Enable QDSP6 sleep clock clock */ + val = readl(wcss->reg_base + Q6SS_SLEEP_CBCR); + val |= BIT(0); + writel(val, wcss->reg_base + Q6SS_SLEEP_CBCR); + + /* Enable the Enable the Q6 AXI clock, GCC_WDSP_Q6SS_AXIM_CBCR*/ + ret = clk_prepare_enable(wcss->gcc_axim_cbcr); + if (ret) + goto disable_sleep_cbcr_clk; + + /* Assert resets, stop core */ + val = readl(wcss->reg_base + Q6SS_RESET_REG); + val |= Q6SS_CORE_ARES | Q6SS_BUS_ARES_ENABLE | Q6SS_STOP_CORE; + writel(val, wcss->reg_base + Q6SS_RESET_REG); + + /* Program the QDSP6SS PWR_CTL register */ + writel(0x01700000, wcss->reg_base + Q6SS_PWR_CTL_REG); + + writel(0x03700000, wcss->reg_base + Q6SS_PWR_CTL_REG); + + writel(0x03300000, wcss->reg_base + Q6SS_PWR_CTL_REG); + + writel(0x033C0000, wcss->reg_base + Q6SS_PWR_CTL_REG); + + /* + * Enable memories by turning on the QDSP6 memory foot/head switch, one + * bank at a time to avoid in-rush current + */ + for (idx = 28; idx >= 0; idx--) { + writel((readl(wcss->reg_base + Q6SS_MEM_PWR_CTL) | + (1 << idx)), wcss->reg_base + Q6SS_MEM_PWR_CTL); + } + + writel(0x031C0000, wcss->reg_base + Q6SS_PWR_CTL_REG); + writel(0x030C0000, wcss->reg_base + Q6SS_PWR_CTL_REG); + + val = readl(wcss->reg_base + Q6SS_RESET_REG); + val &= ~Q6SS_CORE_ARES; + writel(val, wcss->reg_base + Q6SS_RESET_REG); + + /* Enable the Q6 core clock at the GFM, Q6SSTOP_QDSP6SS_GFMUX_CTL */ + val = readl(wcss->reg_base + Q6SS_GFMUX_CTL_REG); + val |= Q6SS_CLK_ENABLE | Q6SS_SWITCH_CLK_SRC; + writel(val, wcss->reg_base + Q6SS_GFMUX_CTL_REG); + + /* Enable sleep clock branch needed for BCR circuit */ + ret = clk_prepare_enable(wcss->lcc_bcr_sleep); + if (ret) + goto disable_core_gfmux_clk; + + return 0; + +disable_core_gfmux_clk: + val = readl(wcss->reg_base + Q6SS_GFMUX_CTL_REG); + val &= ~(Q6SS_CLK_ENABLE | Q6SS_SWITCH_CLK_SRC); + writel(val, wcss->reg_base + Q6SS_GFMUX_CTL_REG); + clk_disable_unprepare(wcss->gcc_axim_cbcr); +disable_sleep_cbcr_clk: + val = readl(wcss->reg_base + Q6SS_SLEEP_CBCR); + val &= ~Q6SS_CLK_ENABLE; + writel(val, wcss->reg_base + Q6SS_SLEEP_CBCR); + val = readl(wcss->reg_base + Q6SS_XO_CBCR); + val &= ~Q6SS_CLK_ENABLE; + writel(val, wcss->reg_base + Q6SS_XO_CBCR); + clk_disable_unprepare(wcss->qdsp6ss_axim_cbcr); +disable_abhm_cbcr_clk: + clk_disable_unprepare(wcss->qdsp6ss_abhm_cbcr); +disable_tcm_slave_cbcr_clk: + clk_disable_unprepare(wcss->tcm_slave_cbcr); +disable_ahbs_cbcr_clk: + clk_disable_unprepare(wcss->ahbs_cbcr); +disable_csr_cbcr_clk: + clk_disable_unprepare(wcss->lcc_csr_cbcr); +disable_ahbfabric_cbcr_clk: + clk_disable_unprepare(wcss->ahbfabric_cbcr_clk); +disable_gcc_abhs_cbcr_clk: + clk_disable_unprepare(wcss->gcc_abhs_cbcr); + + return ret; +} + +static inline int q6v5_wcss_qcs404_reset(struct q6v5_wcss *wcss) +{ + unsigned long val; + + writel(0x80800000, wcss->reg_base + Q6SS_STRAP_ACC); + + /* Start core execution */ + val = readl(wcss->reg_base + Q6SS_RESET_REG); + val &= ~Q6SS_STOP_CORE; + writel(val, wcss->reg_base + Q6SS_RESET_REG); + + return 0; +} + +static int q6v5_qcs404_wcss_start(struct rproc *rproc) +{ + struct q6v5_wcss *wcss = rproc->priv; + int ret; + + ret = clk_prepare_enable(wcss->xo); + if (ret) + return ret; + + ret = regulator_enable(wcss->cx_supply); + if (ret) + goto disable_xo_clk; + + qcom_q6v5_prepare(&wcss->q6v5); + + ret = q6v5_wcss_qcs404_power_on(wcss); + if (ret) { + dev_err(wcss->dev, "wcss clk_enable failed\n"); + goto disable_cx_supply; + } + + writel(rproc->bootaddr >> 4, wcss->reg_base + Q6SS_RST_EVB); + + q6v5_wcss_qcs404_reset(wcss); + + ret = qcom_q6v5_wait_for_start(&wcss->q6v5, 5 * HZ); + if (ret == -ETIMEDOUT) { + dev_err(wcss->dev, "start timed out\n"); + goto disable_cx_supply; + } + + return 0; + +disable_cx_supply: + regulator_disable(wcss->cx_supply); +disable_xo_clk: + clk_disable_unprepare(wcss->xo); + + return ret; +} + static void q6v5_wcss_halt_axi_port(struct q6v5_wcss *wcss, struct regmap *halt_map, u32 offset) @@ -278,6 +526,70 @@ static void q6v5_wcss_halt_axi_port(struct q6v5_wcss *wcss, regmap_write(halt_map, offset + AXI_HALTREQ_REG, 0); } +static int q6v5_qcs404_wcss_shutdown(struct q6v5_wcss *wcss) +{ + unsigned long val; + int ret; + + q6v5_wcss_halt_axi_port(wcss, wcss->halt_map, wcss->halt_wcss); + + /* assert clamps to avoid MX current inrush */ + val = readl(wcss->reg_base + Q6SS_PWR_CTL_REG); + val |= (Q6SS_CLAMP_IO | Q6SS_CLAMP_WL | Q6SS_CLAMP_QMC_MEM); + writel(val, wcss->reg_base + Q6SS_PWR_CTL_REG); + + /* Disable memories by turning off memory foot/headswitch */ + writel((readl(wcss->reg_base + Q6SS_MEM_PWR_CTL) & + ~QDSS_Q6_MEMORIES), + wcss->reg_base + Q6SS_MEM_PWR_CTL); + + /* Clear the BHS_ON bit */ + val = readl(wcss->reg_base + Q6SS_PWR_CTL_REG); + val &= ~Q6SS_BHS_ON; + writel(val, wcss->reg_base + Q6SS_PWR_CTL_REG); + + clk_disable_unprepare(wcss->ahbfabric_cbcr_clk); + clk_disable_unprepare(wcss->lcc_csr_cbcr); + clk_disable_unprepare(wcss->tcm_slave_cbcr); + clk_disable_unprepare(wcss->qdsp6ss_abhm_cbcr); + clk_disable_unprepare(wcss->qdsp6ss_axim_cbcr); + + val = readl(wcss->reg_base + Q6SS_SLEEP_CBCR); + val &= ~BIT(0); + writel(val, wcss->reg_base + Q6SS_SLEEP_CBCR); + + val = readl(wcss->reg_base + Q6SS_XO_CBCR); + val &= ~BIT(0); + writel(val, wcss->reg_base + Q6SS_XO_CBCR); + + clk_disable_unprepare(wcss->ahbs_cbcr); + clk_disable_unprepare(wcss->lcc_bcr_sleep); + + val = readl(wcss->reg_base + Q6SS_GFMUX_CTL_REG); + val &= ~(Q6SS_CLK_ENABLE | Q6SS_SWITCH_CLK_SRC); + writel(val, wcss->reg_base + Q6SS_GFMUX_CTL_REG); + + clk_disable_unprepare(wcss->gcc_abhs_cbcr); + + ret = reset_control_assert(wcss->wcss_reset); + if (ret) { + dev_err(wcss->dev, "wcss_reset failed\n"); + return ret; + } + usleep_range(200, 300); + + ret = reset_control_deassert(wcss->wcss_reset); + if (ret) { + dev_err(wcss->dev, "wcss_reset failed\n"); + return ret; + } + usleep_range(200, 300); + + clk_disable_unprepare(wcss->gcc_axim_cbcr); + + return 0; +} + static int q6v5_wcss_powerdown(struct q6v5_wcss *wcss) { int ret; @@ -397,20 +709,28 @@ static int q6v5_wcss_stop(struct rproc *rproc) int ret; /* WCSS powerdown */ - ret = qcom_q6v5_request_stop(&wcss->q6v5, NULL); - if (ret == -ETIMEDOUT) { - dev_err(wcss->dev, "timed out on wait\n"); - return ret; + if (wcss->requires_force_stop) { + ret = qcom_q6v5_request_stop(&wcss->q6v5, NULL); + if (ret == -ETIMEDOUT) { + dev_err(wcss->dev, "timed out on wait\n"); + return ret; + } } - ret = q6v5_wcss_powerdown(wcss); - if (ret) - return ret; + if (wcss->version == WCSS_QCS404) { + ret = q6v5_qcs404_wcss_shutdown(wcss); + if (ret) + return ret; + } else { + ret = q6v5_wcss_powerdown(wcss); + if (ret) + return ret; - /* Q6 Power down */ - ret = q6v5_q6_powerdown(wcss); - if (ret) - return ret; + /* Q6 Power down */ + ret = q6v5_q6_powerdown(wcss); + if (ret) + return ret; + } qcom_q6v5_unprepare(&wcss->q6v5); @@ -453,14 +773,26 @@ static const struct rproc_ops q6v5_wcss_ipq8074_ops = { .get_boot_addr = rproc_elf_get_boot_addr, }; -static int q6v5_wcss_init_reset(struct q6v5_wcss *wcss) +static const struct rproc_ops q6v5_wcss_qcs404_ops = { + .start = q6v5_qcs404_wcss_start, + .stop = q6v5_wcss_stop, + .da_to_va = q6v5_wcss_da_to_va, + .load = q6v5_wcss_load, + .get_boot_addr = rproc_elf_get_boot_addr, + .parse_fw = qcom_register_dump_segments, +}; + +static int q6v5_wcss_init_reset(struct q6v5_wcss *wcss, + const struct wcss_data *desc) { struct device *dev = wcss->dev; - wcss->wcss_aon_reset = devm_reset_control_get(dev, "wcss_aon_reset"); - if (IS_ERR(wcss->wcss_aon_reset)) { - dev_err(wcss->dev, "unable to acquire wcss_aon_reset\n"); - return PTR_ERR(wcss->wcss_aon_reset); + if (desc->aon_reset_required) { + wcss->wcss_aon_reset = devm_reset_control_get(dev, "wcss_aon_reset"); + if (IS_ERR(wcss->wcss_aon_reset)) { + dev_err(wcss->dev, "fail to acquire wcss_aon_reset\n"); + return PTR_ERR(wcss->wcss_aon_reset); + } } wcss->wcss_reset = devm_reset_control_get(dev, "wcss_reset"); @@ -469,10 +801,18 @@ static int q6v5_wcss_init_reset(struct q6v5_wcss *wcss) return PTR_ERR(wcss->wcss_reset); } - wcss->wcss_q6_reset = devm_reset_control_get(dev, "wcss_q6_reset"); - if (IS_ERR(wcss->wcss_q6_reset)) { - dev_err(wcss->dev, "unable to acquire wcss_q6_reset\n"); - return PTR_ERR(wcss->wcss_q6_reset); + if (desc->wcss_q6_reset_required) { + wcss->wcss_q6_reset = devm_reset_control_get(dev, "wcss_q6_reset"); + if (IS_ERR(wcss->wcss_q6_reset)) { + dev_err(wcss->dev, "unable to acquire wcss_q6_reset\n"); + return PTR_ERR(wcss->wcss_q6_reset); + } + } + + wcss->wcss_q6_bcr_reset = devm_reset_control_get_exclusive(dev, "wcss_q6_bcr_reset"); + if (IS_ERR(wcss->wcss_q6_bcr_reset)) { + dev_err(wcss->dev, "unable to acquire wcss_q6_bcr_reset\n"); + return PTR_ERR(wcss->wcss_q6_bcr_reset); } return 0; @@ -481,35 +821,48 @@ static int q6v5_wcss_init_reset(struct q6v5_wcss *wcss) static int q6v5_wcss_init_mmio(struct q6v5_wcss *wcss, struct platform_device *pdev) { - struct of_phandle_args args; + unsigned int halt_reg[MAX_HALT_REG] = {0}; + struct device_node *syscon; struct resource *res; int ret; res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qdsp6"); - wcss->reg_base = devm_ioremap_resource(&pdev->dev, res); + wcss->reg_base = devm_ioremap(&pdev->dev, res->start, + resource_size(res)); if (IS_ERR(wcss->reg_base)) return PTR_ERR(wcss->reg_base); - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rmb"); - wcss->rmb_base = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(wcss->rmb_base)) - return PTR_ERR(wcss->rmb_base); + if (wcss->version == WCSS_IPQ8074) { + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rmb"); + wcss->rmb_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(wcss->rmb_base)) + return PTR_ERR(wcss->rmb_base); + } - ret = of_parse_phandle_with_fixed_args(pdev->dev.of_node, - "qcom,halt-regs", 3, 0, &args); + syscon = of_parse_phandle(pdev->dev.of_node, + "qcom,halt-regs", 0); + if (!syscon) { + dev_err(&pdev->dev, "failed to parse qcom,halt-regs\n"); + return -EINVAL; + } + + wcss->halt_map = syscon_node_to_regmap(syscon); + of_node_put(syscon); + if (IS_ERR(wcss->halt_map)) + return PTR_ERR(wcss->halt_map); + + ret = of_property_read_variable_u32_array(pdev->dev.of_node, + "qcom,halt-regs", + halt_reg, 0, + MAX_HALT_REG); if (ret < 0) { dev_err(&pdev->dev, "failed to parse qcom,halt-regs\n"); return -EINVAL; } - wcss->halt_map = syscon_node_to_regmap(args.np); - of_node_put(args.np); - if (IS_ERR(wcss->halt_map)) - return PTR_ERR(wcss->halt_map); - - wcss->halt_q6 = args.args[0]; - wcss->halt_wcss = args.args[1]; - wcss->halt_nc = args.args[2]; + wcss->halt_q6 = halt_reg[0]; + wcss->halt_wcss = halt_reg[1]; + wcss->halt_nc = halt_reg[2]; return 0; } @@ -543,6 +896,107 @@ static int q6v5_alloc_memory_region(struct q6v5_wcss *wcss) return 0; } +static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) +{ + int ret; + + wcss->xo = devm_clk_get(wcss->dev, "xo"); + if (IS_ERR(wcss->xo)) { + ret = PTR_ERR(wcss->xo); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get xo clock"); + return ret; + } + + wcss->gcc_abhs_cbcr = devm_clk_get(wcss->dev, "gcc_abhs_cbcr"); + if (IS_ERR(wcss->gcc_abhs_cbcr)) { + ret = PTR_ERR(wcss->gcc_abhs_cbcr); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get gcc abhs clock"); + return PTR_ERR(wcss->gcc_abhs_cbcr); + } + + wcss->gcc_axim_cbcr = devm_clk_get(wcss->dev, "gcc_axim_cbcr"); + if (IS_ERR(wcss->gcc_axim_cbcr)) { + ret = PTR_ERR(wcss->gcc_axim_cbcr); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get gcc axim clock\n"); + return PTR_ERR(wcss->gcc_axim_cbcr); + } + + wcss->ahbfabric_cbcr_clk = devm_clk_get(wcss->dev, + "lcc_ahbfabric_cbc"); + if (IS_ERR(wcss->ahbfabric_cbcr_clk)) { + ret = PTR_ERR(wcss->ahbfabric_cbcr_clk); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get ahbfabric clock\n"); + return PTR_ERR(wcss->ahbfabric_cbcr_clk); + } + + wcss->lcc_csr_cbcr = devm_clk_get(wcss->dev, "tcsr_lcc_cbc"); + if (IS_ERR(wcss->lcc_csr_cbcr)) { + ret = PTR_ERR(wcss->lcc_csr_cbcr); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get csr cbcr clk\n"); + return PTR_ERR(wcss->lcc_csr_cbcr); + } + + wcss->ahbs_cbcr = devm_clk_get(wcss->dev, + "lcc_abhs_cbc"); + if (IS_ERR(wcss->ahbs_cbcr)) { + ret = PTR_ERR(wcss->ahbs_cbcr); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get ahbs_cbcr clk\n"); + return PTR_ERR(wcss->ahbs_cbcr); + } + + wcss->tcm_slave_cbcr = devm_clk_get(wcss->dev, + "lcc_tcm_slave_cbc"); + if (IS_ERR(wcss->tcm_slave_cbcr)) { + ret = PTR_ERR(wcss->tcm_slave_cbcr); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get tcm cbcr clk\n"); + return PTR_ERR(wcss->tcm_slave_cbcr); + } + + wcss->qdsp6ss_abhm_cbcr = devm_clk_get(wcss->dev, "lcc_abhm_cbc"); + if (IS_ERR(wcss->qdsp6ss_abhm_cbcr)) { + ret = PTR_ERR(wcss->qdsp6ss_abhm_cbcr); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get abhm cbcr clk\n"); + return PTR_ERR(wcss->qdsp6ss_abhm_cbcr); + } + + wcss->qdsp6ss_axim_cbcr = devm_clk_get(wcss->dev, "lcc_axim_cbc"); + if (IS_ERR(wcss->qdsp6ss_axim_cbcr)) { + ret = PTR_ERR(wcss->qdsp6ss_axim_cbcr); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get axim cbcr clk\n"); + return PTR_ERR(wcss->qdsp6ss_abhm_cbcr); + } + + wcss->lcc_bcr_sleep = devm_clk_get(wcss->dev, "lcc_bcr_sleep"); + if (IS_ERR(wcss->lcc_bcr_sleep)) { + ret = PTR_ERR(wcss->lcc_bcr_sleep); + if (ret != -EPROBE_DEFER) + dev_err(wcss->dev, "failed to get bcr cbcr clk\n"); + return PTR_ERR(wcss->lcc_bcr_sleep); + } + + return 0; +} + +static int q6v5_wcss_init_regulator(struct q6v5_wcss *wcss) +{ + wcss->cx_supply = devm_regulator_get(wcss->dev, "cx"); + if (IS_ERR(wcss->cx_supply)) + return PTR_ERR(wcss->cx_supply); + + regulator_set_load(wcss->cx_supply, 100000); + + return 0; +} + static int q6v5_wcss_probe(struct platform_device *pdev) { const struct wcss_data *desc; @@ -554,7 +1008,7 @@ static int q6v5_wcss_probe(struct platform_device *pdev) if (!desc) return -EINVAL; - rproc = rproc_alloc(&pdev->dev, pdev->name, &q6v5_wcss_ipq8074_ops, + rproc = rproc_alloc(&pdev->dev, pdev->name, desc->ops, desc->firmware_name, sizeof(*wcss)); if (!rproc) { dev_err(&pdev->dev, "failed to allocate rproc\n"); @@ -563,7 +1017,10 @@ static int q6v5_wcss_probe(struct platform_device *pdev) wcss = rproc->priv; wcss->dev = &pdev->dev; - wcss->crash_reason_smem = desc->crash_reason_smem; + wcss->version = desc->version; + + wcss->version = desc->version; + wcss->requires_force_stop = desc->requires_force_stop; ret = q6v5_wcss_init_mmio(wcss, pdev); if (ret) @@ -573,7 +1030,17 @@ static int q6v5_wcss_probe(struct platform_device *pdev) if (ret) goto free_rproc; - ret = q6v5_wcss_init_reset(wcss); + if (wcss->version == WCSS_QCS404) { + ret = q6v5_wcss_init_clock(wcss); + if (ret) + goto free_rproc; + + ret = q6v5_wcss_init_regulator(wcss); + if (ret) + goto free_rproc; + } + + ret = q6v5_wcss_init_reset(wcss, desc); if (ret) goto free_rproc; @@ -585,6 +1052,11 @@ static int q6v5_wcss_probe(struct platform_device *pdev) qcom_add_glink_subdev(rproc, &wcss->glink_subdev, "q6wcss"); qcom_add_ssr_subdev(rproc, &wcss->ssr_subdev, "q6wcss"); + if (desc->ssctl_id) + wcss->sysmon = qcom_add_sysmon_subdev(rproc, + desc->sysmon_name, + desc->ssctl_id); + ret = rproc_add(rproc); if (ret) goto free_rproc; @@ -612,10 +1084,28 @@ static int q6v5_wcss_remove(struct platform_device *pdev) static const struct wcss_data wcss_ipq8074_res_init = { .firmware_name = "IPQ8074/q6_fw.mdt", .crash_reason_smem = WCSS_CRASH_REASON, + .aon_reset_required = true, + .wcss_q6_reset_required = true, + .ops = &q6v5_wcss_ipq8074_ops, + .requires_force_stop = true, +}; + +static const struct wcss_data wcss_qcs404_res_init = { + .crash_reason_smem = WCSS_CRASH_REASON, + .firmware_name = "wcnss.mdt", + .version = WCSS_QCS404, + .aon_reset_required = false, + .wcss_q6_reset_required = false, + .ssr_name = "mpss", + .sysmon_name = "wcnss", + .ssctl_id = 0x12, + .ops = &q6v5_wcss_qcs404_ops, + .requires_force_stop = false, }; static const struct of_device_id q6v5_wcss_of_match[] = { { .compatible = "qcom,ipq8074-wcss-pil", .data = &wcss_ipq8074_res_init }, + { .compatible = "qcom,qcs404-wcss-pil", .data = &wcss_qcs404_res_init }, { }, }; MODULE_DEVICE_TABLE(of, q6v5_wcss_of_match); From bb91c9ee518cd7353f2301f4bd6b65ea42a750d4 Mon Sep 17 00:00:00 2001 From: Govind Singh Date: Fri, 29 Jan 2021 00:18:15 +0530 Subject: [PATCH 0146/1379] remoteproc: qcom: wcss: explicitly request exclusive reset control Use request exclusive reset control for wcss reset controls. Signed-off-by: Govind Singh Signed-off-by: Gokul Sriram Palanisamy Link: https://lore.kernel.org/r/1611859695-11824-5-git-send-email-gokulsri@codeaurora.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_wcss.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index d67264c93996..71ec1a451e35 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -788,21 +788,21 @@ static int q6v5_wcss_init_reset(struct q6v5_wcss *wcss, struct device *dev = wcss->dev; if (desc->aon_reset_required) { - wcss->wcss_aon_reset = devm_reset_control_get(dev, "wcss_aon_reset"); + wcss->wcss_aon_reset = devm_reset_control_get_exclusive(dev, "wcss_aon_reset"); if (IS_ERR(wcss->wcss_aon_reset)) { dev_err(wcss->dev, "fail to acquire wcss_aon_reset\n"); return PTR_ERR(wcss->wcss_aon_reset); } } - wcss->wcss_reset = devm_reset_control_get(dev, "wcss_reset"); + wcss->wcss_reset = devm_reset_control_get_exclusive(dev, "wcss_reset"); if (IS_ERR(wcss->wcss_reset)) { dev_err(wcss->dev, "unable to acquire wcss_reset\n"); return PTR_ERR(wcss->wcss_reset); } if (desc->wcss_q6_reset_required) { - wcss->wcss_q6_reset = devm_reset_control_get(dev, "wcss_q6_reset"); + wcss->wcss_q6_reset = devm_reset_control_get_exclusive(dev, "wcss_q6_reset"); if (IS_ERR(wcss->wcss_q6_reset)) { dev_err(wcss->dev, "unable to acquire wcss_q6_reset\n"); return PTR_ERR(wcss->wcss_q6_reset); From 48073935b9a4f820733937bd40a74c1c389caee6 Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 11 Mar 2021 16:24:41 -0800 Subject: [PATCH 0147/1379] remoteproc: qcom: wcnss: Allow specifying firmware-name Introduce a firmware-name property, in order to be able to support device/platform specific firmware for the wireless connectivity subsystem; in line with other Qualcomm remoteproc drivers. Acked-by: Mathieu Poirier Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20210312002441.3273183-1-bjorn.andersson@linaro.org Signed-off-by: Bjorn Andersson --- .../devicetree/bindings/remoteproc/qcom,wcnss-pil.txt | 6 ++++++ drivers/remoteproc/qcom_wcnss.c | 8 +++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,wcnss-pil.txt b/Documentation/devicetree/bindings/remoteproc/qcom,wcnss-pil.txt index da09c0d79ac0..a83080b8905c 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,wcnss-pil.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,wcnss-pil.txt @@ -34,6 +34,12 @@ on the Qualcomm WCNSS core. Definition: should be "wdog", "fatal", optionally followed by "ready", "handover", "stop-ack" +- firmware-name: + Usage: optional + Value type: + Definition: must list the relative firmware image path for the + WCNSS core. Defaults to "wcnss.mdt". + - vddmx-supply: (deprecated for qcom,pronto-v1/2-pil) - vddcx-supply: (deprecated for qcom,pronto-v1/2-pil) - vddpx-supply: diff --git a/drivers/remoteproc/qcom_wcnss.c b/drivers/remoteproc/qcom_wcnss.c index 3a131163064c..5f3455aa7e0e 100644 --- a/drivers/remoteproc/qcom_wcnss.c +++ b/drivers/remoteproc/qcom_wcnss.c @@ -530,6 +530,7 @@ static int wcnss_alloc_memory_region(struct qcom_wcnss *wcnss) static int wcnss_probe(struct platform_device *pdev) { + const char *fw_name = WCNSS_FIRMWARE_NAME; const struct wcnss_data *data; struct qcom_wcnss *wcnss; struct resource *res; @@ -547,8 +548,13 @@ static int wcnss_probe(struct platform_device *pdev) return -ENXIO; } + ret = of_property_read_string(pdev->dev.of_node, "firmware-name", + &fw_name); + if (ret < 0 && ret != -EINVAL) + return ret; + rproc = rproc_alloc(&pdev->dev, pdev->name, &wcnss_ops, - WCNSS_FIRMWARE_NAME, sizeof(*wcnss)); + fw_name, sizeof(*wcnss)); if (!rproc) { dev_err(&pdev->dev, "unable to allocate remoteproc\n"); return -ENOMEM; From e20044f7e9ae2b5395ca3ae9bd0907fdf43357a7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:01 -0400 Subject: [PATCH 0148/1379] ring-buffer: Separate out internal use of ring_buffer_event_time_stamp() The exported use of ring_buffer_event_time_stamp() is going to become different than how it is used internally. Move the internal logic out into a static function called rb_event_time_stamp(), and have the internal callers call that instead. Link: https://lkml.kernel.org/r/20210316164113.257790481@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 41 +++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 68744c51517e..941ac2021b97 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -287,6 +287,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define TS_MASK ((1ULL << TS_SHIFT) - 1) #define TS_DELTA_TEST (~TS_MASK) +static u64 rb_event_time_stamp(struct ring_buffer_event *event) +{ + u64 ts; + + ts = event->array[0]; + ts <<= TS_SHIFT; + ts += event->time_delta; + + return ts; +} + /** * ring_buffer_event_time_stamp - return the event's extended timestamp * @event: the event to get the timestamp of @@ -299,13 +310,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); */ u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) { - u64 ts; - - ts = event->array[0]; - ts <<= TS_SHIFT; - ts += event->time_delta; - - return ts; + return rb_event_time_stamp(event); } /* Flag when events were overwritten */ @@ -2766,7 +2771,7 @@ static u64 rb_time_delta(struct ring_buffer_event *event) return 0; case RINGBUF_TYPE_TIME_EXTEND: - return ring_buffer_event_time_stamp(event); + return rb_event_time_stamp(event); case RINGBUF_TYPE_TIME_STAMP: return 0; @@ -3212,13 +3217,13 @@ static void dump_buffer_page(struct buffer_data_page *bpage, switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts += delta; pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); break; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts = delta; pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); break; @@ -3289,12 +3294,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, switch (event->type_len) { case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts += delta; break; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); ts = delta; break; @@ -4256,12 +4261,12 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); cpu_buffer->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); cpu_buffer->read_stamp = delta; return; @@ -4286,12 +4291,12 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, return; case RINGBUF_TYPE_TIME_EXTEND: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); iter->read_stamp += delta; return; case RINGBUF_TYPE_TIME_STAMP: - delta = ring_buffer_event_time_stamp(event); + delta = rb_event_time_stamp(event); iter->read_stamp = delta; return; @@ -4544,7 +4549,7 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, case RINGBUF_TYPE_TIME_STAMP: if (ts) { - *ts = ring_buffer_event_time_stamp(event); + *ts = rb_event_time_stamp(event); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } @@ -4635,7 +4640,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) case RINGBUF_TYPE_TIME_STAMP: if (ts) { - *ts = ring_buffer_event_time_stamp(event); + *ts = rb_event_time_stamp(event); ring_buffer_normalize_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu, ts); } From 8672e4948d0c44272cc05f8ff563dbf6b6c1289f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:02 -0400 Subject: [PATCH 0149/1379] ring-buffer: Add a event_stamp to cpu_buffer for each level of nesting Add a place to save the current event time stamp for each level of nesting. This will be used to retrieve the time stamp of the current event before it is committed. Link: https://lkml.kernel.org/r/20210316164113.399089673@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 941ac2021b97..470d97169081 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -492,6 +492,8 @@ struct rb_time_struct { #endif typedef struct rb_time_struct rb_time_t; +#define MAX_NEST 5 + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -529,6 +531,7 @@ struct ring_buffer_per_cpu { unsigned long read_bytes; rb_time_t write_stamp; rb_time_t before_stamp; + u64 event_stamp[MAX_NEST]; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; @@ -2715,6 +2718,10 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, { unsigned length = info->length; u64 delta = info->delta; + unsigned int nest = local_read(&cpu_buffer->committing) - 1; + + if (nest < MAX_NEST) + cpu_buffer->event_stamp[nest] = info->ts; /* * If we need to add a timestamp, then we @@ -3456,7 +3463,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->after, ts)) { /* Nothing came after this event between C and E */ info->delta = ts - info->after; - info->ts = ts; } else { /* * Interrupted between C and E: @@ -3468,6 +3474,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ info->delta = 0; } + info->ts = ts; info->add_timestamp &= ~RB_ADD_STAMP_FORCE; } @@ -5026,6 +5033,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) rb_time_set(&cpu_buffer->write_stamp, 0); rb_time_set(&cpu_buffer->before_stamp, 0); + memset(cpu_buffer->event_stamp, 0, sizeof(cpu_buffer->event_stamp)); + cpu_buffer->lost_events = 0; cpu_buffer->last_overrun = 0; From b47e330231acbf4506b049643145cc64268a1940 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:03 -0400 Subject: [PATCH 0150/1379] tracing: Pass buffer of event to trigger operations The ring_buffer_event_time_stamp() is going to be updated to extract the time stamp for the event without needing it to be set to have absolute values for all events. But to do so, it needs the buffer that the event is on as the buffer saves information for the event before it is committed to the buffer. If the trace buffer is disabled, a temporary buffer is used, and there's no access to this buffer from the current histogram triggers, even though it is passed to the trace event code. Pass the buffer that the event is on all the way down to the histogram triggers. Link: https://lkml.kernel.org/r/20210316164113.542448131@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 5 +- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 4 +- kernel/trace/trace_events_hist.c | 92 +++++++++++++++++++---------- kernel/trace/trace_events_trigger.c | 45 ++++++++------ 5 files changed, 95 insertions(+), 53 deletions(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 28e7af1406f2..8cba64ce23a4 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -640,7 +640,8 @@ enum event_trigger_type { extern int filter_match_preds(struct event_filter *filter, void *rec); extern enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec, +event_triggers_call(struct trace_event_file *file, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event); extern void event_triggers_post_call(struct trace_event_file *file, @@ -664,7 +665,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { if (eflags & EVENT_FILE_FL_TRIGGER_MODE) - event_triggers_call(file, NULL, NULL); + event_triggers_call(file, NULL, NULL, NULL); if (eflags & EVENT_FILE_FL_SOFT_DISABLED) return true; if (eflags & EVENT_FILE_FL_PID_FILTER) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eccb4e1187cc..f979220238a5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6763,7 +6763,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tr->trace_marker_file && !list_empty(&tr->trace_marker_file->triggers)) { /* do not add \n before testing triggers, but add \0 */ entry->buf[cnt] = '\0'; - tt = event_triggers_call(tr->trace_marker_file, entry, event); + tt = event_triggers_call(tr->trace_marker_file, buffer, entry, event); } if (entry->buf[cnt - 1] != '\n') { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a6446c03cfbc..798773178d7e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1329,7 +1329,7 @@ __event_trigger_test_discard(struct trace_event_file *file, unsigned long eflags = file->flags; if (eflags & EVENT_FILE_FL_TRIGGER_COND) - *tt = event_triggers_call(file, entry, event); + *tt = event_triggers_call(file, buffer, entry, event); if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && @@ -1626,7 +1626,7 @@ extern int register_trigger_hist_enable_disable_cmds(void); */ struct event_trigger_ops { void (*func)(struct event_trigger_data *data, - void *rec, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 39ebe1826fc3..6978aa3ee4c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -81,6 +81,7 @@ struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event); @@ -153,6 +154,7 @@ struct hist_field { static u64 hist_field_none(struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -161,6 +163,7 @@ static u64 hist_field_none(struct hist_field *field, static u64 hist_field_counter(struct hist_field *field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -169,6 +172,7 @@ static u64 hist_field_counter(struct hist_field *field, static u64 hist_field_string(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -179,6 +183,7 @@ static u64 hist_field_string(struct hist_field *hist_field, static u64 hist_field_dynstring(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -191,6 +196,7 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, static u64 hist_field_pstring(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -201,52 +207,56 @@ static u64 hist_field_pstring(struct hist_field *hist_field, static u64 hist_field_log2(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, elt, rbe, event); + u64 val = operand->fn(operand, elt, buffer, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } static u64 hist_field_plus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, rbe, event); - u64 val2 = operand2->fn(operand2, elt, rbe, event); + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); return val1 + val2; } static u64 hist_field_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, rbe, event); - u64 val2 = operand2->fn(operand2, elt, rbe, event); + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); return val1 - val2; } static u64 hist_field_unary_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { struct hist_field *operand = hist_field->operands[0]; - s64 sval = (s64)operand->fn(operand, elt, rbe, event); + s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event); u64 val = (u64)-sval; return val; @@ -255,6 +265,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field, #define DEFINE_HIST_FIELD_FN(type) \ static u64 hist_field_##type(struct hist_field *hist_field, \ struct tracing_map_elt *elt, \ + struct trace_buffer *buffer, \ struct ring_buffer_event *rbe, \ void *event) \ { \ @@ -380,7 +391,8 @@ struct hist_trigger_data { struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals); @@ -608,7 +620,8 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, } static void action_trace(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { @@ -624,6 +637,7 @@ struct hist_var_data { static u64 hist_field_timestamp(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -640,6 +654,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field, static u64 hist_field_cpu(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -1020,6 +1035,7 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, static u64 hist_field_var_ref(struct hist_field *hist_field, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *event) { @@ -2561,6 +2577,7 @@ find_target_event_var(struct hist_trigger_data *hist_data, } static inline void __update_field_vars(struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *rec, struct field_var **field_vars, @@ -2576,7 +2593,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, struct hist_field *var = field_var->var; struct hist_field *val = field_var->val; - var_val = val->fn(val, elt, rbe, rec); + var_val = val->fn(val, elt, buffer, rbe, rec); var_idx = var->var.idx; if (val->flags & HIST_FIELD_FL_STRING) { @@ -2592,19 +2609,21 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, static void update_field_vars(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, + struct trace_buffer *buffer, struct ring_buffer_event *rbe, void *rec) { - __update_field_vars(elt, rbe, rec, hist_data->field_vars, + __update_field_vars(elt, buffer, rbe, rec, hist_data->field_vars, hist_data->n_field_vars, 0); } static void save_track_data_vars(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { - __update_field_vars(elt, rbe, rec, hist_data->save_vars, + __update_field_vars(elt, buffer, rbe, rec, hist_data->save_vars, hist_data->n_save_vars, hist_data->n_field_var_str); } @@ -2780,12 +2799,14 @@ static void save_track_val(struct hist_trigger_data *hist_data, } static void save_track_data(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { if (data->track_data.save_data) - data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + data->track_data.save_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); } static bool check_track_val(struct tracing_map_elt *elt, @@ -2836,7 +2857,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) } static void save_track_data_snapshot(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) @@ -2905,7 +2927,8 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) return false; } static void save_track_data_snapshot(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) {} @@ -2947,7 +2970,8 @@ static void track_data_print(struct seq_file *m, } static void ontrack_action(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { @@ -2955,7 +2979,8 @@ static void ontrack_action(struct hist_trigger_data *hist_data, if (check_track_val(elt, data, var_val)) { save_track_val(hist_data, elt, data, var_val); - save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + save_track_data(hist_data, elt, buffer, rec, rbe, + key, data, var_ref_vals); } } @@ -4400,7 +4425,8 @@ create_hist_data(unsigned int map_bits, } static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, u64 *var_ref_vals) { @@ -4414,7 +4440,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, elt, rbe, rec); + hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; @@ -4442,13 +4468,13 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_key_field(i, hist_data) { hist_field = hist_data->fields[i]; if (hist_field->flags & HIST_FIELD_FL_VAR) { - hist_val = hist_field->fn(hist_field, elt, rbe, rec); + hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); } } - update_field_vars(hist_data, elt, rbe, rec); + update_field_vars(hist_data, elt, buffer, rbe, rec); } static inline void add_to_key(char *compound_key, void *key, @@ -4478,7 +4504,8 @@ static inline void add_to_key(char *compound_key, void *key, static void hist_trigger_actions(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe, void *key, u64 *var_ref_vals) { @@ -4487,11 +4514,12 @@ hist_trigger_actions(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->n_actions; i++) { data = hist_data->actions[i]; - data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals); + data->fn(hist_data, elt, buffer, rec, rbe, key, data, var_ref_vals); } } -static void event_hist_trigger(struct event_trigger_data *data, void *rec, +static void event_hist_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *rbe) { struct hist_trigger_data *hist_data = data->private_data; @@ -4516,7 +4544,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, HIST_STACKTRACE_SKIP); key = entries; } else { - field_contents = key_field->fn(key_field, elt, rbe, rec); + field_contents = key_field->fn(key_field, elt, buffer, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; @@ -4539,10 +4567,10 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, if (!elt) return; - hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); + hist_trigger_elt_update(hist_data, elt, buffer, rec, rbe, var_ref_vals); if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals); + hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -5812,7 +5840,8 @@ __init int register_trigger_hist_cmd(void) } static void -hist_enable_trigger(struct event_trigger_data *data, void *rec, +hist_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -5830,7 +5859,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec, } static void -hist_enable_count_trigger(struct event_trigger_data *data, void *rec, +hist_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -5839,7 +5869,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - hist_enable_trigger(data, rec, event); + hist_enable_trigger(data, buffer, rec, event); } static struct event_trigger_ops hist_enable_trigger_ops = { diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f725802160c0..b8bfa8505b7b 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -53,7 +53,8 @@ void trigger_data_free(struct event_trigger_data *data) * any trigger that should be deferred, ETT_NONE if nothing to defer. */ enum event_trigger_type -event_triggers_call(struct trace_event_file *file, void *rec, +event_triggers_call(struct trace_event_file *file, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct event_trigger_data *data; @@ -67,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, void *rec, if (data->paused) continue; if (!rec) { - data->ops->func(data, rec, event); + data->ops->func(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -77,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, void *rec, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, rec, event); + data->ops->func(data, buffer, rec, event); } return tt; } @@ -105,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, NULL, NULL); + data->ops->func(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -937,7 +938,8 @@ get_named_trigger_data(struct event_trigger_data *data) } static void -traceon_trigger(struct event_trigger_data *data, void *rec, +traceon_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (tracing_is_on()) @@ -947,7 +949,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec, } static void -traceon_count_trigger(struct event_trigger_data *data, void *rec, +traceon_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (tracing_is_on()) @@ -963,7 +966,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec, } static void -traceoff_trigger(struct event_trigger_data *data, void *rec, +traceoff_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!tracing_is_on()) @@ -973,7 +977,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec, } static void -traceoff_count_trigger(struct event_trigger_data *data, void *rec, +traceoff_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!tracing_is_on()) @@ -1071,7 +1076,8 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data, void *rec, +snapshot_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct trace_event_file *file = data->private_data; @@ -1083,7 +1089,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec, } static void -snapshot_count_trigger(struct event_trigger_data *data, void *rec, +snapshot_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -1092,7 +1099,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - snapshot_trigger(data, rec, event); + snapshot_trigger(data, buffer, rec, event); } static int @@ -1176,14 +1183,16 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif static void -stacktrace_trigger(struct event_trigger_data *data, void *rec, +stacktrace_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data, void *rec, +stacktrace_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { if (!data->count) @@ -1192,7 +1201,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - stacktrace_trigger(data, rec, event); + stacktrace_trigger(data, buffer, rec, event); } static int @@ -1254,7 +1263,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) } static void -event_enable_trigger(struct event_trigger_data *data, void *rec, +event_enable_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1266,7 +1276,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec, } static void -event_enable_count_trigger(struct event_trigger_data *data, void *rec, +event_enable_count_trigger(struct event_trigger_data *data, + struct trace_buffer *buffer, void *rec, struct ring_buffer_event *event) { struct enable_trigger_data *enable_data = data->private_data; @@ -1281,7 +1292,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec, if (data->count != -1) (data->count)--; - event_enable_trigger(data, rec, event); + event_enable_trigger(data, buffer, rec, event); } int event_enable_trigger_print(struct seq_file *m, From efe6196a6bc5bbc84b856316c4687fd24566a95c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:04 -0400 Subject: [PATCH 0151/1379] ring-buffer: Allow ring_buffer_event_time_stamp() to return time stamp of all events Currently, ring_buffer_event_time_stamp() only returns an accurate time stamp of the event if it has an absolute extended time stamp attached to it. To make it more robust, use the event_stamp() in case the event does not have an absolute value attached to it. This will allow ring_buffer_event_time_stamp() to be used in more cases than just histograms, and it will also allow histograms to not require including absolute values all the time. Link: https://lkml.kernel.org/r/20210316164113.704830885@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 3 +- kernel/trace/ring_buffer.c | 60 ++++++++++++++++++++++++-------- kernel/trace/trace_events_hist.c | 2 +- 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 136ea0997e6d..057b7ed4fe24 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -61,7 +61,8 @@ enum ring_buffer_type { unsigned ring_buffer_event_length(struct ring_buffer_event *event); void *ring_buffer_event_data(struct ring_buffer_event *event); -u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event); +u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, + struct ring_buffer_event *event); /* * ring_buffer_discard_commit will remove an event that has not diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 470d97169081..8fa2a84f714f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -298,21 +298,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) return ts; } -/** - * ring_buffer_event_time_stamp - return the event's extended timestamp - * @event: the event to get the timestamp of - * - * Returns the extended timestamp associated with a data event. - * An extended time_stamp is a 64-bit timestamp represented - * internally in a special way that makes the best use of space - * contained within a ring buffer event. This function decodes - * it and maps it to a straight u64 value. - */ -u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event) -{ - return rb_event_time_stamp(event); -} - /* Flag when events were overwritten */ #define RB_MISSED_EVENTS (1 << 31) /* Missed count stored at end */ @@ -757,6 +742,51 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) } #endif +static inline u64 rb_time_stamp(struct trace_buffer *buffer); + +/** + * ring_buffer_event_time_stamp - return the event's current time stamp + * @buffer: The buffer that the event is on + * @event: the event to get the time stamp of + * + * Note, this must be called after @event is reserved, and before it is + * committed to the ring buffer. And must be called from the same + * context where the event was reserved (normal, softirq, irq, etc). + * + * Returns the time stamp associated with the current event. + * If the event has an extended time stamp, then that is used as + * the time stamp to return. + * In the highly unlikely case that the event was nested more than + * the max nesting, then the write_stamp of the buffer is returned, + * otherwise current time is returned, but that really neither of + * the last two cases should ever happen. + */ +u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[smp_processor_id()]; + unsigned int nest; + u64 ts; + + /* If the event includes an absolute time, then just use that */ + if (event->type_len == RINGBUF_TYPE_TIME_STAMP) + return rb_event_time_stamp(event); + + /* Read the current saved nesting level time stamp */ + nest = local_read(&cpu_buffer->committing) - 1; + if (likely(nest < MAX_NEST)) + return cpu_buffer->event_stamp[nest]; + + WARN_ON_ONCE(1); + + /* Can only fail on 32 bit */ + if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) + /* Screw it, just read the current time */ + ts = rb_time_stamp(cpu_buffer->buffer); + + return ts; +} + /** * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 6978aa3ee4c5..45986cb4637e 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -644,7 +644,7 @@ static u64 hist_field_timestamp(struct hist_field *hist_field, struct hist_trigger_data *hist_data = hist_field->hist_data; struct trace_array *tr = hist_data->event_file->tr; - u64 ts = ring_buffer_event_time_stamp(rbe); + u64 ts = ring_buffer_event_time_stamp(buffer, rbe); if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr)) ts = ns2usecs(ts); From b94bc80df64823e676b506f8de7dcf6a688d681e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:05 -0400 Subject: [PATCH 0152/1379] tracing: Use a no_filter_buffering_ref to stop using the filter buffer Currently, the trace histograms relies on it using absolute time stamps to trigger the tracing to not use the temp buffer if filters are set. That's because the histograms need the full timestamp that is saved in the ring buffer. That is no longer the case, as the ring_buffer_event_time_stamp() can now return the time stamp for all events without all triggering a full absolute time stamp. Now that the absolute time stamp is an unrelated dependency to not using the filters. There's nothing about having absolute timestamps to keep from using the filter buffer. Instead, change the interface to explicitly state to disable filter buffering that the histogram logic can use. Link: https://lkml.kernel.org/r/20210316164113.847886563@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 28 ++++++++++++---------------- kernel/trace/trace.h | 4 ++-- kernel/trace/trace_events_hist.c | 6 +++--- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f979220238a5..b15436ff85e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2737,12 +2737,13 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, unsigned int trace_ctx) { struct ring_buffer_event *entry; + struct trace_array *tr = trace_file->tr; int val; - *current_rb = trace_file->tr->array_buffer.buffer; + *current_rb = tr->array_buffer.buffer; - if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags & - (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && + if (!tr->no_filter_buffering_ref && + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); @@ -6971,31 +6972,26 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) return ret; } -int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs) +/* + * Set or disable using the per CPU trace_buffer_event when possible. + */ +int tracing_set_filter_buffering(struct trace_array *tr, bool set) { int ret = 0; mutex_lock(&trace_types_lock); - if (abs && tr->time_stamp_abs_ref++) + if (set && tr->no_filter_buffering_ref++) goto out; - if (!abs) { - if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) { + if (!set) { + if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) { ret = -EINVAL; goto out; } - if (--tr->time_stamp_abs_ref) - goto out; + --tr->no_filter_buffering_ref; } - - ring_buffer_set_time_stamp_abs(tr->array_buffer.buffer, abs); - -#ifdef CONFIG_TRACER_MAX_TRACE - if (tr->max_buffer.buffer) - ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs); -#endif out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 798773178d7e..f2a7a72825c7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -352,7 +352,7 @@ struct trace_array { /* function tracing enabled */ int function_enabled; #endif - int time_stamp_abs_ref; + int no_filter_buffering_ref; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; @@ -372,7 +372,7 @@ extern int tracing_check_open_get_tr(struct trace_array *tr); extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); -extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); +extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 45986cb4637e..c1abd63f1d6c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5484,7 +5484,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, goto out; } - tracing_set_time_stamp_abs(file->tr, true); + tracing_set_filter_buffering(file->tr, true); } if (named_data) @@ -5592,7 +5592,7 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, if (hist_data->enable_timestamps) { if (!hist_data->remove || unregistered) - tracing_set_time_stamp_abs(file->tr, false); + tracing_set_filter_buffering(file->tr, false); } } @@ -5639,7 +5639,7 @@ static void hist_unreg_all(struct trace_event_file *file) update_cond_flag(file); if (hist_data->enable_timestamps) - tracing_set_time_stamp_abs(file->tr, false); + tracing_set_filter_buffering(file->tr, false); if (test->ops->free) test->ops->free(test->ops, test); } From a948c69d6fb1ba749a958a8a87d4eecdda28989d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:06 -0400 Subject: [PATCH 0153/1379] ring-buffer: Add verifier for using ring_buffer_event_time_stamp() The ring_buffer_event_time_stamp() must be only called by an event that has not been committed yet, and is on the buffer that is passed in. This was used to help debug converting the histogram logic over to using the new time stamp code, and was proven to be very useful. Add a verifier that can check that this is the case, and extra WARN_ONs to catch unexpected use cases. Link: https://lkml.kernel.org/r/20210316164113.987294354@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 56 +++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8fa2a84f714f..1c61a8cd7b99 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -742,6 +742,48 @@ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) } #endif +/* + * Enable this to make sure that the event passed to + * ring_buffer_event_time_stamp() is not committed and also + * is on the buffer that it passed in. + */ +//#define RB_VERIFY_EVENT +#ifdef RB_VERIFY_EVENT +static struct list_head *rb_list_head(struct list_head *list); +static void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ + struct buffer_page *page = cpu_buffer->commit_page; + struct buffer_page *tail_page = READ_ONCE(cpu_buffer->tail_page); + struct list_head *next; + long commit, write; + unsigned long addr = (unsigned long)event; + bool done = false; + int stop = 0; + + /* Make sure the event exists and is not committed yet */ + do { + if (page == tail_page || WARN_ON_ONCE(stop++ > 100)) + done = true; + commit = local_read(&page->page->commit); + write = local_read(&page->write); + if (addr >= (unsigned long)&page->page->data[commit] && + addr < (unsigned long)&page->page->data[write]) + return; + + next = rb_list_head(page->list.next); + page = list_entry(next, struct buffer_page, list); + } while (!done); + WARN_ON_ONCE(1); +} +#else +static inline void verify_event(struct ring_buffer_per_cpu *cpu_buffer, + void *event) +{ +} +#endif + + static inline u64 rb_time_stamp(struct trace_buffer *buffer); /** @@ -772,13 +814,19 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer, if (event->type_len == RINGBUF_TYPE_TIME_STAMP) return rb_event_time_stamp(event); + nest = local_read(&cpu_buffer->committing); + verify_event(cpu_buffer, event); + if (WARN_ON_ONCE(!nest)) + goto fail; + /* Read the current saved nesting level time stamp */ - nest = local_read(&cpu_buffer->committing) - 1; - if (likely(nest < MAX_NEST)) + if (likely(--nest < MAX_NEST)) return cpu_buffer->event_stamp[nest]; - WARN_ON_ONCE(1); + /* Shouldn't happen, warn if it does */ + WARN_ONCE(1, "nest (%d) greater than max", nest); + fail: /* Can only fail on 32 bit */ if (!rb_time_read(&cpu_buffer->write_stamp, &ts)) /* Screw it, just read the current time */ @@ -2750,7 +2798,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, u64 delta = info->delta; unsigned int nest = local_read(&cpu_buffer->committing) - 1; - if (nest < MAX_NEST) + if (!WARN_ON_ONCE(nest >= MAX_NEST)) cpu_buffer->event_stamp[nest] = info->ts; /* From d8279bfc5e9598682f657606d3830ab65932cfe4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 16 Mar 2021 12:41:07 -0400 Subject: [PATCH 0154/1379] tracing: Add tracing_event_time_stamp() API Add a tracing_event_time_stamp() API that checks if the event passed in is not on the ring buffer but a pointer to the per CPU trace_buffered_event which does not have its time stamp set yet. If it is a pointer to the trace_buffered_event, then just return the current time stamp that the ring buffer would produce. Otherwise, return the time stamp from the event. Link: https://lkml.kernel.org/r/20210316164114.131996180@goodmis.org Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 8 ++++++++ kernel/trace/trace.h | 1 + 2 files changed, 9 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b15436ff85e8..90ae3140756e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6972,6 +6972,14 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) return ret; } +u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) +{ + if (rbe == this_cpu_read(trace_buffered_event)) + return ring_buffer_time_stamp(buffer, smp_processor_id()); + + return ring_buffer_event_time_stamp(buffer, rbe); +} + /* * Set or disable using the per CPU trace_buffer_event when possible. */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f2a7a72825c7..0d8f54f49a3a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -372,6 +372,7 @@ extern int tracing_check_open_get_tr(struct trace_array *tr); extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); +extern u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe); extern int tracing_set_filter_buffering(struct trace_array *tr, bool set); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); From 2b7d2fe76f9c844af6f150d0f7a76c62dcfe7679 Mon Sep 17 00:00:00 2001 From: Cao jin Date: Thu, 11 Mar 2021 16:52:13 +0800 Subject: [PATCH 0155/1379] bootconfig: Update prototype of setup_boot_config() Parameter "cmdline" has no use, drop it. Link: https://lkml.kernel.org/r/20210311085213.27680-1-jojing64@gmail.com Acked-by: Masami Hiramatsu Signed-off-by: Cao jin Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/init/main.c b/init/main.c index 53b278845b88..407976d8669e 100644 --- a/init/main.c +++ b/init/main.c @@ -405,7 +405,7 @@ static int __init bootconfig_params(char *param, char *val, return 0; } -static void __init setup_boot_config(const char *cmdline) +static void __init setup_boot_config(void) { static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; const char *msg; @@ -472,7 +472,7 @@ static void __init setup_boot_config(const char *cmdline) #else -static void __init setup_boot_config(const char *cmdline) +static void __init setup_boot_config(void) { /* Remove bootconfig data from initrd */ get_boot_config_from_initrd(NULL, NULL); @@ -872,7 +872,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) pr_notice("%s", linux_banner); early_security_init(); setup_arch(&command_line); - setup_boot_config(command_line); + setup_boot_config(); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); From 421d9d1bea6545543c00ffba4c83f369510de9a1 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Mon, 8 Mar 2021 02:24:59 +0000 Subject: [PATCH 0156/1379] tools/latency-collector: Remove unneeded semicolon Fix semicolon.cocci warning: tools/tracing/latency/latency-collector.c:1021:2-3: Unneeded semicolon Link: https://lkml.kernel.org/r/20210308022459.59881-1-vulab@iscas.ac.cn Reviewed-by: Viktor Rosendahl Signed-off-by: Xu Wang Signed-off-by: Steven Rostedt (VMware) --- tools/tracing/latency/latency-collector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c index b69de9263ee6..3a2e6bb781a8 100644 --- a/tools/tracing/latency/latency-collector.c +++ b/tools/tracing/latency/latency-collector.c @@ -1018,7 +1018,7 @@ static long go_to_sleep(const struct entry *req) cond_timedwait(&printstate.cond, &printstate.mutex, &future); if (time_has_passed(&future)) break; - }; + } if (printstate_has_new_req_arrived(req)) delay = -1; @@ -1941,7 +1941,7 @@ static void scan_arguments(int argc, char *argv[]) if (value < 0) { warnx("TIME must be >= 0\n"); show_usage(); - ; + exit(0); } trace_enable = true; use_random_sleep = true; From e0196ae732343adfe8d854d88b3c0aae9595152f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 11 Mar 2021 09:40:22 +0000 Subject: [PATCH 0157/1379] ftrace: Fix spelling mistake "disabed" -> "disabled" There is a spelling mistake in a comment, fix it. Link: https://lkml.kernel.org/r/20210311094022.5978-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- arch/csky/kernel/probes/ftrace.c | 2 +- arch/riscv/kernel/probes/ftrace.c | 2 +- arch/x86/kernel/kprobes/ftrace.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index ae2b1c7b3b5c..ef2bb9bd9605 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -9,7 +9,7 @@ int arch_check_ftrace_location(struct kprobe *p) return 0; } -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c index e6372490aa0b..c666e5ecb8da 100644 --- a/arch/riscv/kernel/probes/ftrace.c +++ b/arch/riscv/kernel/probes/ftrace.c @@ -2,7 +2,7 @@ #include -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *regs) { diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 373e5fa3ce1f..51c7f5271aee 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -12,7 +12,7 @@ #include "common.h" -/* Ftrace callback handler for kprobes -- called under preepmt disabed */ +/* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { From 5013f454a352cce8e62162976026a9c472595e42 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 25 Feb 2021 16:51:23 -0500 Subject: [PATCH 0158/1379] tracing: Add check of trace event print fmts for dereferencing pointers Trace events record data into the ring buffer at the time of the event. The trace event has a printf logic to display the recorded data at a much later time when the user reads the trace file. This makes using dereferencing pointers unsafe if the dereferenced pointer points to the original source. The safe way to handle this is to create an array within the trace event and copy the source into the array. Then the dereference pointer may point to that array. As this is a easy mistake to make, a check is added to examine all trace event print fmts to make sure that they are safe to use. This only checks the various %p* dereferenced pointers like %pB, %pR, etc. It does not handle dereferencing of strings, as there are some use cases that are OK to dereference the source. That will be dealt with differently. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 210 ++++++++++++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a3563afd412d..f58106eaf8cb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -217,6 +217,214 @@ int trace_event_get_offsets(struct trace_event_call *call) return tail->offset + tail->size; } +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. + */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field = call->class->fields_array; + const char *array_descriptor; + const char *p = fmt; + int len; + + if (!(len = str_has_prefix(fmt, "REC->"))) + return false; + fmt += len; + for (p = fmt; *p; p++) { + if (!isalnum(*p) && *p != '_') + break; + } + len = p - fmt; + + for (; field->type; field++) { + if (strncmp(field->name, fmt, len) || + field->name[len]) + continue; + array_descriptor = strchr(field->type, '['); + /* This is an array and is OK to dereference. */ + return array_descriptor != NULL; + } + return false; +} + +/* + * Examine the print fmt of the event looking for unsafe dereference + * pointers using %p* that could be recorded in the trace event and + * much later referenced after the pointer was freed. Dereferencing + * pointers are OK, if it is dereferenced into the event itself. + */ +static void test_event_printk(struct trace_event_call *call) +{ + u64 dereference_flags = 0; + bool first = true; + const char *fmt, *c, *r, *a; + int parens = 0; + char in_quote = 0; + int start_arg = 0; + int arg = 0; + int i; + + fmt = call->print_fmt; + + if (!fmt) + return; + + for (i = 0; fmt[i]; i++) { + switch (fmt[i]) { + case '\\': + i++; + if (!fmt[i]) + return; + continue; + case '"': + case '\'': + /* + * The print fmt starts with a string that + * is processed first to find %p* usage, + * then after the first string, the print fmt + * contains arguments that are used to check + * if the dereferenced %p* usage is safe. + */ + if (first) { + if (fmt[i] == '\'') + continue; + if (in_quote) { + arg = 0; + first = false; + /* + * If there was no %p* uses + * the fmt is OK. + */ + if (!dereference_flags) + return; + } + } + if (in_quote) { + if (in_quote == fmt[i]) + in_quote = 0; + } else { + in_quote = fmt[i]; + } + continue; + case '%': + if (!first || !in_quote) + continue; + i++; + if (!fmt[i]) + return; + switch (fmt[i]) { + case '%': + continue; + case 'p': + /* Find dereferencing fields */ + switch (fmt[i + 1]) { + case 'B': case 'R': case 'r': + case 'b': case 'M': case 'm': + case 'I': case 'i': case 'E': + case 'U': case 'V': case 'N': + case 'a': case 'd': case 'D': + case 'g': case 't': case 'C': + case 'O': case 'f': + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + } + break; + default: + { + bool star = false; + int j; + + /* Increment arg if %*s exists. */ + for (j = 0; fmt[i + j]; j++) { + if (isdigit(fmt[i + j]) || + fmt[i + j] == '.') + continue; + if (fmt[i + j] == '*') { + star = true; + continue; + } + if ((fmt[i + j] == 's') && star) + arg++; + break; + } + break; + } /* default */ + + } /* switch */ + arg++; + continue; + case '(': + if (in_quote) + continue; + parens++; + continue; + case ')': + if (in_quote) + continue; + parens--; + if (WARN_ONCE(parens < 0, + "Paren mismatch for event: %s\narg='%s'\n%*s", + trace_event_name(call), + fmt + start_arg, + (i - start_arg) + 5, "^")) + return; + continue; + case ',': + if (in_quote || parens) + continue; + i++; + while (isspace(fmt[i])) + i++; + start_arg = i; + if (!(dereference_flags & (1ULL << arg))) + goto next_arg; + + /* Find the REC-> in the argument */ + c = strchr(fmt + i, ','); + r = strstr(fmt + i, "REC->"); + if (r && (!c || r < c)) { + /* + * Addresses of events on the buffer, + * or an array on the buffer is + * OK to dereference. + * There's ways to fool this, but + * this is to catch common mistakes, + * not malicious code. + */ + a = strchr(fmt + i, '&'); + if ((a && (a < r)) || test_field(r, call)) + dereference_flags &= ~(1ULL << arg); + } + next_arg: + i--; + arg++; + } + } + + /* + * If you triggered the below warning, the trace event reported + * uses an unsafe dereference pointer %p*. As the data stored + * at the trace event time may no longer exist when the trace + * event is printed, dereferencing to the original source is + * unsafe. The source of the dereference must be copied into the + * event itself, and the dereference must access the copy instead. + */ + if (WARN_ON_ONCE(dereference_flags)) { + arg = 1; + while (!(dereference_flags & 1)) { + dereference_flags >>= 1; + arg++; + } + pr_warn("event %s has unsafe dereference of argument %d\n", + trace_event_name(call), arg); + pr_warn("print_fmt: %s\n", fmt); + } +} + int trace_event_raw_init(struct trace_event_call *call) { int id; @@ -225,6 +433,8 @@ int trace_event_raw_init(struct trace_event_call *call) if (!id) return -ENODEV; + test_event_printk(call); + return 0; } EXPORT_SYMBOL_GPL(trace_event_raw_init); From f2616c772c768485de18e7fcb2816bcdcd098339 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 17 Mar 2021 13:34:35 -0400 Subject: [PATCH 0159/1379] seq_buf: Add seq_buf_terminate() API In the case that the seq_buf buffer needs to be printed directly, add a way to make sure that the buffer is safe to read by forcing a nul terminating character at the end of the string, or the last byte of the buffer if the string has overflowed. Signed-off-by: Steven Rostedt (VMware) --- include/linux/seq_buf.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 9d6c28cc4d8f..5b31c5147969 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -71,6 +71,31 @@ static inline unsigned int seq_buf_used(struct seq_buf *s) return min(s->len, s->size); } +/** + * seq_buf_terminate - Make sure buffer is nul terminated + * @s: the seq_buf descriptor to terminate. + * + * This makes sure that the buffer in @s is nul terminated and + * safe to read as a string. + * + * Note, if this is called when the buffer has overflowed, then + * the last byte of the buffer is zeroed, and the len will still + * point passed it. + * + * After this function is called, s->buffer is safe to use + * in string operations. + */ +static inline void seq_buf_terminate(struct seq_buf *s) +{ + if (WARN_ON(s->size == 0)) + return; + + if (seq_buf_buffer_left(s)) + s->buffer[s->len] = 0; + else + s->buffer[s->size - 1] = 0; +} + /** * seq_buf_get_buf - get buffer to write arbitrary data to * @s: the seq_buf handle From 9a6944fee68e25084130386c608c5ac8db487581 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 25 Feb 2021 22:00:57 -0500 Subject: [PATCH 0160/1379] tracing: Add a verifier to check string pointers for trace events It is a common mistake for someone writing a trace event to save a pointer to a string in the TP_fast_assign() and then display that string pointer in the TP_printk() with %s. The problem is that those two events may happen a long time apart, where the source of the string may no longer exist. The proper way to handle displaying any string that is not guaranteed to be in the kernel core rodata section, is to copy it into the ring buffer via the __string(), __assign_str() and __get_str() helper macros. Add a check at run time while displaying the TP_printk() of events to make sure that every %s referenced is safe to dereference, and if it is not, trigger a warning and only show the address of the pointer, and the dereferenced string if it can be safely retrieved with a strncpy_from_kernel_nofault() call. In order to not have to copy the parsing of vsnprintf() formats, or even exporting its code, the verifier relies on vsnprintf() being able to modify the va_list that is passed to it, and it remains modified after it is called. This is the case for some architectures like x86_64, but other architectures like x86_32 pass the va_list to vsnprintf() as a value not a reference, and the verifier can not use it to parse the non string arguments. Thus, at boot up, it is checked if vsnprintf() modifies the passed in va_list or not, and a static branch will disable the verifier if it's not compatible. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 200 ++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 3 + kernel/trace/trace_output.c | 2 +- kernel/trace/trace_printk.c | 11 ++ 4 files changed, 215 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 90ae3140756e..e32f5a49f1cf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3558,6 +3558,204 @@ static char *trace_iter_expand_format(struct trace_iterator *iter) return tmp; } +/* Returns true if the string is safe to dereference from an event */ +static bool trace_safe_str(struct trace_iterator *iter, const char *str) +{ + unsigned long addr = (unsigned long)str; + struct trace_event *trace_event; + struct trace_event_call *event; + + /* OK if part of the event data */ + if ((addr >= (unsigned long)iter->ent) && + (addr < (unsigned long)iter->ent + iter->ent_size)) + return true; + + /* OK if part of the temp seq buffer */ + if ((addr >= (unsigned long)iter->tmp_seq.buffer) && + (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE)) + return true; + + /* Core rodata can not be freed */ + if (is_kernel_rodata(addr)) + return true; + + if (trace_is_tracepoint_string(str)) + return true; + + /* + * Now this could be a module event, referencing core module + * data, which is OK. + */ + if (!iter->ent) + return false; + + trace_event = ftrace_find_event(iter->ent->type); + if (!trace_event) + return false; + + event = container_of(trace_event, struct trace_event_call, event); + if (!event->mod) + return false; + + /* Would rather have rodata, but this will suffice */ + if (within_module_core(addr, event->mod)) + return true; + + return false; +} + +static const char *show_buffer(struct trace_seq *s) +{ + struct seq_buf *seq = &s->seq; + + seq_buf_terminate(seq); + + return seq->buffer; +} + +static DEFINE_STATIC_KEY_FALSE(trace_no_verify); + +static int test_can_verify_check(const char *fmt, ...) +{ + char buf[16]; + va_list ap; + int ret; + + /* + * The verifier is dependent on vsnprintf() modifies the va_list + * passed to it, where it is sent as a reference. Some architectures + * (like x86_32) passes it by value, which means that vsnprintf() + * does not modify the va_list passed to it, and the verifier + * would then need to be able to understand all the values that + * vsnprintf can use. If it is passed by value, then the verifier + * is disabled. + */ + va_start(ap, fmt); + vsnprintf(buf, 16, "%d", ap); + ret = va_arg(ap, int); + va_end(ap); + + return ret; +} + +static void test_can_verify(void) +{ + if (!test_can_verify_check("%d %d", 0, 1)) { + pr_info("trace event string verifier disabled\n"); + static_branch_inc(&trace_no_verify); + } +} + +/** + * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * @iter: The iterator that holds the seq buffer and the event being printed + * @fmt: The format used to print the event + * @ap: The va_list holding the data to print from @fmt. + * + * This writes the data into the @iter->seq buffer using the data from + * @fmt and @ap. If the format has a %s, then the source of the string + * is examined to make sure it is safe to print, otherwise it will + * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string + * pointer. + */ +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap) +{ + const char *p = fmt; + const char *str; + int i, j; + + if (WARN_ON_ONCE(!fmt)) + return; + + if (static_branch_unlikely(&trace_no_verify)) + goto print; + + /* Don't bother checking when doing a ftrace_dump() */ + if (iter->fmt == static_fmt_buf) + goto print; + + while (*p) { + j = 0; + + /* We only care about %s and variants */ + for (i = 0; p[i]; i++) { + if (i + 1 >= iter->fmt_size) { + /* + * If we can't expand the copy buffer, + * just print it. + */ + if (!trace_iter_expand_format(iter)) + goto print; + } + + if (p[i] == '\\' && p[i+1]) { + i++; + continue; + } + if (p[i] == '%') { + /* Need to test cases like %08.*s */ + for (j = 1; p[i+j]; j++) { + if (isdigit(p[i+j]) || + p[i+j] == '*' || + p[i+j] == '.') + continue; + break; + } + if (p[i+j] == 's') + break; + } + j = 0; + } + /* If no %s found then just print normally */ + if (!p[i]) + break; + + /* Copy up to the %s, and print that */ + strncpy(iter->fmt, p, i); + iter->fmt[i] = '\0'; + trace_seq_vprintf(&iter->seq, iter->fmt, ap); + + /* The ap now points to the string data of the %s */ + str = va_arg(ap, const char *); + + /* + * If you hit this warning, it is likely that the + * trace event in question used %s on a string that + * was saved at the time of the event, but may not be + * around when the trace is read. Use __string(), + * __assign_str() and __get_str() helpers in the TRACE_EVENT() + * instead. See samples/trace_events/trace-events-sample.h + * for reference. + */ + if (WARN_ONCE(!trace_safe_str(iter, str), + "fmt: '%s' current_buffer: '%s'", + fmt, show_buffer(&iter->seq))) { + int ret; + + /* Try to safely read the string */ + ret = strncpy_from_kernel_nofault(iter->fmt, str, + iter->fmt_size); + if (ret < 0) + trace_seq_printf(&iter->seq, "(0x%px)", str); + else + trace_seq_printf(&iter->seq, "(0x%px:%s)", + str, iter->fmt); + str = "[UNSAFE-MEMORY]"; + strcpy(iter->fmt, "%s"); + } else { + strncpy(iter->fmt, p + i, j + 1); + iter->fmt[j+1] = '\0'; + } + trace_seq_printf(&iter->seq, iter->fmt, str); + + p += i + j + 1; + } + print: + if (*p) + trace_seq_vprintf(&iter->seq, p, ap); +} + const char *trace_event_format(struct trace_iterator *iter, const char *fmt) { const char *p, *new_fmt; @@ -9675,6 +9873,8 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); + test_can_verify(); + return 0; out_free_savedcmd: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0d8f54f49a3a..2952bd92bc62 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -582,7 +582,10 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event); +bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); +void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, + va_list ap); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 61255bad7e01..a0146e1fffdf 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,7 +317,7 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) va_list ap; va_start(ap, fmt); - trace_seq_vprintf(&iter->seq, trace_event_format(iter, fmt), ap); + trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index ff32476df072..4b320fe7df70 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -251,6 +251,17 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) } EXPORT_SYMBOL_GPL(__ftrace_vprintk); +bool trace_is_tracepoint_string(const char *str) +{ + const char **ptr = __start___tracepoint_str; + + for (ptr = __start___tracepoint_str; ptr < __stop___tracepoint_str; ptr++) { + if (str == *ptr) + return true; + } + return false; +} + static const char **find_next(void *v, loff_t *pos) { const char **fmt = v; From 73f620951b2b594bdc38722c0d647c3b3312af7a Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 18 Mar 2021 17:14:22 +0100 Subject: [PATCH 0161/1379] swiotlb: move global variables into a new io_tlb_mem structure Added a new struct, io_tlb_mem, as the IO TLB memory pool descriptor and moved relevant global variables into that struct. This will be useful later to allow for restricted DMA pool. Signed-off-by: Claire Chang [hch: rebased] Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 2 +- include/linux/swiotlb.h | 43 ++++- kernel/dma/swiotlb.c | 354 ++++++++++++++++++-------------------- 3 files changed, 206 insertions(+), 193 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 4ecfce2c6f72..5329ad54a5f3 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -548,7 +548,7 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, static int xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return xen_phys_to_dma(hwdev, io_tlb_end - 1) <= mask; + return xen_phys_to_dma(hwdev, io_tlb_default_mem.end - 1) <= mask; } const struct dma_map_ops xen_swiotlb_dma_ops = { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 0696bdc8072e..5ec5378b17c3 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -6,6 +6,7 @@ #include #include #include +#include struct device; struct page; @@ -61,11 +62,49 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, #ifdef CONFIG_SWIOTLB extern enum swiotlb_force swiotlb_force; -extern phys_addr_t io_tlb_start, io_tlb_end; + +/** + * struct io_tlb_mem - IO TLB Memory Pool Descriptor + * + * @start: The start address of the swiotlb memory pool. Used to do a quick + * range check to see if the memory was in fact allocated by this + * API. + * @end: The end address of the swiotlb memory pool. Used to do a quick + * range check to see if the memory was in fact allocated by this + * API. + * @nslabs: The number of IO TLB blocks (in groups of 64) between @start and + * @end. This is command line adjustable via setup_io_tlb_npages. + * @used: The number of used IO TLB block. + * @list: The free list describing the number of free entries available + * from each index. + * @index: The index to start searching in the next round. + * @orig_addr: The original address corresponding to a mapped entry. + * @alloc_size: Size of the allocated buffer. + * @lock: The lock to protect the above data structures in the map and + * unmap calls. + * @debugfs: The dentry to debugfs. + * @late_alloc: %true if allocated using the page allocator + */ +struct io_tlb_mem { + phys_addr_t start; + phys_addr_t end; + unsigned long nslabs; + unsigned long used; + unsigned int *list; + unsigned int index; + phys_addr_t *orig_addr; + size_t *alloc_size; + spinlock_t lock; + struct dentry *debugfs; + bool late_alloc; +}; +extern struct io_tlb_mem io_tlb_default_mem; static inline bool is_swiotlb_buffer(phys_addr_t paddr) { - return paddr >= io_tlb_start && paddr < io_tlb_end; + struct io_tlb_mem *mem = &io_tlb_default_mem; + + return paddr >= mem->start && paddr < mem->end; } void __init swiotlb_exit(void); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 35e24f0ff8b2..d9c097f0f78c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -59,32 +59,11 @@ */ #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + enum swiotlb_force swiotlb_force; -/* - * Used to do a quick range check in swiotlb_tbl_unmap_single and - * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this - * API. - */ -phys_addr_t io_tlb_start, io_tlb_end; - -/* - * The number of IO TLB blocks (in groups of 64) between io_tlb_start and - * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. - */ -static unsigned long io_tlb_nslabs; - -/* - * The number of used IO TLB block - */ -static unsigned long io_tlb_used; - -/* - * This is a free list describing the number of free entries available from - * each index - */ -static unsigned int *io_tlb_list; -static unsigned int io_tlb_index; +struct io_tlb_mem io_tlb_default_mem; /* * Max segment that we can provide which (if pages are contingous) will @@ -92,32 +71,15 @@ static unsigned int io_tlb_index; */ static unsigned int max_segment; -/* - * We need to save away the original address corresponding to a mapped entry - * for the sync operations. - */ -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) -static phys_addr_t *io_tlb_orig_addr; - -/* - * The mapped buffer's size should be validated during a sync operation. - */ -static size_t *io_tlb_alloc_size; - -/* - * Protect the above data structures in the map and unmap calls - */ -static DEFINE_SPINLOCK(io_tlb_lock); - -static int late_alloc; - static int __init setup_io_tlb_npages(char *str) { + struct io_tlb_mem *mem = &io_tlb_default_mem; + if (isdigit(*str)) { - io_tlb_nslabs = simple_strtoul(str, &str, 0); + mem->nslabs = simple_strtoul(str, &str, 0); /* avoid tail segment of size < IO_TLB_SEGSIZE */ - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE); } if (*str == ',') ++str; @@ -125,7 +87,7 @@ setup_io_tlb_npages(char *str) swiotlb_force = SWIOTLB_FORCE; } else if (!strcmp(str, "noforce")) { swiotlb_force = SWIOTLB_NO_FORCE; - io_tlb_nslabs = 1; + mem->nslabs = 1; } return 0; @@ -136,7 +98,7 @@ static bool no_iotlb_memory; unsigned long swiotlb_nr_tbl(void) { - return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs; + return unlikely(no_iotlb_memory) ? 0 : io_tlb_default_mem.nslabs; } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); @@ -158,13 +120,14 @@ unsigned long swiotlb_size_or_default(void) { unsigned long size; - size = io_tlb_nslabs << IO_TLB_SHIFT; + size = io_tlb_default_mem.nslabs << IO_TLB_SHIFT; return size ? size : (IO_TLB_DEFAULT_SIZE); } void __init swiotlb_adjust_size(unsigned long new_size) { + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long size; /* @@ -172,10 +135,10 @@ void __init swiotlb_adjust_size(unsigned long new_size) * architectures such as those supporting memory encryption to * adjust/expand SWIOTLB size for their use. */ - if (!io_tlb_nslabs) { + if (!mem->nslabs) { size = ALIGN(new_size, IO_TLB_SIZE); - io_tlb_nslabs = size >> IO_TLB_SHIFT; - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + mem->nslabs = size >> IO_TLB_SHIFT; + mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE); pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } @@ -183,14 +146,15 @@ void __init swiotlb_adjust_size(unsigned long new_size) void swiotlb_print_info(void) { - unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; + struct io_tlb_mem *mem = &io_tlb_default_mem; + unsigned long bytes = mem->nslabs << IO_TLB_SHIFT; if (no_iotlb_memory) { pr_warn("No low mem\n"); return; } - pr_info("mapped [mem %pa-%pa] (%luMB)\n", &io_tlb_start, &io_tlb_end, + pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end, bytes >> 20); } @@ -212,68 +176,71 @@ static inline unsigned long nr_slots(u64 val) */ void __init swiotlb_update_mem_attributes(void) { + struct io_tlb_mem *mem = &io_tlb_default_mem; void *vaddr; unsigned long bytes; - if (no_iotlb_memory || late_alloc) + if (no_iotlb_memory || mem->late_alloc) return; - vaddr = phys_to_virt(io_tlb_start); - bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); + vaddr = phys_to_virt(mem->start); + bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); memset(vaddr, 0, bytes); } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long i, bytes; size_t alloc_size; /* protect against double initialization */ - if (WARN_ON_ONCE(io_tlb_start)) + if (WARN_ON_ONCE(mem->start)) return -ENOMEM; bytes = nslabs << IO_TLB_SHIFT; - io_tlb_nslabs = nslabs; - io_tlb_start = __pa(tlb); - io_tlb_end = io_tlb_start + bytes; + mem->nslabs = nslabs; + mem->start = __pa(tlb); + mem->end = mem->start + bytes; + mem->index = 0; + spin_lock_init(&mem->lock); /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. + * between mem->start and mem->end. */ - alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int)); - io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_list) + alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(int)); + mem->list = memblock_alloc(alloc_size, PAGE_SIZE); + if (!mem->list) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)); - io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_orig_addr) + alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(phys_addr_t)); + mem->orig_addr = memblock_alloc(alloc_size, PAGE_SIZE); + if (!mem->orig_addr) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); - io_tlb_alloc_size = memblock_alloc(alloc_size, PAGE_SIZE); - if (!io_tlb_alloc_size) + alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(size_t)); + mem->alloc_size = memblock_alloc(alloc_size, PAGE_SIZE); + if (mem->alloc_size) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_alloc_size[i] = 0; + for (i = 0; i < mem->nslabs; i++) { + mem->list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->orig_addr[i] = INVALID_PHYS_ADDR; + mem->alloc_size[i] = 0; } - io_tlb_index = 0; no_iotlb_memory = false; if (verbose) swiotlb_print_info(); - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; } @@ -284,26 +251,27 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) void __init swiotlb_init(int verbose) { + struct io_tlb_mem *mem = &io_tlb_default_mem; size_t default_size = IO_TLB_DEFAULT_SIZE; unsigned char *vstart; unsigned long bytes; - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + if (!mem->nslabs) { + mem->nslabs = (default_size >> IO_TLB_SHIFT); + mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE); } - bytes = io_tlb_nslabs << IO_TLB_SHIFT; + bytes = mem->nslabs << IO_TLB_SHIFT; /* Get IO TLB memory from the low pages */ vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); - if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) + if (vstart && !swiotlb_init_with_tbl(vstart, mem->nslabs, verbose)) return; - if (io_tlb_start) { - memblock_free_early(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - io_tlb_start = 0; + if (mem->start) { + memblock_free_early(mem->start, + PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT)); + mem->start = 0; } pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; @@ -317,22 +285,23 @@ swiotlb_init(int verbose) int swiotlb_late_init_with_default_size(size_t default_size) { - unsigned long bytes, req_nslabs = io_tlb_nslabs; + struct io_tlb_mem *mem = &io_tlb_default_mem; + unsigned long bytes, req_nslabs = mem->nslabs; unsigned char *vstart = NULL; unsigned int order; int rc = 0; - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + if (!mem->nslabs) { + mem->nslabs = (default_size >> IO_TLB_SHIFT); + mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE); } /* * Get IO TLB memory from the low pages */ - order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); - io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = io_tlb_nslabs << IO_TLB_SHIFT; + order = get_order(mem->nslabs << IO_TLB_SHIFT); + mem->nslabs = SLABS_PER_PAGE << order; + bytes = mem->nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, @@ -343,15 +312,15 @@ swiotlb_late_init_with_default_size(size_t default_size) } if (!vstart) { - io_tlb_nslabs = req_nslabs; + mem->nslabs = req_nslabs; return -ENOMEM; } if (order != get_order(bytes)) { pr_warn("only able to allocate %ld MB\n", (PAGE_SIZE << order) >> 20); - io_tlb_nslabs = SLABS_PER_PAGE << order; + mem->nslabs = SLABS_PER_PAGE << order; } - rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); + rc = swiotlb_late_init_with_tbl(vstart, mem->nslabs); if (rc) free_pages((unsigned long)vstart, order); @@ -360,26 +329,32 @@ swiotlb_late_init_with_default_size(size_t default_size) static void swiotlb_cleanup(void) { - io_tlb_end = 0; - io_tlb_start = 0; - io_tlb_nslabs = 0; + struct io_tlb_mem *mem = &io_tlb_default_mem; + + mem->end = 0; + mem->start = 0; + mem->nslabs = 0; max_segment = 0; } int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long i, bytes; /* protect against double initialization */ - if (WARN_ON_ONCE(io_tlb_start)) + if (WARN_ON_ONCE(mem->start)) return -ENOMEM; bytes = nslabs << IO_TLB_SHIFT; - io_tlb_nslabs = nslabs; - io_tlb_start = virt_to_phys(tlb); - io_tlb_end = io_tlb_start + bytes; + mem->nslabs = nslabs; + mem->start = virt_to_phys(tlb); + mem->end = mem->start + bytes; + mem->index = 0; + mem->late_alloc = 1; + spin_lock_init(&mem->lock); set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); memset(tlb, 0, bytes); @@ -387,52 +362,45 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) /* * Allocate and initialize the free list array. This array is used * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. + * between mem->start and mem->end. */ - io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); - if (!io_tlb_list) + mem->list = (unsigned int *)__get_free_pages(GFP_KERNEL, + get_order(mem->nslabs * sizeof(int))); + if (!mem->list) goto cleanup3; - io_tlb_orig_addr = (phys_addr_t *) + mem->orig_addr = (phys_addr_t *) __get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * + get_order(mem->nslabs * sizeof(phys_addr_t))); - if (!io_tlb_orig_addr) + if (!mem->orig_addr) goto cleanup4; - io_tlb_alloc_size = (size_t *) + mem->alloc_size = (size_t *) __get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * + get_order(mem->nslabs * sizeof(size_t))); - if (!io_tlb_alloc_size) + if (!mem->alloc_size) goto cleanup5; - - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_alloc_size[i] = 0; + for (i = 0; i < mem->nslabs; i++) { + mem->list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->orig_addr[i] = INVALID_PHYS_ADDR; + mem->alloc_size[i] = 0; } - io_tlb_index = 0; no_iotlb_memory = false; swiotlb_print_info(); - - late_alloc = 1; - - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); - + swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; cleanup5: - free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * - sizeof(phys_addr_t))); - + free_pages((unsigned long)mem->orig_addr, + get_order(mem->nslabs * sizeof(phys_addr_t))); cleanup4: - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - io_tlb_list = NULL; + free_pages((unsigned long)mem->list, + get_order(mem->nslabs * sizeof(int))); + mem->list = NULL; cleanup3: swiotlb_cleanup(); return -ENOMEM; @@ -440,27 +408,29 @@ cleanup3: void __init swiotlb_exit(void) { - if (!io_tlb_orig_addr) + struct io_tlb_mem *mem = &io_tlb_default_mem; + + if (!mem->orig_addr) return; - if (late_alloc) { - free_pages((unsigned long)io_tlb_alloc_size, - get_order(io_tlb_nslabs * sizeof(size_t))); - free_pages((unsigned long)io_tlb_orig_addr, - get_order(io_tlb_nslabs * sizeof(phys_addr_t))); - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - free_pages((unsigned long)phys_to_virt(io_tlb_start), - get_order(io_tlb_nslabs << IO_TLB_SHIFT)); + if (mem->late_alloc) { + free_pages((unsigned long)mem->alloc_size, + get_order(mem->nslabs * sizeof(size_t))); + free_pages((unsigned long)mem->orig_addr, + get_order(mem->nslabs * sizeof(phys_addr_t))); + free_pages((unsigned long)mem->list, + get_order(mem->nslabs * sizeof(int))); + free_pages((unsigned long)phys_to_virt(mem->start), + get_order(mem->nslabs << IO_TLB_SHIFT)); } else { - memblock_free_late(__pa(io_tlb_orig_addr), - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - memblock_free_late(__pa(io_tlb_alloc_size), - PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); - memblock_free_late(__pa(io_tlb_list), - PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - memblock_free_late(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_free_late(__pa(mem->alloc_size), + PAGE_ALIGN(mem->nslabs * sizeof(size_t))); + memblock_free_late(__pa(mem->orig_addr), + PAGE_ALIGN(mem->nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(mem->list), + PAGE_ALIGN(mem->nslabs * sizeof(int))); + memblock_free_late(mem->start, + PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT)); } swiotlb_cleanup(); } @@ -471,9 +441,10 @@ void __init swiotlb_exit(void) static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - size_t alloc_size = io_tlb_alloc_size[index]; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; + struct io_tlb_mem *mem = &io_tlb_default_mem; + int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = mem->orig_addr[index]; + size_t alloc_size = mem->alloc_size[index]; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); @@ -538,9 +509,9 @@ static inline unsigned long get_max_slots(unsigned long boundary_mask) return nr_slots(boundary_mask + 1); } -static unsigned int wrap_index(unsigned int index) +static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) { - if (index >= io_tlb_nslabs) + if (index >= mem->nslabs) return 0; return index; } @@ -552,9 +523,10 @@ static unsigned int wrap_index(unsigned int index) static int find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size) { + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = - phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; + phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; unsigned long max_slots = get_max_slots(boundary_mask); unsigned int iotlb_align_mask = dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); @@ -573,15 +545,15 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, if (alloc_size >= PAGE_SIZE) stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); - spin_lock_irqsave(&io_tlb_lock, flags); - if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) + spin_lock_irqsave(&mem->lock, flags); + if (unlikely(nslots > mem->nslabs - mem->used)) goto not_found; - index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); + index = wrap = wrap_index(mem, ALIGN(mem->index, stride)); do { if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != (orig_addr & iotlb_align_mask)) { - index = wrap_index(index + 1); + index = wrap_index(mem, index + 1); continue; } @@ -593,34 +565,34 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, if (!iommu_is_span_boundary(index, nslots, nr_slots(tbl_dma_addr), max_slots)) { - if (io_tlb_list[index] >= nslots) + if (mem->list[index] >= nslots) goto found; } - index = wrap_index(index + stride); + index = wrap_index(mem, index + stride); } while (index != wrap); not_found: - spin_unlock_irqrestore(&io_tlb_lock, flags); + spin_unlock_irqrestore(&mem->lock, flags); return -1; found: for (i = index; i < index + nslots; i++) - io_tlb_list[i] = 0; + mem->list[i] = 0; for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; + mem->list[i]; i--) + mem->list[i] = ++count; /* * Update the indices to avoid searching in the next round. */ - if (index + nslots < io_tlb_nslabs) - io_tlb_index = index + nslots; + if (index + nslots < mem->nslabs) + mem->index = index + nslots; else - io_tlb_index = 0; - io_tlb_used += nslots; + mem->index = 0; + mem->used += nslots; - spin_unlock_irqrestore(&io_tlb_lock, flags); + spin_unlock_irqrestore(&mem->lock, flags); return index; } @@ -628,6 +600,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, i; phys_addr_t tlb_addr; @@ -649,7 +622,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, io_tlb_nslabs, io_tlb_used); + alloc_size, mem->nslabs, mem->used); return (phys_addr_t)DMA_MAPPING_ERROR; } @@ -659,10 +632,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * needed. */ for (i = 0; i < nr_slots(alloc_size + offset); i++) { - io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); - io_tlb_alloc_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + mem->orig_addr[index + i] = slot_addr(orig_addr, i); + mem->alloc_size[index + i] = alloc_size - (i << IO_TLB_SHIFT); } - tlb_addr = slot_addr(io_tlb_start, index) + offset; + tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); @@ -676,10 +649,11 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, size_t mapping_size, enum dma_data_direction dir, unsigned long attrs) { + struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); - int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; - int nslots = nr_slots(io_tlb_alloc_size[index] + offset); + int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; + int nslots = nr_slots(mem->alloc_size[index] + offset); int count, i; /* @@ -695,9 +669,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * While returning the entries to the free list, we merge the entries * with slots below and above the pool being returned. */ - spin_lock_irqsave(&io_tlb_lock, flags); + spin_lock_irqsave(&mem->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) - count = io_tlb_list[index + nslots]; + count = mem->list[index + nslots]; else count = 0; @@ -706,9 +680,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * superceeding slots */ for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - io_tlb_alloc_size[i] = 0; + mem->list[i] = ++count; + mem->orig_addr[i] = INVALID_PHYS_ADDR; + mem->alloc_size[i] = 0; } /* @@ -716,11 +690,11 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * available (non zero) */ for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i]; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->list[i]; i--) - io_tlb_list[i] = ++count; - io_tlb_used -= nslots; - spin_unlock_irqrestore(&io_tlb_lock, flags); + mem->list[i] = ++count; + mem->used -= nslots; + spin_unlock_irqrestore(&mem->lock, flags); } void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr, @@ -783,21 +757,21 @@ size_t swiotlb_max_mapping_size(struct device *dev) bool is_swiotlb_active(void) { /* - * When SWIOTLB is initialized, even if io_tlb_start points to physical - * address zero, io_tlb_end surely doesn't. + * When SWIOTLB is initialized, even if mem->start points to physical + * address zero, mem->end surely doesn't. */ - return io_tlb_end != 0; + return io_tlb_default_mem.end != 0; } #ifdef CONFIG_DEBUG_FS static int __init swiotlb_create_debugfs(void) { - struct dentry *root; + struct io_tlb_mem *mem = &io_tlb_default_mem; - root = debugfs_create_dir("swiotlb", NULL); - debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs); - debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used); + mem->debugfs = debugfs_create_dir("swiotlb", NULL); + debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); + debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); return 0; } From 2d29960af0bee8cc6731b9bd3964850c9e7a6840 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Mar 2021 17:14:23 +0100 Subject: [PATCH 0162/1379] swiotlb: dynamically allocate io_tlb_default_mem Instead of allocating ->list and ->orig_addr separately just do one dynamic allocation for the actual io_tlb_mem structure. This simplifies a lot of the initialization code, and also allows to just check io_tlb_default_mem to see if swiotlb is in use. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/swiotlb-xen.c | 22 +-- include/linux/swiotlb.h | 18 ++- kernel/dma/swiotlb.c | 304 ++++++++++++-------------------------- 3 files changed, 116 insertions(+), 228 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 5329ad54a5f3..4c89afc0df62 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -158,17 +158,14 @@ static const char *xen_swiotlb_error(enum xen_swiotlb_err err) int __ref xen_swiotlb_init(void) { enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; - unsigned long nslabs, bytes, order; - unsigned int repeat = 3; + unsigned long bytes = swiotlb_size_or_default(); + unsigned long nslabs = bytes >> IO_TLB_SHIFT; + unsigned int order, repeat = 3; int rc = -ENOMEM; char *start; - nslabs = swiotlb_nr_tbl(); - if (!nslabs) - nslabs = DEFAULT_NSLABS; retry: m_ret = XEN_SWIOTLB_ENOMEM; - bytes = nslabs << IO_TLB_SHIFT; order = get_order(bytes); /* @@ -221,19 +218,16 @@ error: #ifdef CONFIG_X86 void __init xen_swiotlb_init_early(void) { - unsigned long nslabs, bytes; + unsigned long bytes = swiotlb_size_or_default(); + unsigned long nslabs = bytes >> IO_TLB_SHIFT; unsigned int repeat = 3; char *start; int rc; - nslabs = swiotlb_nr_tbl(); - if (!nslabs) - nslabs = DEFAULT_NSLABS; retry: /* * Get IO TLB memory from any location. */ - bytes = nslabs << IO_TLB_SHIFT; start = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE); if (!start) panic("%s: Failed to allocate %lu bytes align=0x%lx\n", @@ -248,8 +242,8 @@ retry: if (repeat--) { /* Min is 2MB */ nslabs = max(1024UL, (nslabs >> 1)); - pr_info("Lowering to %luMB\n", - (nslabs << IO_TLB_SHIFT) >> 20); + bytes = nslabs << IO_TLB_SHIFT; + pr_info("Lowering to %luMB\n", bytes >> 20); goto retry; } panic("%s (rc:%d)", xen_swiotlb_error(XEN_SWIOTLB_EFIXUP), rc); @@ -548,7 +542,7 @@ xen_swiotlb_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, static int xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) { - return xen_phys_to_dma(hwdev, io_tlb_default_mem.end - 1) <= mask; + return xen_phys_to_dma(hwdev, io_tlb_default_mem->end - 1) <= mask; } const struct dma_map_ops xen_swiotlb_dma_ops = { diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 5ec5378b17c3..63f7a63f61d0 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -90,28 +90,30 @@ struct io_tlb_mem { phys_addr_t end; unsigned long nslabs; unsigned long used; - unsigned int *list; unsigned int index; - phys_addr_t *orig_addr; - size_t *alloc_size; spinlock_t lock; struct dentry *debugfs; bool late_alloc; + struct io_tlb_slot { + phys_addr_t orig_addr; + size_t alloc_size; + unsigned int list; + } slots[]; }; -extern struct io_tlb_mem io_tlb_default_mem; +extern struct io_tlb_mem *io_tlb_default_mem; static inline bool is_swiotlb_buffer(phys_addr_t paddr) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_mem *mem = io_tlb_default_mem; - return paddr >= mem->start && paddr < mem->end; + return mem && paddr >= mem->start && paddr < mem->end; } void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); bool is_swiotlb_active(void); -void __init swiotlb_adjust_size(unsigned long new_size); +void __init swiotlb_adjust_size(unsigned long size); #else #define swiotlb_force SWIOTLB_NO_FORCE static inline bool is_swiotlb_buffer(phys_addr_t paddr) @@ -135,7 +137,7 @@ static inline bool is_swiotlb_active(void) return false; } -static inline void swiotlb_adjust_size(unsigned long new_size) +static inline void swiotlb_adjust_size(unsigned long size) { } #endif /* CONFIG_SWIOTLB */ diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d9c097f0f78c..13de669a9b46 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -63,7 +63,7 @@ enum swiotlb_force swiotlb_force; -struct io_tlb_mem io_tlb_default_mem; +struct io_tlb_mem *io_tlb_default_mem; /* * Max segment that we can provide which (if pages are contingous) will @@ -71,15 +71,15 @@ struct io_tlb_mem io_tlb_default_mem; */ static unsigned int max_segment; +static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT; + static int __init setup_io_tlb_npages(char *str) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - if (isdigit(*str)) { - mem->nslabs = simple_strtoul(str, &str, 0); /* avoid tail segment of size < IO_TLB_SEGSIZE */ - mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE); + default_nslabs = + ALIGN(simple_strtoul(str, &str, 0), IO_TLB_SEGSIZE); } if (*str == ',') ++str; @@ -87,24 +87,22 @@ setup_io_tlb_npages(char *str) swiotlb_force = SWIOTLB_FORCE; } else if (!strcmp(str, "noforce")) { swiotlb_force = SWIOTLB_NO_FORCE; - mem->nslabs = 1; + default_nslabs = 1; } return 0; } early_param("swiotlb", setup_io_tlb_npages); -static bool no_iotlb_memory; - unsigned long swiotlb_nr_tbl(void) { - return unlikely(no_iotlb_memory) ? 0 : io_tlb_default_mem.nslabs; + return io_tlb_default_mem ? io_tlb_default_mem->nslabs : 0; } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); unsigned int swiotlb_max_segment(void) { - return unlikely(no_iotlb_memory) ? 0 : max_segment; + return io_tlb_default_mem ? max_segment : 0; } EXPORT_SYMBOL_GPL(swiotlb_max_segment); @@ -118,44 +116,32 @@ void swiotlb_set_max_segment(unsigned int val) unsigned long swiotlb_size_or_default(void) { - unsigned long size; - - size = io_tlb_default_mem.nslabs << IO_TLB_SHIFT; - - return size ? size : (IO_TLB_DEFAULT_SIZE); + return default_nslabs << IO_TLB_SHIFT; } -void __init swiotlb_adjust_size(unsigned long new_size) +void __init swiotlb_adjust_size(unsigned long size) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long size; - /* * If swiotlb parameter has not been specified, give a chance to * architectures such as those supporting memory encryption to * adjust/expand SWIOTLB size for their use. */ - if (!mem->nslabs) { - size = ALIGN(new_size, IO_TLB_SIZE); - mem->nslabs = size >> IO_TLB_SHIFT; - mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE); - - pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); - } + size = ALIGN(size, IO_TLB_SIZE); + default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); } void swiotlb_print_info(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long bytes = mem->nslabs << IO_TLB_SHIFT; + struct io_tlb_mem *mem = io_tlb_default_mem; - if (no_iotlb_memory) { + if (!mem) { pr_warn("No low mem\n"); return; } pr_info("mapped [mem %pa-%pa] (%luMB)\n", &mem->start, &mem->end, - bytes >> 20); + (mem->nslabs << IO_TLB_SHIFT) >> 20); } static inline unsigned long io_tlb_offset(unsigned long val) @@ -176,13 +162,12 @@ static inline unsigned long nr_slots(u64 val) */ void __init swiotlb_update_mem_attributes(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_mem *mem = io_tlb_default_mem; void *vaddr; unsigned long bytes; - if (no_iotlb_memory || mem->late_alloc) + if (!mem || mem->late_alloc) return; - vaddr = phys_to_virt(mem->start); bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); @@ -191,55 +176,33 @@ void __init swiotlb_update_mem_attributes(void) int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long i, bytes; + unsigned long bytes = nslabs << IO_TLB_SHIFT, i; + struct io_tlb_mem *mem; size_t alloc_size; /* protect against double initialization */ - if (WARN_ON_ONCE(mem->start)) + if (WARN_ON_ONCE(io_tlb_default_mem)) return -ENOMEM; - bytes = nslabs << IO_TLB_SHIFT; - + alloc_size = PAGE_ALIGN(struct_size(mem, slots, nslabs)); + mem = memblock_alloc(alloc_size, PAGE_SIZE); + if (!mem) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); mem->nslabs = nslabs; mem->start = __pa(tlb); mem->end = mem->start + bytes; mem->index = 0; spin_lock_init(&mem->lock); - - /* - * Allocate and initialize the free list array. This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between mem->start and mem->end. - */ - alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(int)); - mem->list = memblock_alloc(alloc_size, PAGE_SIZE); - if (!mem->list) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); - - alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(phys_addr_t)); - mem->orig_addr = memblock_alloc(alloc_size, PAGE_SIZE); - if (!mem->orig_addr) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); - - alloc_size = PAGE_ALIGN(mem->nslabs * sizeof(size_t)); - mem->alloc_size = memblock_alloc(alloc_size, PAGE_SIZE); - if (mem->alloc_size) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); - for (i = 0; i < mem->nslabs; i++) { - mem->list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); - mem->orig_addr[i] = INVALID_PHYS_ADDR; - mem->alloc_size[i] = 0; + mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; } - no_iotlb_memory = false; + io_tlb_default_mem = mem; if (verbose) swiotlb_print_info(); - swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; } @@ -251,30 +214,21 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) void __init swiotlb_init(int verbose) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - size_t default_size = IO_TLB_DEFAULT_SIZE; - unsigned char *vstart; - unsigned long bytes; - - if (!mem->nslabs) { - mem->nslabs = (default_size >> IO_TLB_SHIFT); - mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE); - } - - bytes = mem->nslabs << IO_TLB_SHIFT; + size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); + void *tlb; /* Get IO TLB memory from the low pages */ - vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); - if (vstart && !swiotlb_init_with_tbl(vstart, mem->nslabs, verbose)) - return; + tlb = memblock_alloc_low(bytes, PAGE_SIZE); + if (!tlb) + goto fail; + if (swiotlb_init_with_tbl(tlb, default_nslabs, verbose)) + goto fail_free_mem; + return; - if (mem->start) { - memblock_free_early(mem->start, - PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT)); - mem->start = 0; - } +fail_free_mem: + memblock_free_early(__pa(tlb), bytes); +fail: pr_warn("Cannot allocate buffer"); - no_iotlb_memory = true; } /* @@ -285,23 +239,19 @@ swiotlb_init(int verbose) int swiotlb_late_init_with_default_size(size_t default_size) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long bytes, req_nslabs = mem->nslabs; + unsigned long nslabs = + ALIGN(default_size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); + unsigned long bytes; unsigned char *vstart = NULL; unsigned int order; int rc = 0; - if (!mem->nslabs) { - mem->nslabs = (default_size >> IO_TLB_SHIFT); - mem->nslabs = ALIGN(mem->nslabs, IO_TLB_SEGSIZE); - } - /* * Get IO TLB memory from the low pages */ - order = get_order(mem->nslabs << IO_TLB_SHIFT); - mem->nslabs = SLABS_PER_PAGE << order; - bytes = mem->nslabs << IO_TLB_SHIFT; + order = get_order(nslabs << IO_TLB_SHIFT); + nslabs = SLABS_PER_PAGE << order; + bytes = nslabs << IO_TLB_SHIFT; while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, @@ -311,43 +261,35 @@ swiotlb_late_init_with_default_size(size_t default_size) order--; } - if (!vstart) { - mem->nslabs = req_nslabs; + if (!vstart) return -ENOMEM; - } + if (order != get_order(bytes)) { pr_warn("only able to allocate %ld MB\n", (PAGE_SIZE << order) >> 20); - mem->nslabs = SLABS_PER_PAGE << order; + nslabs = SLABS_PER_PAGE << order; } - rc = swiotlb_late_init_with_tbl(vstart, mem->nslabs); + rc = swiotlb_late_init_with_tbl(vstart, nslabs); if (rc) free_pages((unsigned long)vstart, order); return rc; } -static void swiotlb_cleanup(void) -{ - struct io_tlb_mem *mem = &io_tlb_default_mem; - - mem->end = 0; - mem->start = 0; - mem->nslabs = 0; - max_segment = 0; -} - int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - unsigned long i, bytes; + unsigned long bytes = nslabs << IO_TLB_SHIFT, i; + struct io_tlb_mem *mem; /* protect against double initialization */ - if (WARN_ON_ONCE(mem->start)) + if (WARN_ON_ONCE(io_tlb_default_mem)) return -ENOMEM; - bytes = nslabs << IO_TLB_SHIFT; + mem = (void *)__get_free_pages(GFP_KERNEL, + get_order(struct_size(mem, slots, nslabs))); + if (!mem) + return -ENOMEM; mem->nslabs = nslabs; mem->start = virt_to_phys(tlb); @@ -355,84 +297,35 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) mem->index = 0; mem->late_alloc = 1; spin_lock_init(&mem->lock); + for (i = 0; i < mem->nslabs; i++) { + mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i); + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; + } set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); memset(tlb, 0, bytes); - /* - * Allocate and initialize the free list array. This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between mem->start and mem->end. - */ - mem->list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(mem->nslabs * sizeof(int))); - if (!mem->list) - goto cleanup3; - - mem->orig_addr = (phys_addr_t *) - __get_free_pages(GFP_KERNEL, - get_order(mem->nslabs * - sizeof(phys_addr_t))); - if (!mem->orig_addr) - goto cleanup4; - - mem->alloc_size = (size_t *) - __get_free_pages(GFP_KERNEL, - get_order(mem->nslabs * - sizeof(size_t))); - if (!mem->alloc_size) - goto cleanup5; - - for (i = 0; i < mem->nslabs; i++) { - mem->list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); - mem->orig_addr[i] = INVALID_PHYS_ADDR; - mem->alloc_size[i] = 0; - } - no_iotlb_memory = false; - + io_tlb_default_mem = mem; swiotlb_print_info(); swiotlb_set_max_segment(mem->nslabs << IO_TLB_SHIFT); return 0; - -cleanup5: - free_pages((unsigned long)mem->orig_addr, - get_order(mem->nslabs * sizeof(phys_addr_t))); -cleanup4: - free_pages((unsigned long)mem->list, - get_order(mem->nslabs * sizeof(int))); - mem->list = NULL; -cleanup3: - swiotlb_cleanup(); - return -ENOMEM; } void __init swiotlb_exit(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_mem *mem = io_tlb_default_mem; + size_t size; - if (!mem->orig_addr) + if (!mem) return; - if (mem->late_alloc) { - free_pages((unsigned long)mem->alloc_size, - get_order(mem->nslabs * sizeof(size_t))); - free_pages((unsigned long)mem->orig_addr, - get_order(mem->nslabs * sizeof(phys_addr_t))); - free_pages((unsigned long)mem->list, - get_order(mem->nslabs * sizeof(int))); - free_pages((unsigned long)phys_to_virt(mem->start), - get_order(mem->nslabs << IO_TLB_SHIFT)); - } else { - memblock_free_late(__pa(mem->alloc_size), - PAGE_ALIGN(mem->nslabs * sizeof(size_t))); - memblock_free_late(__pa(mem->orig_addr), - PAGE_ALIGN(mem->nslabs * sizeof(phys_addr_t))); - memblock_free_late(__pa(mem->list), - PAGE_ALIGN(mem->nslabs * sizeof(int))); - memblock_free_late(mem->start, - PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT)); - } - swiotlb_cleanup(); + size = struct_size(mem, slots, mem->nslabs); + if (mem->late_alloc) + free_pages((unsigned long)mem, get_order(size)); + else + memblock_free_late(__pa(mem), PAGE_ALIGN(size)); + io_tlb_default_mem = NULL; } /* @@ -441,10 +334,10 @@ void __init swiotlb_exit(void) static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, enum dma_data_direction dir) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_mem *mem = io_tlb_default_mem; int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = mem->orig_addr[index]; - size_t alloc_size = mem->alloc_size[index]; + phys_addr_t orig_addr = mem->slots[index].orig_addr; + size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); @@ -523,7 +416,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index) static int find_slots(struct device *dev, phys_addr_t orig_addr, size_t alloc_size) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_mem *mem = io_tlb_default_mem; unsigned long boundary_mask = dma_get_seg_boundary(dev); dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(dev, mem->start) & boundary_mask; @@ -565,7 +458,7 @@ static int find_slots(struct device *dev, phys_addr_t orig_addr, if (!iommu_is_span_boundary(index, nslots, nr_slots(tbl_dma_addr), max_slots)) { - if (mem->list[index] >= nslots) + if (mem->slots[index].list >= nslots) goto found; } index = wrap_index(mem, index + stride); @@ -577,11 +470,11 @@ not_found: found: for (i = index; i < index + nslots; i++) - mem->list[i] = 0; + mem->slots[i].list = 0; for (i = index - 1; io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && - mem->list[i]; i--) - mem->list[i] = ++count; + mem->slots[i].list; i--) + mem->slots[i].list = ++count; /* * Update the indices to avoid searching in the next round. @@ -600,12 +493,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_mem *mem = io_tlb_default_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); unsigned int index, i; phys_addr_t tlb_addr; - if (no_iotlb_memory) + if (!mem) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); if (mem_encrypt_active()) @@ -632,8 +525,9 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, * needed. */ for (i = 0; i < nr_slots(alloc_size + offset); i++) { - mem->orig_addr[index + i] = slot_addr(orig_addr, i); - mem->alloc_size[index + i] = alloc_size - (i << IO_TLB_SHIFT); + mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); + mem->slots[index + i].alloc_size = + alloc_size - (i << IO_TLB_SHIFT); } tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && @@ -649,11 +543,11 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, size_t mapping_size, enum dma_data_direction dir, unsigned long attrs) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_mem *mem = io_tlb_default_mem; unsigned long flags; unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT; - int nslots = nr_slots(mem->alloc_size[index] + offset); + int nslots = nr_slots(mem->slots[index].alloc_size + offset); int count, i; /* @@ -671,7 +565,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, */ spin_lock_irqsave(&mem->lock, flags); if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) - count = mem->list[index + nslots]; + count = mem->slots[index + nslots].list; else count = 0; @@ -680,9 +574,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * superceeding slots */ for (i = index + nslots - 1; i >= index; i--) { - mem->list[i] = ++count; - mem->orig_addr[i] = INVALID_PHYS_ADDR; - mem->alloc_size[i] = 0; + mem->slots[i].list = ++count; + mem->slots[i].orig_addr = INVALID_PHYS_ADDR; + mem->slots[i].alloc_size = 0; } /* @@ -690,9 +584,9 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * available (non zero) */ for (i = index - 1; - io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->list[i]; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && mem->slots[i].list; i--) - mem->list[i] = ++count; + mem->slots[i].list = ++count; mem->used -= nslots; spin_unlock_irqrestore(&mem->lock, flags); } @@ -756,19 +650,17 @@ size_t swiotlb_max_mapping_size(struct device *dev) bool is_swiotlb_active(void) { - /* - * When SWIOTLB is initialized, even if mem->start points to physical - * address zero, mem->end surely doesn't. - */ - return io_tlb_default_mem.end != 0; + return io_tlb_default_mem != NULL; } #ifdef CONFIG_DEBUG_FS static int __init swiotlb_create_debugfs(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; + struct io_tlb_mem *mem = io_tlb_default_mem; + if (!mem) + return 0; mem->debugfs = debugfs_create_dir("swiotlb", NULL); debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); From 2cbc2776efe4faed0e17c48ae076aa03a0fcc61f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 Mar 2021 17:14:24 +0100 Subject: [PATCH 0163/1379] swiotlb: remove swiotlb_nr_tbl All callers just use it to check if swiotlb is active at all, for which they can just use is_swiotlb_active. In the longer run drivers need to stop using is_swiotlb_active as well, but let's do the simple step first. Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +- drivers/gpu/drm/nouveau/nouveau_ttm.c | 2 +- drivers/pci/xen-pcifront.c | 2 +- include/linux/swiotlb.h | 1 - kernel/dma/swiotlb.c | 7 +------ 5 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c index ad22f42541bd..a9d65fc8aa0e 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c @@ -42,7 +42,7 @@ static int i915_gem_object_get_pages_internal(struct drm_i915_gem_object *obj) max_order = MAX_ORDER; #ifdef CONFIG_SWIOTLB - if (swiotlb_nr_tbl()) { + if (is_swiotlb_active()) { unsigned int max_segment; max_segment = swiotlb_max_segment(); diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c index a37bc3d7b38b..9662522aa066 100644 --- a/drivers/gpu/drm/nouveau/nouveau_ttm.c +++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c @@ -321,7 +321,7 @@ nouveau_ttm_init(struct nouveau_drm *drm) } #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86) - need_swiotlb = !!swiotlb_nr_tbl(); + need_swiotlb = is_swiotlb_active(); #endif ret = ttm_bo_device_init(&drm->ttm.bdev, &nouveau_bo_driver, diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index 2d7502648219..b7a8f3a1921f 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -693,7 +693,7 @@ static int pcifront_connect_and_init_dma(struct pcifront_device *pdev) spin_unlock(&pcifront_dev_lock); - if (!err && !swiotlb_nr_tbl()) { + if (!err && !is_swiotlb_active()) { err = pci_xen_swiotlb_init_late(); if (err) dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n"); diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 63f7a63f61d0..216854a5e513 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -37,7 +37,6 @@ enum swiotlb_force { extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); -extern unsigned long swiotlb_nr_tbl(void); unsigned long swiotlb_size_or_default(void); extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); extern int swiotlb_late_init_with_default_size(size_t default_size); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 13de669a9b46..539c76beb52e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -94,12 +94,6 @@ setup_io_tlb_npages(char *str) } early_param("swiotlb", setup_io_tlb_npages); -unsigned long swiotlb_nr_tbl(void) -{ - return io_tlb_default_mem ? io_tlb_default_mem->nslabs : 0; -} -EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); - unsigned int swiotlb_max_segment(void) { return io_tlb_default_mem ? max_segment : 0; @@ -652,6 +646,7 @@ bool is_swiotlb_active(void) { return io_tlb_default_mem != NULL; } +EXPORT_SYMBOL_GPL(is_swiotlb_active); #ifdef CONFIG_DEBUG_FS From fcf044891c84e38fc90eb736b818781bccf94e38 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 18 Mar 2021 21:03:33 -0700 Subject: [PATCH 0164/1379] ARM: Qualify enabling of swiotlb_init() We do not need a SWIOTLB unless we have DRAM that is addressable beyond the arm_dma_limit. Compare max_pfn with arm_dma_pfn_limit to determine whether we do need a SWIOTLB to be initialized. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Signed-off-by: Florian Fainelli Signed-off-by: Konrad Rzeszutek Wilk --- arch/arm/mm/init.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 828a2561b229..8356bf1daa28 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -301,7 +301,11 @@ static void __init free_highpages(void) void __init mem_init(void) { #ifdef CONFIG_ARM_LPAE - swiotlb_init(1); + if (swiotlb_force == SWIOTLB_FORCE || + max_pfn > arm_dma_pfn_limit) + swiotlb_init(1); + else + swiotlb_force = SWIOTLB_NO_FORCE; #endif set_max_mapnr(pfn_to_page(max_pfn) - mem_map); From 20e1dbf2bbe2431072571000ed31dfef09359c08 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Sat, 13 Mar 2021 00:55:20 +0100 Subject: [PATCH 0165/1379] media: uvcvideo: Use dma_alloc_noncontiguous API On architectures where there is no coherent caching such as ARM use the dma_alloc_noncontiguous API and handle manually the cache flushing using dma_sync_sgtable(). If the architechture has coherent cache, the API falls back to alloc_dma_pages, so we can remove the coherent caching code-path from the driver, making it simpler. With this patch on the affected architectures we can measure up to 20x performance improvement in uvc_video_copy_data_work(). Eg: aarch64 with an external usb camera NON_CONTIGUOUS frames: 999 packets: 999 empty: 0 (0 %) errors: 0 invalid: 0 pts: 0 early, 0 initial, 999 ok scr: 0 count ok, 0 diff ok sof: 2048 <= sof <= 0, freq 0.000 kHz bytes 67034480 : duration 33303 FPS: 29.99 URB: 523446/4993 uS/qty: 104.836 avg 132.532 std 13.230 min 831.094 max (uS) header: 76564/4993 uS/qty: 15.334 avg 15.229 std 3.438 min 186.875 max (uS) latency: 468945/4992 uS/qty: 93.939 avg 132.577 std 9.531 min 824.010 max (uS) decode: 54161/4993 uS/qty: 10.847 avg 6.313 std 1.614 min 111.458 max (uS) raw decode speed: 9.931 Gbits/s raw URB handling speed: 1.025 Gbits/s throughput: 16.102 Mbits/s URB decode CPU usage 0.162600 % COHERENT frames: 999 packets: 999 empty: 0 (0 %) errors: 0 invalid: 0 pts: 0 early, 0 initial, 999 ok scr: 0 count ok, 0 diff ok sof: 2048 <= sof <= 0, freq 0.000 kHz bytes 54683536 : duration 33302 FPS: 29.99 URB: 1478135/4000 uS/qty: 369.533 avg 390.357 std 22.968 min 3337.865 max (uS) header: 79761/4000 uS/qty: 19.940 avg 18.495 std 1.875 min 336.719 max (uS) latency: 281077/4000 uS/qty: 70.269 avg 83.102 std 5.104 min 735.000 max (uS) decode: 1197057/4000 uS/qty: 299.264 avg 318.080 std 1.615 min 2806.667 max (uS) raw decode speed: 365.470 Mbits/s raw URB handling speed: 295.986 Mbits/s throughput: 13.136 Mbits/s URB decode CPU usage 3.594500 % In non-affected architectures we see no significant impact. Eg: x86 with an external usb camera NON_CONTIGUOUS frames: 999 packets: 999 empty: 0 (0 %) errors: 0 invalid: 0 pts: 0 early, 0 initial, 999 ok scr: 0 count ok, 0 diff ok sof: 2048 <= sof <= 0, freq 0.000 kHz bytes 70179056 : duration 33301 FPS: 29.99 URB: 288901/4897 uS/qty: 58.995 avg 26.022 std 4.319 min 253.853 max (uS) header: 54792/4897 uS/qty: 11.189 avg 6.218 std 0.620 min 61.750 max (uS) latency: 236602/4897 uS/qty: 48.315 avg 24.244 std 1.764 min 240.924 max (uS) decode: 52298/4897 uS/qty: 10.679 avg 8.299 std 1.638 min 108.861 max (uS) raw decode speed: 10.796 Gbits/s raw URB handling speed: 1.949 Gbits/s throughput: 16.859 Mbits/s URB decode CPU usage 0.157000 % COHERENT frames: 999 packets: 999 empty: 0 (0 %) errors: 0 invalid: 0 pts: 0 early, 0 initial, 999 ok scr: 0 count ok, 0 diff ok sof: 2048 <= sof <= 0, freq 0.000 kHz bytes 71818320 : duration 33301 FPS: 29.99 URB: 321021/5000 uS/qty: 64.204 avg 23.001 std 10.430 min 268.837 max (uS) header: 54308/5000 uS/qty: 10.861 avg 5.104 std 0.778 min 54.736 max (uS) latency: 268799/5000 uS/qty: 53.759 avg 21.827 std 6.095 min 255.153 max (uS) decode: 52222/5000 uS/qty: 10.444 avg 7.137 std 1.874 min 71.103 max (uS) raw decode speed: 11.048 Gbits/s raw URB handling speed: 1.789 Gbits/s throughput: 17.253 Mbits/s URB decode CPU usage 0.156800 % Signed-off-by: Ricardo Ribalda Reviewed-by: Laurent Pinchart Reviewed-by: Tomasz Figa Signed-off-by: Christoph Hellwig --- drivers/media/usb/uvc/uvc_video.c | 94 +++++++++++++++++++++++-------- drivers/media/usb/uvc/uvcvideo.h | 5 +- 2 files changed, 73 insertions(+), 26 deletions(-) diff --git a/drivers/media/usb/uvc/uvc_video.c b/drivers/media/usb/uvc/uvc_video.c index f2f565281e63..a777b389a66e 100644 --- a/drivers/media/usb/uvc/uvc_video.c +++ b/drivers/media/usb/uvc/uvc_video.c @@ -6,11 +6,14 @@ * Laurent Pinchart (laurent.pinchart@ideasonboard.com) */ +#include +#include #include #include #include #include #include +#include #include #include #include @@ -1096,6 +1099,29 @@ static int uvc_video_decode_start(struct uvc_streaming *stream, return data[0]; } +static inline enum dma_data_direction uvc_stream_dir( + struct uvc_streaming *stream) +{ + if (stream->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) + return DMA_FROM_DEVICE; + else + return DMA_TO_DEVICE; +} + +static inline struct device *uvc_stream_to_dmadev(struct uvc_streaming *stream) +{ + return bus_to_hcd(stream->dev->udev->bus)->self.sysdev; +} + +static int uvc_submit_urb(struct uvc_urb *uvc_urb, gfp_t mem_flags) +{ + /* Sync DMA. */ + dma_sync_sgtable_for_device(uvc_stream_to_dmadev(uvc_urb->stream), + uvc_urb->sgt, + uvc_stream_dir(uvc_urb->stream)); + return usb_submit_urb(uvc_urb->urb, mem_flags); +} + /* * uvc_video_decode_data_work: Asynchronous memcpy processing * @@ -1117,7 +1143,7 @@ static void uvc_video_copy_data_work(struct work_struct *work) uvc_queue_buffer_release(op->buf); } - ret = usb_submit_urb(uvc_urb->urb, GFP_KERNEL); + ret = uvc_submit_urb(uvc_urb, GFP_KERNEL); if (ret < 0) dev_err(&uvc_urb->stream->intf->dev, "Failed to resubmit video URB (%d).\n", ret); @@ -1537,6 +1563,12 @@ static void uvc_video_complete(struct urb *urb) /* Re-initialise the URB async work. */ uvc_urb->async_operations = 0; + /* Sync DMA and invalidate vmap range. */ + dma_sync_sgtable_for_cpu(uvc_stream_to_dmadev(uvc_urb->stream), + uvc_urb->sgt, uvc_stream_dir(stream)); + invalidate_kernel_vmap_range(uvc_urb->buffer, + uvc_urb->stream->urb_size); + /* * Process the URB headers, and optionally queue expensive memcpy tasks * to be deferred to a work queue. @@ -1545,7 +1577,7 @@ static void uvc_video_complete(struct urb *urb) /* If no async work is needed, resubmit the URB immediately. */ if (!uvc_urb->async_operations) { - ret = usb_submit_urb(uvc_urb->urb, GFP_ATOMIC); + ret = uvc_submit_urb(uvc_urb, GFP_ATOMIC); if (ret < 0) dev_err(&stream->intf->dev, "Failed to resubmit video URB (%d).\n", ret); @@ -1560,24 +1592,49 @@ static void uvc_video_complete(struct urb *urb) */ static void uvc_free_urb_buffers(struct uvc_streaming *stream) { + struct device *dma_dev = uvc_stream_to_dmadev(stream); struct uvc_urb *uvc_urb; for_each_uvc_urb(uvc_urb, stream) { if (!uvc_urb->buffer) continue; -#ifndef CONFIG_DMA_NONCOHERENT - usb_free_coherent(stream->dev->udev, stream->urb_size, - uvc_urb->buffer, uvc_urb->dma); -#else - kfree(uvc_urb->buffer); -#endif + dma_vunmap_noncontiguous(dma_dev, uvc_urb->buffer); + dma_free_noncontiguous(dma_dev, stream->urb_size, uvc_urb->sgt, + uvc_stream_dir(stream)); + uvc_urb->buffer = NULL; + uvc_urb->sgt = NULL; } stream->urb_size = 0; } +static bool uvc_alloc_urb_buffer(struct uvc_streaming *stream, + struct uvc_urb *uvc_urb, gfp_t gfp_flags) +{ + struct device *dma_dev = uvc_stream_to_dmadev(stream); + + uvc_urb->sgt = dma_alloc_noncontiguous(dma_dev, stream->urb_size, + uvc_stream_dir(stream), + gfp_flags, 0); + if (!uvc_urb->sgt) + return false; + uvc_urb->dma = uvc_urb->sgt->sgl->dma_address; + + uvc_urb->buffer = dma_vmap_noncontiguous(dma_dev, stream->urb_size, + uvc_urb->sgt); + if (!uvc_urb->buffer) { + dma_free_noncontiguous(dma_dev, stream->urb_size, + uvc_urb->sgt, + uvc_stream_dir(stream)); + uvc_urb->sgt = NULL; + return false; + } + + return true; +} + /* * Allocate transfer buffers. This function can be called with buffers * already allocated when resuming from suspend, in which case it will @@ -1608,19 +1665,12 @@ static int uvc_alloc_urb_buffers(struct uvc_streaming *stream, /* Retry allocations until one succeed. */ for (; npackets > 1; npackets /= 2) { + stream->urb_size = psize * npackets; + for (i = 0; i < UVC_URBS; ++i) { struct uvc_urb *uvc_urb = &stream->uvc_urb[i]; - stream->urb_size = psize * npackets; -#ifndef CONFIG_DMA_NONCOHERENT - uvc_urb->buffer = usb_alloc_coherent( - stream->dev->udev, stream->urb_size, - gfp_flags | __GFP_NOWARN, &uvc_urb->dma); -#else - uvc_urb->buffer = - kmalloc(stream->urb_size, gfp_flags | __GFP_NOWARN); -#endif - if (!uvc_urb->buffer) { + if (!uvc_alloc_urb_buffer(stream, uvc_urb, gfp_flags)) { uvc_free_urb_buffers(stream); break; } @@ -1730,12 +1780,8 @@ static int uvc_init_video_isoc(struct uvc_streaming *stream, urb->context = uvc_urb; urb->pipe = usb_rcvisocpipe(stream->dev->udev, ep->desc.bEndpointAddress); -#ifndef CONFIG_DMA_NONCOHERENT urb->transfer_flags = URB_ISO_ASAP | URB_NO_TRANSFER_DMA_MAP; urb->transfer_dma = uvc_urb->dma; -#else - urb->transfer_flags = URB_ISO_ASAP; -#endif urb->interval = ep->desc.bInterval; urb->transfer_buffer = uvc_urb->buffer; urb->complete = uvc_video_complete; @@ -1795,10 +1841,8 @@ static int uvc_init_video_bulk(struct uvc_streaming *stream, usb_fill_bulk_urb(urb, stream->dev->udev, pipe, uvc_urb->buffer, size, uvc_video_complete, uvc_urb); -#ifndef CONFIG_DMA_NONCOHERENT urb->transfer_flags = URB_NO_TRANSFER_DMA_MAP; urb->transfer_dma = uvc_urb->dma; -#endif uvc_urb->urb = urb; } @@ -1895,7 +1939,7 @@ static int uvc_video_start_transfer(struct uvc_streaming *stream, /* Submit the URBs. */ for_each_uvc_urb(uvc_urb, stream) { - ret = usb_submit_urb(uvc_urb->urb, gfp_flags); + ret = uvc_submit_urb(uvc_urb, gfp_flags); if (ret < 0) { dev_err(&stream->intf->dev, "Failed to submit URB %u (%d).\n", diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h index 97df5ecd66c9..cce5e38133cd 100644 --- a/drivers/media/usb/uvc/uvcvideo.h +++ b/drivers/media/usb/uvc/uvcvideo.h @@ -219,6 +219,7 @@ */ struct gpio_desc; +struct sg_table; struct uvc_device; /* TODO: Put the most frequently accessed fields at the beginning of @@ -545,7 +546,8 @@ struct uvc_copy_op { * @urb: the URB described by this context structure * @stream: UVC streaming context * @buffer: memory storage for the URB - * @dma: DMA coherent addressing for the urb_buffer + * @dma: Allocated DMA handle + * @sgt: sgt_table with the urb locations in memory * @async_operations: counter to indicate the number of copy operations * @copy_operations: work descriptors for asynchronous copy operations * @work: work queue entry for asynchronous decode @@ -556,6 +558,7 @@ struct uvc_urb { char *buffer; dma_addr_t dma; + struct sg_table *sgt; unsigned int async_operations; struct uvc_copy_op copy_operations[UVC_MAX_PACKETS]; From 84fcfbdadbfdd86c9a43a52703203e05fe7efd92 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Fri, 12 Mar 2021 10:19:12 +0800 Subject: [PATCH 0166/1379] dma-mapping: remove a pointless empty line in dma_alloc_coherent Signed-off-by: Wang Qing Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 954847f9a3e0..e9d19b974f26 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -435,7 +435,6 @@ static inline void dma_sync_sgtable_for_device(struct device *dev, static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { - return dma_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } From ef6e01af398acff63eb33c58e72839e50a3e1c4b Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Fri, 19 Mar 2021 11:37:11 -0400 Subject: [PATCH 0167/1379] MAINTAINERS: Add co-maintainer for Qualcomm tsens thermal drivers Add myself as the maintainer for Qualcomm tsens drivers so that I can help Daniel by taking care of/reviewing changes to these drivers. Signed-off-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210319153711.2836652-1-thara.gopinath@linaro.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d92f85ca831d..f919aa861165 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14891,6 +14891,7 @@ F: include/linux/if_rmnet.h QUALCOMM TSENS THERMAL DRIVER M: Amit Kucheria +M: Thara Gopinath L: linux-pm@vger.kernel.org L: linux-arm-msm@vger.kernel.org S: Maintained From bfae2779fe4b92421fbe1b2008bfd6b9fada823e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Fri, 19 Mar 2021 20:35:58 -0700 Subject: [PATCH 0168/1379] Input: ims-pcu - drop redundant driver-data assignment The driver data for the data interface has already been set by usb_driver_claim_interface() so drop the subsequent redundant assignment. Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210318155525.22496-1-johan@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/misc/ims-pcu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/input/misc/ims-pcu.c b/drivers/input/misc/ims-pcu.c index 08b9b5cdb943..81de8c4e37d0 100644 --- a/drivers/input/misc/ims-pcu.c +++ b/drivers/input/misc/ims-pcu.c @@ -2018,7 +2018,6 @@ static int ims_pcu_probe(struct usb_interface *intf, } usb_set_intfdata(pcu->ctrl_intf, pcu); - usb_set_intfdata(pcu->data_intf, pcu); error = ims_pcu_buffers_alloc(pcu); if (error) From 4895bfe91199e19fd6bc9b43307cf4adb0409746 Mon Sep 17 00:00:00 2001 From: "edison.jiang" Date: Fri, 19 Mar 2021 20:56:00 -0700 Subject: [PATCH 0169/1379] Input: lpc32xx_ts - convert to use BIT() There is error from cppcheck tool. "Shifting signed 32-bit value by 31 bits is undefined behaviour errors" Signed-off-by: edison.jiang Link: https://lore.kernel.org/r/20210316153150.1207-1-jzp0409@163.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/lpc32xx_ts.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/input/touchscreen/lpc32xx_ts.c b/drivers/input/touchscreen/lpc32xx_ts.c index b51450b3d943..15b5cb763526 100644 --- a/drivers/input/touchscreen/lpc32xx_ts.c +++ b/drivers/input/touchscreen/lpc32xx_ts.c @@ -34,18 +34,18 @@ #define LPC32XX_TSC_AUX_MIN 0x38 #define LPC32XX_TSC_AUX_MAX 0x3C -#define LPC32XX_TSC_STAT_FIFO_OVRRN (1 << 8) -#define LPC32XX_TSC_STAT_FIFO_EMPTY (1 << 7) +#define LPC32XX_TSC_STAT_FIFO_OVRRN BIT(8) +#define LPC32XX_TSC_STAT_FIFO_EMPTY BIT(7) #define LPC32XX_TSC_SEL_DEFVAL 0x0284 #define LPC32XX_TSC_ADCCON_IRQ_TO_FIFO_4 (0x1 << 11) #define LPC32XX_TSC_ADCCON_X_SAMPLE_SIZE(s) ((10 - (s)) << 7) #define LPC32XX_TSC_ADCCON_Y_SAMPLE_SIZE(s) ((10 - (s)) << 4) -#define LPC32XX_TSC_ADCCON_POWER_UP (1 << 2) -#define LPC32XX_TSC_ADCCON_AUTO_EN (1 << 0) +#define LPC32XX_TSC_ADCCON_POWER_UP BIT(2) +#define LPC32XX_TSC_ADCCON_AUTO_EN BIT(0) -#define LPC32XX_TSC_FIFO_TS_P_LEVEL (1 << 31) +#define LPC32XX_TSC_FIFO_TS_P_LEVEL BIT(31) #define LPC32XX_TSC_FIFO_NORMALIZE_X_VAL(x) (((x) & 0x03FF0000) >> 16) #define LPC32XX_TSC_FIFO_NORMALIZE_Y_VAL(y) ((y) & 0x000003FF) From ae4c86a024f634d5523e048a68635ae62765fcc4 Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sat, 20 Mar 2021 19:19:58 -0700 Subject: [PATCH 0170/1379] dt-bindings: input: atmel_mxt_ts: Document atmel,wakeup-method and WAKE line GPIO Some Atmel touchscreen controllers have a WAKE line that needs to be asserted low in order to wake up controller from a deep sleep. Document the wakeup methods and the new GPIO properties. Reviewed-by: Rob Herring Reviewed-by: Linus Walleij Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20210302102158.10533-2-digetx@gmail.com Signed-off-by: Dmitry Torokhov --- .../bindings/input/atmel,maxtouch.yaml | 29 +++++++++++++++++++ include/dt-bindings/input/atmel-maxtouch.h | 10 +++++++ 2 files changed, 39 insertions(+) create mode 100644 include/dt-bindings/input/atmel-maxtouch.h diff --git a/Documentation/devicetree/bindings/input/atmel,maxtouch.yaml b/Documentation/devicetree/bindings/input/atmel,maxtouch.yaml index 8c6418f76e94..e6b03a1e7c30 100644 --- a/Documentation/devicetree/bindings/input/atmel,maxtouch.yaml +++ b/Documentation/devicetree/bindings/input/atmel,maxtouch.yaml @@ -39,6 +39,13 @@ properties: (active low). The line must be flagged with GPIO_ACTIVE_LOW. + wake-gpios: + maxItems: 1 + description: + Optional GPIO specifier for the touchscreen's wake pin + (active low). The line must be flagged with + GPIO_ACTIVE_LOW. + linux,gpio-keymap: $ref: /schemas/types.yaml#/definitions/uint32-array description: | @@ -53,6 +60,26 @@ properties: or experiment to determine which bit corresponds to which input. Use KEY_RESERVED for unused padding values. + atmel,wakeup-method: + $ref: /schemas/types.yaml#/definitions/uint32 + description: | + The WAKE line is an active-low input that is used to wake up the touch + controller from deep-sleep mode before communication with the controller + could be started. This optional feature used to minimize current + consumption when the controller is in deep sleep mode. This feature is + relevant only to some controller families, like mXT1386 controller for + example. + + The WAKE pin can be connected in one of the following ways: + 1) left permanently low + 2) connected to the I2C-compatible SCL pin + 3) connected to a GPIO pin on the host + enum: + - 0 # ATMEL_MXT_WAKEUP_NONE + - 1 # ATMEL_MXT_WAKEUP_I2C_SCL + - 2 # ATMEL_MXT_WAKEUP_GPIO + default: 0 + required: - compatible - reg @@ -63,6 +90,7 @@ additionalProperties: false examples: - | #include + #include #include i2c { #address-cells = <1>; @@ -75,6 +103,7 @@ examples: reset-gpios = <&gpio 27 GPIO_ACTIVE_LOW>; vdda-supply = <&ab8500_ldo_aux2_reg>; vdd-supply = <&ab8500_ldo_aux5_reg>; + atmel,wakeup-method = ; }; }; diff --git a/include/dt-bindings/input/atmel-maxtouch.h b/include/dt-bindings/input/atmel-maxtouch.h new file mode 100644 index 000000000000..7345ab32224d --- /dev/null +++ b/include/dt-bindings/input/atmel-maxtouch.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef _DT_BINDINGS_ATMEL_MAXTOUCH_H +#define _DT_BINDINGS_ATMEL_MAXTOUCH_H + +#define ATMEL_MXT_WAKEUP_NONE 0 +#define ATMEL_MXT_WAKEUP_I2C_SCL 1 +#define ATMEL_MXT_WAKEUP_GPIO 2 + +#endif /* _DT_BINDINGS_ATMEL_MAXTOUCH_H */ From 8b488ef295f206885dbe48de09346059df620dfa Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sat, 20 Mar 2021 19:23:44 -0700 Subject: [PATCH 0171/1379] Input: atmel_mxt_ts - support wakeup methods According to datasheets, chips like mXT1386 have a WAKE line, it is used to wake the chip up from deep sleep mode before communicating with it via the I2C-compatible interface. If the WAKE line is connected to a GPIO line, the line must be asserted 25 ms before the host attempts to communicate with the controller. If the WAKE line is connected to the SCL pin, the controller will send a NACK on the first attempt to address it, the host must then retry 25 ms later. Implement the wake-up methods in the driver. Touchscreen now works properly on devices like Acer A500 tablet, fixing problems like this: atmel_mxt_ts 0-004c: __mxt_read_reg: i2c transfer failed (-121) atmel_mxt_ts 0-004c: mxt_bootloader_read: i2c recv failed (-121) atmel_mxt_ts 0-004c: Trying alternate bootloader address atmel_mxt_ts 0-004c: mxt_bootloader_read: i2c recv failed (-121) atmel_mxt_ts: probe of 0-004c failed with error -121 Reviewed-by: Linus Walleij Signed-off-by: Jiada Wang Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20210302102158.10533-3-digetx@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/atmel_mxt_ts.c | 78 ++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 383a848eb601..5ed23689047b 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -31,6 +31,7 @@ #include #include #include +#include /* Firmware files */ #define MXT_FW_NAME "maxtouch.fw" @@ -199,6 +200,7 @@ enum t100_type { #define MXT_CRC_TIMEOUT 1000 /* msec */ #define MXT_FW_RESET_TIME 3000 /* msec */ #define MXT_FW_CHG_TIMEOUT 300 /* msec */ +#define MXT_WAKEUP_TIME 25 /* msec */ /* Command to unlock bootloader */ #define MXT_UNLOCK_CMD_MSB 0xaa @@ -312,6 +314,7 @@ struct mxt_data { struct mxt_dbg dbg; struct regulator_bulk_data regulators[2]; struct gpio_desc *reset_gpio; + struct gpio_desc *wake_gpio; bool use_retrigen_workaround; /* Cached parameters from object table */ @@ -342,6 +345,8 @@ struct mxt_data { unsigned int t19_num_keys; enum mxt_suspend_mode suspend_mode; + + u32 wakeup_method; }; struct mxt_vb2_buffer { @@ -621,10 +626,42 @@ static int mxt_send_bootloader_cmd(struct mxt_data *data, bool unlock) return mxt_bootloader_write(data, buf, sizeof(buf)); } +static bool mxt_wakeup_toggle(struct i2c_client *client, + bool wake_up, bool in_i2c) +{ + struct mxt_data *data = i2c_get_clientdata(client); + + switch (data->wakeup_method) { + case ATMEL_MXT_WAKEUP_I2C_SCL: + if (!in_i2c) + return false; + break; + + case ATMEL_MXT_WAKEUP_GPIO: + if (in_i2c) + return false; + + gpiod_set_value(data->wake_gpio, wake_up); + break; + + default: + return false; + } + + if (wake_up) { + dev_dbg(&client->dev, "waking up controller\n"); + + msleep(MXT_WAKEUP_TIME); + } + + return true; +} + static int __mxt_read_reg(struct i2c_client *client, u16 reg, u16 len, void *val) { struct i2c_msg xfer[2]; + bool retried = false; u8 buf[2]; int ret; @@ -643,9 +680,13 @@ static int __mxt_read_reg(struct i2c_client *client, xfer[1].len = len; xfer[1].buf = val; +retry: ret = i2c_transfer(client->adapter, xfer, 2); if (ret == 2) { ret = 0; + } else if (!retried && mxt_wakeup_toggle(client, true, true)) { + retried = true; + goto retry; } else { if (ret >= 0) ret = -EIO; @@ -659,6 +700,7 @@ static int __mxt_read_reg(struct i2c_client *client, static int __mxt_write_reg(struct i2c_client *client, u16 reg, u16 len, const void *val) { + bool retried = false; u8 *buf; size_t count; int ret; @@ -672,9 +714,13 @@ static int __mxt_write_reg(struct i2c_client *client, u16 reg, u16 len, buf[1] = (reg >> 8) & 0xff; memcpy(&buf[2], val, len); +retry: ret = i2c_master_send(client, buf, count); if (ret == count) { ret = 0; + } else if (!retried && mxt_wakeup_toggle(client, true, true)) { + retried = true; + goto retry; } else { if (ret >= 0) ret = -EIO; @@ -2975,6 +3021,8 @@ static const struct attribute_group mxt_attr_group = { static void mxt_start(struct mxt_data *data) { + mxt_wakeup_toggle(data->client, true, false); + switch (data->suspend_mode) { case MXT_SUSPEND_T9_CTRL: mxt_soft_reset(data); @@ -3009,6 +3057,8 @@ static void mxt_stop(struct mxt_data *data) mxt_set_t7_power_cfg(data, MXT_POWER_CFG_DEEPSLEEP); break; } + + mxt_wakeup_toggle(data->client, false, false); } static int mxt_input_open(struct input_dev *dev) @@ -3155,6 +3205,15 @@ static int mxt_probe(struct i2c_client *client, const struct i2c_device_id *id) return error; } + /* Request the WAKE line as asserted so we go out of sleep */ + data->wake_gpio = devm_gpiod_get_optional(&client->dev, + "wake", GPIOD_OUT_HIGH); + if (IS_ERR(data->wake_gpio)) { + error = PTR_ERR(data->wake_gpio); + dev_err(&client->dev, "Failed to get wake gpio: %d\n", error); + return error; + } + error = devm_request_threaded_irq(&client->dev, client->irq, NULL, mxt_interrupt, IRQF_ONESHOT, client->name, data); @@ -3185,6 +3244,25 @@ static int mxt_probe(struct i2c_client *client, const struct i2c_device_id *id) msleep(MXT_RESET_INVALID_CHG); } + /* + * Controllers like mXT1386 have a dedicated WAKE line that could be + * connected to a GPIO or to I2C SCL pin, or permanently asserted low. + * + * This WAKE line is used for waking controller from a deep-sleep and + * it needs to be asserted low for 25 milliseconds before I2C transfers + * could be accepted by controller if it was in a deep-sleep mode. + * Controller will go into sleep automatically after 2 seconds of + * inactivity if WAKE line is deasserted and deep sleep is activated. + * + * If WAKE line is connected to I2C SCL pin, then the first I2C transfer + * will get an instant NAK and transfer needs to be retried after 25ms. + * + * If WAKE line is connected to a GPIO line, the line must be asserted + * 25ms before the host attempts to communicate with the controller. + */ + device_property_read_u32(&client->dev, "atmel,wakeup-method", + &data->wakeup_method); + error = mxt_initialize(data); if (error) goto err_disable_regulators; From ca0d2fb790eb26fc53d851007ed1ead6c048be11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 14 Jan 2021 21:48:04 +0100 Subject: [PATCH 0172/1379] pwm: bcm2835: Improve period and duty cycle calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With an input clk rate bigger than 2000000000, scaler would have been zero which then would have resulted in a division by zero. Also the originally implemented algorithm divided by the result of a division. This nearly always looses precision. Consider a requested period of 1000000 ns. With an input clock frequency of 32786885 Hz the hardware was configured with an actual period of 983869.007 ns (PERIOD = 32258) while the hardware can provide 1000003.508 ns (PERIOD = 32787). And note if the input clock frequency was 32786886 Hz instead, the hardware was configured to 1016656.477 ns (PERIOD = 33333) while the optimal setting results in 1000003.477 ns (PERIOD = 32787). This patch implements proper range checking and only divides once for the calculation of period (and similar for duty_cycle). Signed-off-by: Uwe Kleine-König Reviewed-by: Lino Sanfilippo Tested-by: Lino Sanfilippo Signed-off-by: Thierry Reding --- drivers/pwm/pwm-bcm2835.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c index 6ff5f04b3e07..d593cce249d9 100644 --- a/drivers/pwm/pwm-bcm2835.c +++ b/drivers/pwm/pwm-bcm2835.c @@ -64,8 +64,9 @@ static int bcm2835_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, struct bcm2835_pwm *pc = to_bcm2835_pwm(chip); unsigned long rate = clk_get_rate(pc->clk); - unsigned long long period; - unsigned long scaler; + unsigned long long period_cycles; + u64 max_period; + u32 val; if (!rate) { @@ -73,18 +74,36 @@ static int bcm2835_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return -EINVAL; } - scaler = DIV_ROUND_CLOSEST(NSEC_PER_SEC, rate); - /* set period */ - period = DIV_ROUND_CLOSEST_ULL(state->period, scaler); + /* + * period_cycles must be a 32 bit value, so period * rate / NSEC_PER_SEC + * must be <= U32_MAX. As U32_MAX * NSEC_PER_SEC < U64_MAX the + * multiplication period * rate doesn't overflow. + * To calculate the maximal possible period that guarantees the + * above inequality: + * + * round(period * rate / NSEC_PER_SEC) <= U32_MAX + * <=> period * rate / NSEC_PER_SEC < U32_MAX + 0.5 + * <=> period * rate < (U32_MAX + 0.5) * NSEC_PER_SEC + * <=> period < ((U32_MAX + 0.5) * NSEC_PER_SEC) / rate + * <=> period < ((U32_MAX * NSEC_PER_SEC + NSEC_PER_SEC/2) / rate + * <=> period <= ceil((U32_MAX * NSEC_PER_SEC + NSEC_PER_SEC/2) / rate) - 1 + */ + max_period = DIV_ROUND_UP_ULL((u64)U32_MAX * NSEC_PER_SEC + NSEC_PER_SEC / 2, rate) - 1; - /* dont accept a period that is too small or has been truncated */ - if ((period < PERIOD_MIN) || (period > U32_MAX)) + if (state->period > max_period) return -EINVAL; - writel(period, pc->base + PERIOD(pwm->hwpwm)); + /* set period */ + period_cycles = DIV_ROUND_CLOSEST_ULL(state->period * rate, NSEC_PER_SEC); + + /* don't accept a period that is too small */ + if (period_cycles < PERIOD_MIN) + return -EINVAL; + + writel(period_cycles, pc->base + PERIOD(pwm->hwpwm)); /* set duty cycle */ - val = DIV_ROUND_CLOSEST_ULL(state->duty_cycle, scaler); + val = DIV_ROUND_CLOSEST_ULL(state->duty_cycle * rate, NSEC_PER_SEC); writel(val, pc->base + DUTY(pwm->hwpwm)); /* set polarity */ From acf3402d83636ef4fb81aa35593f1c1fd7f05738 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 1 Mar 2021 19:45:37 +0100 Subject: [PATCH 0173/1379] pwm: ab8500: Implement .apply instead of .config, .enable and .disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). Signed-off-by: Uwe Kleine-König Reviewed-by: Linus Walleij Signed-off-by: Thierry Reding --- drivers/pwm/pwm-ab8500.c | 53 +++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/drivers/pwm/pwm-ab8500.c b/drivers/pwm/pwm-ab8500.c index 58c6c0f5b0ec..5b0a71243d0f 100644 --- a/drivers/pwm/pwm-ab8500.c +++ b/drivers/pwm/pwm-ab8500.c @@ -24,23 +24,37 @@ struct ab8500_pwm_chip { struct pwm_chip chip; }; -static int ab8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) +static int ab8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) { - int ret = 0; - unsigned int higher_val, lower_val; + int ret; u8 reg; + unsigned int higher_val, lower_val; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + ret = abx500_mask_and_set_register_interruptible(chip->dev, + AB8500_MISC, AB8500_PWM_OUT_CTRL7_REG, + 1 << (chip->base - 1), 0); + + if (ret < 0) + dev_err(chip->dev, "%s: Failed to disable PWM, Error %d\n", + pwm->label, ret); + return ret; + } /* * get the first 8 bits that are be written to * AB8500_PWM_OUT_CTRL1_REG[0:7] */ - lower_val = duty_ns & 0x00FF; + lower_val = state->duty_cycle & 0x00FF; /* * get bits [9:10] that are to be written to * AB8500_PWM_OUT_CTRL2_REG[0:1] */ - higher_val = ((duty_ns & 0x0300) >> 8); + higher_val = ((state->duty_cycle & 0x0300) >> 8); reg = AB8500_PWM_OUT_CTRL1_REG + ((chip->base - 1) * 2); @@ -48,15 +62,11 @@ static int ab8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, reg, (u8)lower_val); if (ret < 0) return ret; + ret = abx500_set_register_interruptible(chip->dev, AB8500_MISC, (reg + 1), (u8)higher_val); - - return ret; -} - -static int ab8500_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - int ret; + if (ret < 0) + return ret; ret = abx500_mask_and_set_register_interruptible(chip->dev, AB8500_MISC, AB8500_PWM_OUT_CTRL7_REG, @@ -64,25 +74,12 @@ static int ab8500_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) if (ret < 0) dev_err(chip->dev, "%s: Failed to enable PWM, Error %d\n", pwm->label, ret); + return ret; } -static void ab8500_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - int ret; - - ret = abx500_mask_and_set_register_interruptible(chip->dev, - AB8500_MISC, AB8500_PWM_OUT_CTRL7_REG, - 1 << (chip->base - 1), 0); - if (ret < 0) - dev_err(chip->dev, "%s: Failed to disable PWM, Error %d\n", - pwm->label, ret); -} - static const struct pwm_ops ab8500_pwm_ops = { - .config = ab8500_pwm_config, - .enable = ab8500_pwm_enable, - .disable = ab8500_pwm_disable, + .apply = ab8500_pwm_apply, .owner = THIS_MODULE, }; From 5a43c201c9d05a65f1997877ba45ec41ee91b8b5 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Wed, 3 Mar 2021 23:42:42 -0300 Subject: [PATCH 0174/1379] pwm: imx-tpm: Use a single line for error message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to split the dev_err() call in three lines. Use a single line to improve readability. Signed-off-by: Fabio Estevam Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-imx-tpm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-imx-tpm.c b/drivers/pwm/pwm-imx-tpm.c index aaf629bd8c35..eec9ec4e1a2a 100644 --- a/drivers/pwm/pwm-imx-tpm.c +++ b/drivers/pwm/pwm-imx-tpm.c @@ -411,9 +411,7 @@ static int __maybe_unused pwm_imx_tpm_resume(struct device *dev) ret = clk_prepare_enable(tpm->clk); if (ret) - dev_err(dev, - "failed to prepare or enable clock: %d\n", - ret); + dev_err(dev, "failed to prepare or enable clock: %d\n", ret); return ret; } From f9a8ee8c8bcd118e800d88772c6457381db45224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 1 Mar 2021 19:57:19 +0100 Subject: [PATCH 0175/1379] pwm: Always allocate PWM chip base ID dynamically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 5e5da1e9fbee ("pwm: ab8500: Explicitly allocate pwm chip base dynamically") all drivers use dynamic ID allocation explicitly. New drivers are supposed to do the same, so remove support for driver specified base IDs and drop all assignments in the low-level drivers. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 25 +++++++------------------ drivers/pwm/pwm-ab8500.c | 1 - drivers/pwm/pwm-atmel-hlcdc.c | 1 - drivers/pwm/pwm-atmel-tcb.c | 1 - drivers/pwm/pwm-atmel.c | 1 - drivers/pwm/pwm-bcm-iproc.c | 1 - drivers/pwm/pwm-bcm-kona.c | 1 - drivers/pwm/pwm-bcm2835.c | 1 - drivers/pwm/pwm-berlin.c | 1 - drivers/pwm/pwm-brcmstb.c | 1 - drivers/pwm/pwm-clps711x.c | 1 - drivers/pwm/pwm-crc.c | 1 - drivers/pwm/pwm-cros-ec.c | 1 - drivers/pwm/pwm-dwc.c | 1 - drivers/pwm/pwm-ep93xx.c | 1 - drivers/pwm/pwm-fsl-ftm.c | 1 - drivers/pwm/pwm-hibvt.c | 1 - drivers/pwm/pwm-img.c | 1 - drivers/pwm/pwm-imx-tpm.c | 1 - drivers/pwm/pwm-imx1.c | 1 - drivers/pwm/pwm-imx27.c | 1 - drivers/pwm/pwm-intel-lgm.c | 1 - drivers/pwm/pwm-iqs620a.c | 1 - drivers/pwm/pwm-jz4740.c | 1 - drivers/pwm/pwm-keembay.c | 1 - drivers/pwm/pwm-lp3943.c | 1 - drivers/pwm/pwm-lpc18xx-sct.c | 1 - drivers/pwm/pwm-lpc32xx.c | 1 - drivers/pwm/pwm-lpss.c | 1 - drivers/pwm/pwm-mediatek.c | 1 - drivers/pwm/pwm-meson.c | 1 - drivers/pwm/pwm-mtk-disp.c | 1 - drivers/pwm/pwm-mxs.c | 1 - drivers/pwm/pwm-omap-dmtimer.c | 1 - drivers/pwm/pwm-pca9685.c | 1 - drivers/pwm/pwm-pxa.c | 1 - drivers/pwm/pwm-rcar.c | 1 - drivers/pwm/pwm-renesas-tpu.c | 1 - drivers/pwm/pwm-rockchip.c | 1 - drivers/pwm/pwm-samsung.c | 1 - drivers/pwm/pwm-sifive.c | 1 - drivers/pwm/pwm-sl28cpld.c | 1 - drivers/pwm/pwm-spear.c | 1 - drivers/pwm/pwm-sprd.c | 1 - drivers/pwm/pwm-sti.c | 1 - drivers/pwm/pwm-stm32-lp.c | 1 - drivers/pwm/pwm-stm32.c | 1 - drivers/pwm/pwm-stmpe.c | 1 - drivers/pwm/pwm-sun4i.c | 1 - drivers/pwm/pwm-tegra.c | 1 - drivers/pwm/pwm-tiecap.c | 1 - drivers/pwm/pwm-tiehrpwm.c | 1 - drivers/pwm/pwm-twl-led.c | 1 - drivers/pwm/pwm-twl.c | 1 - drivers/pwm/pwm-vt8500.c | 1 - 55 files changed, 7 insertions(+), 72 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index a8eff4b3ee36..8904eaa6d769 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -37,23 +37,13 @@ static struct pwm_device *pwm_to_device(unsigned int pwm) return radix_tree_lookup(&pwm_tree, pwm); } -static int alloc_pwms(int pwm, unsigned int count) +static int alloc_pwms(unsigned int count) { - unsigned int from = 0; unsigned int start; - if (pwm >= MAX_PWMS) - return -EINVAL; - - if (pwm >= 0) - from = pwm; - - start = bitmap_find_next_zero_area(allocated_pwms, MAX_PWMS, from, + start = bitmap_find_next_zero_area(allocated_pwms, MAX_PWMS, 0, count, 0); - if (pwm >= 0 && start != pwm) - return -EEXIST; - if (start + count > MAX_PWMS) return -ENOSPC; @@ -264,9 +254,8 @@ static bool pwm_ops_check(const struct pwm_chip *chip) * @chip: the PWM chip to add * @polarity: initial polarity of PWM channels * - * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base - * will be used. The initial polarity for all channels is specified by the - * @polarity parameter. + * Register a new PWM chip. The initial polarity for all channels is specified + * by the @polarity parameter. * * Returns: 0 on success or a negative error code on failure. */ @@ -285,18 +274,18 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip, mutex_lock(&pwm_lock); - ret = alloc_pwms(chip->base, chip->npwm); + ret = alloc_pwms(chip->npwm); if (ret < 0) goto out; + chip->base = ret; + chip->pwms = kcalloc(chip->npwm, sizeof(*pwm), GFP_KERNEL); if (!chip->pwms) { ret = -ENOMEM; goto out; } - chip->base = ret; - for (i = 0; i < chip->npwm; i++) { pwm = &chip->pwms[i]; diff --git a/drivers/pwm/pwm-ab8500.c b/drivers/pwm/pwm-ab8500.c index 5b0a71243d0f..e2a26d9da25b 100644 --- a/drivers/pwm/pwm-ab8500.c +++ b/drivers/pwm/pwm-ab8500.c @@ -98,7 +98,6 @@ static int ab8500_pwm_probe(struct platform_device *pdev) ab8500->chip.dev = &pdev->dev; ab8500->chip.ops = &ab8500_pwm_ops; - ab8500->chip.base = -1; ab8500->chip.npwm = 1; err = pwmchip_add(&ab8500->chip); diff --git a/drivers/pwm/pwm-atmel-hlcdc.c b/drivers/pwm/pwm-atmel-hlcdc.c index dcbc0489dfd4..538dbafe3e75 100644 --- a/drivers/pwm/pwm-atmel-hlcdc.c +++ b/drivers/pwm/pwm-atmel-hlcdc.c @@ -265,7 +265,6 @@ static int atmel_hlcdc_pwm_probe(struct platform_device *pdev) chip->hlcdc = hlcdc; chip->chip.ops = &atmel_hlcdc_pwm_ops; chip->chip.dev = dev; - chip->chip.base = -1; chip->chip.npwm = 1; chip->chip.of_xlate = of_pwm_xlate_with_flags; chip->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-atmel-tcb.c b/drivers/pwm/pwm-atmel-tcb.c index 5ccc3e7420e9..ee70a615532b 100644 --- a/drivers/pwm/pwm-atmel-tcb.c +++ b/drivers/pwm/pwm-atmel-tcb.c @@ -454,7 +454,6 @@ static int atmel_tcb_pwm_probe(struct platform_device *pdev) tcbpwm->chip.ops = &atmel_tcb_pwm_ops; tcbpwm->chip.of_xlate = of_pwm_xlate_with_flags; tcbpwm->chip.of_pwm_n_cells = 3; - tcbpwm->chip.base = -1; tcbpwm->chip.npwm = NPWM; tcbpwm->channel = channel; tcbpwm->regmap = regmap; diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index 5813339b597b..a4d0be6b265b 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -429,7 +429,6 @@ static int atmel_pwm_probe(struct platform_device *pdev) atmel_pwm->chip.ops = &atmel_pwm_ops; atmel_pwm->chip.of_xlate = of_pwm_xlate_with_flags; atmel_pwm->chip.of_pwm_n_cells = 3; - atmel_pwm->chip.base = -1; atmel_pwm->chip.npwm = 4; ret = pwmchip_add(&atmel_pwm->chip); diff --git a/drivers/pwm/pwm-bcm-iproc.c b/drivers/pwm/pwm-bcm-iproc.c index f4853c4a2d75..529a66ab692d 100644 --- a/drivers/pwm/pwm-bcm-iproc.c +++ b/drivers/pwm/pwm-bcm-iproc.c @@ -209,7 +209,6 @@ static int iproc_pwmc_probe(struct platform_device *pdev) ip->chip.dev = &pdev->dev; ip->chip.ops = &iproc_pwm_ops; - ip->chip.base = -1; ip->chip.npwm = 4; ip->chip.of_xlate = of_pwm_xlate_with_flags; ip->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-bcm-kona.c b/drivers/pwm/pwm-bcm-kona.c index 578b3621c97e..2e8aede0318c 100644 --- a/drivers/pwm/pwm-bcm-kona.c +++ b/drivers/pwm/pwm-bcm-kona.c @@ -271,7 +271,6 @@ static int kona_pwmc_probe(struct platform_device *pdev) kp->chip.dev = &pdev->dev; kp->chip.ops = &kona_pwm_ops; - kp->chip.base = -1; kp->chip.npwm = 6; kp->chip.of_xlate = of_pwm_xlate_with_flags; kp->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c index d593cce249d9..e4b54675b356 100644 --- a/drivers/pwm/pwm-bcm2835.c +++ b/drivers/pwm/pwm-bcm2835.c @@ -158,7 +158,6 @@ static int bcm2835_pwm_probe(struct platform_device *pdev) pc->chip.dev = &pdev->dev; pc->chip.ops = &bcm2835_pwm_ops; - pc->chip.base = -1; pc->chip.npwm = 2; pc->chip.of_xlate = of_pwm_xlate_with_flags; pc->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-berlin.c b/drivers/pwm/pwm-berlin.c index fe405289e582..acb6fbc3cc32 100644 --- a/drivers/pwm/pwm-berlin.c +++ b/drivers/pwm/pwm-berlin.c @@ -206,7 +206,6 @@ static int berlin_pwm_probe(struct platform_device *pdev) pwm->chip.dev = &pdev->dev; pwm->chip.ops = &berlin_pwm_ops; - pwm->chip.base = -1; pwm->chip.npwm = 4; pwm->chip.of_xlate = of_pwm_xlate_with_flags; pwm->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-brcmstb.c b/drivers/pwm/pwm-brcmstb.c index 8b66f9d2f589..8b1d1e7aa856 100644 --- a/drivers/pwm/pwm-brcmstb.c +++ b/drivers/pwm/pwm-brcmstb.c @@ -258,7 +258,6 @@ static int brcmstb_pwm_probe(struct platform_device *pdev) p->chip.dev = &pdev->dev; p->chip.ops = &brcmstb_pwm_ops; - p->chip.base = -1; p->chip.npwm = 2; p->base = devm_platform_ioremap_resource(pdev, 0); diff --git a/drivers/pwm/pwm-clps711x.c b/drivers/pwm/pwm-clps711x.c index cb1af86873ee..f3d17a590305 100644 --- a/drivers/pwm/pwm-clps711x.c +++ b/drivers/pwm/pwm-clps711x.c @@ -128,7 +128,6 @@ static int clps711x_pwm_probe(struct platform_device *pdev) priv->chip.ops = &clps711x_pwm_ops; priv->chip.dev = &pdev->dev; - priv->chip.base = -1; priv->chip.npwm = 2; priv->chip.of_xlate = clps711x_pwm_xlate; priv->chip.of_pwm_n_cells = 1; diff --git a/drivers/pwm/pwm-crc.c b/drivers/pwm/pwm-crc.c index 1e2276808b7a..02522a9a3073 100644 --- a/drivers/pwm/pwm-crc.c +++ b/drivers/pwm/pwm-crc.c @@ -168,7 +168,6 @@ static int crystalcove_pwm_probe(struct platform_device *pdev) pwm->chip.dev = &pdev->dev; pwm->chip.ops = &crc_pwm_ops; - pwm->chip.base = -1; pwm->chip.npwm = 1; /* get the PMIC regmap */ diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c index c1c337969e4e..d3115cb0e058 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -253,7 +253,6 @@ static int cros_ec_pwm_probe(struct platform_device *pdev) chip->ops = &cros_ec_pwm_ops; chip->of_xlate = cros_ec_pwm_xlate; chip->of_pwm_n_cells = 1; - chip->base = -1; ret = cros_ec_num_pwms(ec); if (ret < 0) { dev_err(dev, "Couldn't find PWMs: %d\n", ret); diff --git a/drivers/pwm/pwm-dwc.c b/drivers/pwm/pwm-dwc.c index f6c98e0d57c2..7568300bb11e 100644 --- a/drivers/pwm/pwm-dwc.c +++ b/drivers/pwm/pwm-dwc.c @@ -233,7 +233,6 @@ static int dwc_pwm_probe(struct pci_dev *pci, const struct pci_device_id *id) dwc->chip.dev = dev; dwc->chip.ops = &dwc_pwm_ops; dwc->chip.npwm = DWC_TIMERS_TOTAL; - dwc->chip.base = -1; ret = pwmchip_add(&dwc->chip); if (ret) diff --git a/drivers/pwm/pwm-ep93xx.c b/drivers/pwm/pwm-ep93xx.c index c9fc6f223640..4ca70794ad96 100644 --- a/drivers/pwm/pwm-ep93xx.c +++ b/drivers/pwm/pwm-ep93xx.c @@ -185,7 +185,6 @@ static int ep93xx_pwm_probe(struct platform_device *pdev) ep93xx_pwm->chip.dev = &pdev->dev; ep93xx_pwm->chip.ops = &ep93xx_pwm_ops; - ep93xx_pwm->chip.base = -1; ep93xx_pwm->chip.npwm = 1; ret = pwmchip_add(&ep93xx_pwm->chip); diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c index 2a6801226aba..0e1ae9469eda 100644 --- a/drivers/pwm/pwm-fsl-ftm.c +++ b/drivers/pwm/pwm-fsl-ftm.c @@ -453,7 +453,6 @@ static int fsl_pwm_probe(struct platform_device *pdev) fpc->chip.ops = &fsl_pwm_ops; fpc->chip.of_xlate = of_pwm_xlate_with_flags; fpc->chip.of_pwm_n_cells = 3; - fpc->chip.base = -1; fpc->chip.npwm = 8; ret = pwmchip_add(&fpc->chip); diff --git a/drivers/pwm/pwm-hibvt.c b/drivers/pwm/pwm-hibvt.c index a1900d0a872e..82d17fc75c21 100644 --- a/drivers/pwm/pwm-hibvt.c +++ b/drivers/pwm/pwm-hibvt.c @@ -205,7 +205,6 @@ static int hibvt_pwm_probe(struct platform_device *pdev) pwm_chip->chip.ops = &hibvt_pwm_ops; pwm_chip->chip.dev = &pdev->dev; - pwm_chip->chip.base = -1; pwm_chip->chip.npwm = soc->num_pwms; pwm_chip->chip.of_xlate = of_pwm_xlate_with_flags; pwm_chip->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-img.c b/drivers/pwm/pwm-img.c index 6faf5b5a5584..cc37054589cc 100644 --- a/drivers/pwm/pwm-img.c +++ b/drivers/pwm/pwm-img.c @@ -304,7 +304,6 @@ static int img_pwm_probe(struct platform_device *pdev) pwm->chip.dev = &pdev->dev; pwm->chip.ops = &img_pwm_ops; - pwm->chip.base = -1; pwm->chip.npwm = IMG_PWM_NPWM; ret = pwmchip_add(&pwm->chip); diff --git a/drivers/pwm/pwm-imx-tpm.c b/drivers/pwm/pwm-imx-tpm.c index eec9ec4e1a2a..97c9133b6876 100644 --- a/drivers/pwm/pwm-imx-tpm.c +++ b/drivers/pwm/pwm-imx-tpm.c @@ -363,7 +363,6 @@ static int pwm_imx_tpm_probe(struct platform_device *pdev) tpm->chip.dev = &pdev->dev; tpm->chip.ops = &imx_tpm_pwm_ops; - tpm->chip.base = -1; tpm->chip.of_xlate = of_pwm_xlate_with_flags; tpm->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-imx1.c b/drivers/pwm/pwm-imx1.c index 727e0d3e249e..c957b365448e 100644 --- a/drivers/pwm/pwm-imx1.c +++ b/drivers/pwm/pwm-imx1.c @@ -155,7 +155,6 @@ static int pwm_imx1_probe(struct platform_device *pdev) imx->chip.ops = &pwm_imx1_ops; imx->chip.dev = &pdev->dev; - imx->chip.base = -1; imx->chip.npwm = 1; imx->mmio_base = devm_platform_ioremap_resource(pdev, 0); diff --git a/drivers/pwm/pwm-imx27.c b/drivers/pwm/pwm-imx27.c index 18055326a2f3..ba695115c160 100644 --- a/drivers/pwm/pwm-imx27.c +++ b/drivers/pwm/pwm-imx27.c @@ -327,7 +327,6 @@ static int pwm_imx27_probe(struct platform_device *pdev) imx->chip.ops = &pwm_imx27_ops; imx->chip.dev = &pdev->dev; - imx->chip.base = -1; imx->chip.npwm = 1; imx->chip.of_xlate = of_pwm_xlate_with_flags; diff --git a/drivers/pwm/pwm-intel-lgm.c b/drivers/pwm/pwm-intel-lgm.c index e9e54dda07aa..015f5eba09a1 100644 --- a/drivers/pwm/pwm-intel-lgm.c +++ b/drivers/pwm/pwm-intel-lgm.c @@ -207,7 +207,6 @@ static int lgm_pwm_probe(struct platform_device *pdev) pc->chip.dev = dev; pc->chip.ops = &lgm_pwm_ops; pc->chip.npwm = 1; - pc->chip.base = -1; lgm_pwm_init(pc); diff --git a/drivers/pwm/pwm-iqs620a.c b/drivers/pwm/pwm-iqs620a.c index 957b972c458b..6c6e26d18329 100644 --- a/drivers/pwm/pwm-iqs620a.c +++ b/drivers/pwm/pwm-iqs620a.c @@ -206,7 +206,6 @@ static int iqs620_pwm_probe(struct platform_device *pdev) iqs620_pwm->chip.dev = &pdev->dev; iqs620_pwm->chip.ops = &iqs620_pwm_ops; - iqs620_pwm->chip.base = -1; iqs620_pwm->chip.npwm = 1; mutex_init(&iqs620_pwm->lock); diff --git a/drivers/pwm/pwm-jz4740.c b/drivers/pwm/pwm-jz4740.c index 00c642fa2eed..5b6bdcdcecf5 100644 --- a/drivers/pwm/pwm-jz4740.c +++ b/drivers/pwm/pwm-jz4740.c @@ -244,7 +244,6 @@ static int jz4740_pwm_probe(struct platform_device *pdev) jz4740->chip.dev = dev; jz4740->chip.ops = &jz4740_pwm_ops; jz4740->chip.npwm = info->num_pwms; - jz4740->chip.base = -1; jz4740->chip.of_xlate = of_pwm_xlate_with_flags; jz4740->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-keembay.c b/drivers/pwm/pwm-keembay.c index cdfdef66ff8e..521a825c8ba0 100644 --- a/drivers/pwm/pwm-keembay.c +++ b/drivers/pwm/pwm-keembay.c @@ -203,7 +203,6 @@ static int keembay_pwm_probe(struct platform_device *pdev) if (ret) return ret; - priv->chip.base = -1; priv->chip.dev = dev; priv->chip.ops = &keembay_pwm_ops; priv->chip.npwm = KMB_TOTAL_PWM_CHANNELS; diff --git a/drivers/pwm/pwm-lp3943.c b/drivers/pwm/pwm-lp3943.c index bf3f14fb5f24..7551253ada32 100644 --- a/drivers/pwm/pwm-lp3943.c +++ b/drivers/pwm/pwm-lp3943.c @@ -275,7 +275,6 @@ static int lp3943_pwm_probe(struct platform_device *pdev) lp3943_pwm->chip.dev = &pdev->dev; lp3943_pwm->chip.ops = &lp3943_pwm_ops; lp3943_pwm->chip.npwm = LP3943_NUM_PWMS; - lp3943_pwm->chip.base = -1; platform_set_drvdata(pdev, lp3943_pwm); diff --git a/drivers/pwm/pwm-lpc18xx-sct.c b/drivers/pwm/pwm-lpc18xx-sct.c index 7ef40243eb6c..3f8e54ec28c6 100644 --- a/drivers/pwm/pwm-lpc18xx-sct.c +++ b/drivers/pwm/pwm-lpc18xx-sct.c @@ -370,7 +370,6 @@ static int lpc18xx_pwm_probe(struct platform_device *pdev) lpc18xx_pwm->chip.dev = &pdev->dev; lpc18xx_pwm->chip.ops = &lpc18xx_pwm_ops; - lpc18xx_pwm->chip.base = -1; lpc18xx_pwm->chip.npwm = 16; lpc18xx_pwm->chip.of_xlate = of_pwm_xlate_with_flags; lpc18xx_pwm->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-lpc32xx.c b/drivers/pwm/pwm-lpc32xx.c index 6b4090436c06..c42cc314c170 100644 --- a/drivers/pwm/pwm-lpc32xx.c +++ b/drivers/pwm/pwm-lpc32xx.c @@ -116,7 +116,6 @@ static int lpc32xx_pwm_probe(struct platform_device *pdev) lpc32xx->chip.dev = &pdev->dev; lpc32xx->chip.ops = &lpc32xx_pwm_ops; lpc32xx->chip.npwm = 1; - lpc32xx->chip.base = -1; ret = pwmchip_add(&lpc32xx->chip); if (ret < 0) { diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c index 939de93c157b..9ea6a176cbe5 100644 --- a/drivers/pwm/pwm-lpss.c +++ b/drivers/pwm/pwm-lpss.c @@ -234,7 +234,6 @@ struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r, lpwm->chip.dev = dev; lpwm->chip.ops = &pwm_lpss_ops; - lpwm->chip.base = -1; lpwm->chip.npwm = info->npwm; ret = pwmchip_add(&lpwm->chip); diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index fcfc3b147e5f..87fb37d43814 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -263,7 +263,6 @@ static int pwm_mediatek_probe(struct platform_device *pdev) pc->chip.dev = &pdev->dev; pc->chip.ops = &pwm_mediatek_ops; - pc->chip.base = -1; pc->chip.npwm = pc->soc->num_pwms; ret = pwmchip_add(&pc->chip); diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c index a3ce9789412a..9eb060613cb4 100644 --- a/drivers/pwm/pwm-meson.c +++ b/drivers/pwm/pwm-meson.c @@ -550,7 +550,6 @@ static int meson_pwm_probe(struct platform_device *pdev) spin_lock_init(&meson->lock); meson->chip.dev = &pdev->dev; meson->chip.ops = &meson_pwm_ops; - meson->chip.base = -1; meson->chip.npwm = MESON_NUM_PWMS; meson->chip.of_xlate = of_pwm_xlate_with_flags; meson->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-mtk-disp.c b/drivers/pwm/pwm-mtk-disp.c index 87c6b4bc5d43..9b3ba401a3db 100644 --- a/drivers/pwm/pwm-mtk-disp.c +++ b/drivers/pwm/pwm-mtk-disp.c @@ -202,7 +202,6 @@ static int mtk_disp_pwm_probe(struct platform_device *pdev) mdp->chip.dev = &pdev->dev; mdp->chip.ops = &mtk_disp_pwm_ops; - mdp->chip.base = -1; mdp->chip.npwm = 1; ret = pwmchip_add(&mdp->chip); diff --git a/drivers/pwm/pwm-mxs.c b/drivers/pwm/pwm-mxs.c index 7ce616923c52..0266e84e982c 100644 --- a/drivers/pwm/pwm-mxs.c +++ b/drivers/pwm/pwm-mxs.c @@ -140,7 +140,6 @@ static int mxs_pwm_probe(struct platform_device *pdev) mxs->chip.ops = &mxs_pwm_ops; mxs->chip.of_xlate = of_pwm_xlate_with_flags; mxs->chip.of_pwm_n_cells = 3; - mxs->chip.base = -1; ret = of_property_read_u32(np, "fsl,pwm-number", &mxs->chip.npwm); if (ret < 0) { diff --git a/drivers/pwm/pwm-omap-dmtimer.c b/drivers/pwm/pwm-omap-dmtimer.c index 358db4ff9d4f..612b3c859295 100644 --- a/drivers/pwm/pwm-omap-dmtimer.c +++ b/drivers/pwm/pwm-omap-dmtimer.c @@ -403,7 +403,6 @@ static int pwm_omap_dmtimer_probe(struct platform_device *pdev) omap->chip.dev = &pdev->dev; omap->chip.ops = &pwm_omap_dmtimer_ops; - omap->chip.base = -1; omap->chip.npwm = 1; omap->chip.of_xlate = of_pwm_xlate_with_flags; omap->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index 4a55dc18656c..00976b2c57b2 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -493,7 +493,6 @@ static int pca9685_pwm_probe(struct i2c_client *client, pca->chip.npwm = PCA9685_MAXCHAN + 1; pca->chip.dev = &client->dev; - pca->chip.base = -1; ret = pwmchip_add(&pca->chip); if (ret < 0) diff --git a/drivers/pwm/pwm-pxa.c b/drivers/pwm/pwm-pxa.c index d06cf60e6575..cfb683827d32 100644 --- a/drivers/pwm/pwm-pxa.c +++ b/drivers/pwm/pwm-pxa.c @@ -184,7 +184,6 @@ static int pwm_probe(struct platform_device *pdev) pwm->chip.dev = &pdev->dev; pwm->chip.ops = &pxa_pwm_ops; - pwm->chip.base = -1; pwm->chip.npwm = (id->driver_data & HAS_SECONDARY_PWM) ? 2 : 1; if (IS_ENABLED(CONFIG_OF)) { diff --git a/drivers/pwm/pwm-rcar.c b/drivers/pwm/pwm-rcar.c index 002ab79a7ec2..9daca0c772c7 100644 --- a/drivers/pwm/pwm-rcar.c +++ b/drivers/pwm/pwm-rcar.c @@ -224,7 +224,6 @@ static int rcar_pwm_probe(struct platform_device *pdev) rcar_pwm->chip.dev = &pdev->dev; rcar_pwm->chip.ops = &rcar_pwm_ops; - rcar_pwm->chip.base = -1; rcar_pwm->chip.npwm = 1; pm_runtime_enable(&pdev->dev); diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index d02b24b77cdf..e2959fae0969 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -410,7 +410,6 @@ static int tpu_probe(struct platform_device *pdev) tpu->chip.ops = &tpu_pwm_ops; tpu->chip.of_xlate = of_pwm_xlate_with_flags; tpu->chip.of_pwm_n_cells = 3; - tpu->chip.base = -1; tpu->chip.npwm = TPU_CHANNEL_MAX; pm_runtime_enable(&pdev->dev); diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 6ad7d0a50aed..301785fa293e 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -352,7 +352,6 @@ static int rockchip_pwm_probe(struct platform_device *pdev) pc->data = id->data; pc->chip.dev = &pdev->dev; pc->chip.ops = &rockchip_pwm_ops; - pc->chip.base = -1; pc->chip.npwm = 1; if (pc->data->supports_polarity) { diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c index 645d0066ff0a..515489fa4f6d 100644 --- a/drivers/pwm/pwm-samsung.c +++ b/drivers/pwm/pwm-samsung.c @@ -519,7 +519,6 @@ static int pwm_samsung_probe(struct platform_device *pdev) chip->chip.dev = &pdev->dev; chip->chip.ops = &pwm_samsung_ops; - chip->chip.base = -1; chip->chip.npwm = SAMSUNG_PWM_NUM; chip->inverter_mask = BIT(SAMSUNG_PWM_NUM) - 1; diff --git a/drivers/pwm/pwm-sifive.c b/drivers/pwm/pwm-sifive.c index 2a7cd2deaeea..688737f091ac 100644 --- a/drivers/pwm/pwm-sifive.c +++ b/drivers/pwm/pwm-sifive.c @@ -244,7 +244,6 @@ static int pwm_sifive_probe(struct platform_device *pdev) chip->ops = &pwm_sifive_ops; chip->of_xlate = of_pwm_xlate_with_flags; chip->of_pwm_n_cells = 3; - chip->base = -1; chip->npwm = 4; ddata->regs = devm_platform_ioremap_resource(pdev, 0); diff --git a/drivers/pwm/pwm-sl28cpld.c b/drivers/pwm/pwm-sl28cpld.c index 0b01ec25e2f0..7a69c1a0c060 100644 --- a/drivers/pwm/pwm-sl28cpld.c +++ b/drivers/pwm/pwm-sl28cpld.c @@ -229,7 +229,6 @@ static int sl28cpld_pwm_probe(struct platform_device *pdev) chip = &priv->pwm_chip; chip->dev = &pdev->dev; chip->ops = &sl28cpld_pwm_ops; - chip->base = -1; chip->npwm = 1; platform_set_drvdata(pdev, priv); diff --git a/drivers/pwm/pwm-spear.c b/drivers/pwm/pwm-spear.c index f63b54aae1b4..1a1cedfd11ce 100644 --- a/drivers/pwm/pwm-spear.c +++ b/drivers/pwm/pwm-spear.c @@ -193,7 +193,6 @@ static int spear_pwm_probe(struct platform_device *pdev) pc->chip.dev = &pdev->dev; pc->chip.ops = &spear_pwm_ops; - pc->chip.base = -1; pc->chip.npwm = NUM_PWM; ret = clk_prepare(pc->clk); diff --git a/drivers/pwm/pwm-sprd.c b/drivers/pwm/pwm-sprd.c index 5123d948efd6..108cbec88667 100644 --- a/drivers/pwm/pwm-sprd.c +++ b/drivers/pwm/pwm-sprd.c @@ -268,7 +268,6 @@ static int sprd_pwm_probe(struct platform_device *pdev) spc->chip.dev = &pdev->dev; spc->chip.ops = &sprd_pwm_ops; - spc->chip.base = -1; spc->chip.npwm = spc->num_pwms; ret = pwmchip_add(&spc->chip); diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c index 99c70e07858d..aa2b211d7ee3 100644 --- a/drivers/pwm/pwm-sti.c +++ b/drivers/pwm/pwm-sti.c @@ -619,7 +619,6 @@ static int sti_pwm_probe(struct platform_device *pdev) pc->chip.dev = dev; pc->chip.ops = &sti_pwm_ops; - pc->chip.base = -1; pc->chip.npwm = pc->cdata->pwm_num_devs; ret = pwmchip_add(&pc->chip); diff --git a/drivers/pwm/pwm-stm32-lp.c b/drivers/pwm/pwm-stm32-lp.c index 134c14621ee0..af08f564ef1d 100644 --- a/drivers/pwm/pwm-stm32-lp.c +++ b/drivers/pwm/pwm-stm32-lp.c @@ -205,7 +205,6 @@ static int stm32_pwm_lp_probe(struct platform_device *pdev) priv->regmap = ddata->regmap; priv->clk = ddata->clk; - priv->chip.base = -1; priv->chip.dev = &pdev->dev; priv->chip.ops = &stm32_pwm_lp_ops; priv->chip.npwm = 1; diff --git a/drivers/pwm/pwm-stm32.c b/drivers/pwm/pwm-stm32.c index d3be944f2ae9..c46fb90036ab 100644 --- a/drivers/pwm/pwm-stm32.c +++ b/drivers/pwm/pwm-stm32.c @@ -633,7 +633,6 @@ static int stm32_pwm_probe(struct platform_device *pdev) stm32_pwm_detect_complementary(priv); - priv->chip.base = -1; priv->chip.dev = dev; priv->chip.ops = &stm32pwm_ops; priv->chip.npwm = stm32_pwm_detect_channels(priv); diff --git a/drivers/pwm/pwm-stmpe.c b/drivers/pwm/pwm-stmpe.c index be5f6d7359d4..9dc983a3cbf1 100644 --- a/drivers/pwm/pwm-stmpe.c +++ b/drivers/pwm/pwm-stmpe.c @@ -278,7 +278,6 @@ static int __init stmpe_pwm_probe(struct platform_device *pdev) pwm->stmpe = stmpe; pwm->chip.dev = &pdev->dev; - pwm->chip.base = -1; if (stmpe->partnum == STMPE2401 || stmpe->partnum == STMPE2403) { pwm->chip.ops = &stmpe_24xx_pwm_ops; diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c index ce5c4fc8da6f..e01becd102c0 100644 --- a/drivers/pwm/pwm-sun4i.c +++ b/drivers/pwm/pwm-sun4i.c @@ -459,7 +459,6 @@ static int sun4i_pwm_probe(struct platform_device *pdev) pwm->chip.dev = &pdev->dev; pwm->chip.ops = &sun4i_pwm_ops; - pwm->chip.base = -1; pwm->chip.npwm = pwm->data->npwm; pwm->chip.of_xlate = of_pwm_xlate_with_flags; pwm->chip.of_pwm_n_cells = 3; diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index 55bc63d5a0ae..c529a170bcdd 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -285,7 +285,6 @@ static int tegra_pwm_probe(struct platform_device *pdev) pwm->chip.dev = &pdev->dev; pwm->chip.ops = &tegra_pwm_ops; - pwm->chip.base = -1; pwm->chip.npwm = pwm->soc->num_channels; ret = pwmchip_add(&pwm->chip); diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c index 2a8949014bb1..b9a17ab0c202 100644 --- a/drivers/pwm/pwm-tiecap.c +++ b/drivers/pwm/pwm-tiecap.c @@ -226,7 +226,6 @@ static int ecap_pwm_probe(struct platform_device *pdev) pc->chip.ops = &ecap_pwm_ops; pc->chip.of_xlate = of_pwm_xlate_with_flags; pc->chip.of_pwm_n_cells = 3; - pc->chip.base = -1; pc->chip.npwm = 1; pc->mmio_base = devm_platform_ioremap_resource(pdev, 0); diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index a7fb224d6535..90095a19bf2d 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -449,7 +449,6 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev) pc->chip.ops = &ehrpwm_pwm_ops; pc->chip.of_xlate = of_pwm_xlate_with_flags; pc->chip.of_pwm_n_cells = 3; - pc->chip.base = -1; pc->chip.npwm = NUM_PWM_CHANNEL; pc->mmio_base = devm_platform_ioremap_resource(pdev, 0); diff --git a/drivers/pwm/pwm-twl-led.c b/drivers/pwm/pwm-twl-led.c index 630b9a578820..6c8df5f4e87d 100644 --- a/drivers/pwm/pwm-twl-led.c +++ b/drivers/pwm/pwm-twl-led.c @@ -291,7 +291,6 @@ static int twl_pwmled_probe(struct platform_device *pdev) } twl->chip.dev = &pdev->dev; - twl->chip.base = -1; mutex_init(&twl->mutex); diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c index aee67974f353..e83a826bf621 100644 --- a/drivers/pwm/pwm-twl.c +++ b/drivers/pwm/pwm-twl.c @@ -310,7 +310,6 @@ static int twl_pwm_probe(struct platform_device *pdev) twl->chip.ops = &twl6030_pwm_ops; twl->chip.dev = &pdev->dev; - twl->chip.base = -1; twl->chip.npwm = 2; mutex_init(&twl->mutex); diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c index 6e36851a22bb..52fe5d19473a 100644 --- a/drivers/pwm/pwm-vt8500.c +++ b/drivers/pwm/pwm-vt8500.c @@ -209,7 +209,6 @@ static int vt8500_pwm_probe(struct platform_device *pdev) chip->chip.ops = &vt8500_pwm_ops; chip->chip.of_xlate = of_pwm_xlate_with_flags; chip->chip.of_pwm_n_cells = 3; - chip->chip.base = -1; chip->chip.npwm = VT8500_NR_PWMS; chip->clk = devm_clk_get(&pdev->dev, NULL); From d58cb0ee51ef58acc80f984407979fd5926da9e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 1 Mar 2021 19:23:07 +0100 Subject: [PATCH 0176/1379] pwm: Return -EINVAL for old-style drivers without .set_polarity callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 2b1c1a5d5148 ("pwm: Use -EINVAL for unsupported polarity") all drivers implementing the apply callback are unified to return -EINVAL if an unsupported polarity is requested. Do the same in the compat code for old-style drivers. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 8904eaa6d769..25ee06a14bb3 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -596,7 +596,7 @@ int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state) */ if (state->polarity != pwm->state.polarity) { if (!chip->ops->set_polarity) - return -ENOTSUPP; + return -EINVAL; /* * Changing the polarity of a running PWM is From 30882cf130078e6ba7d84d6d56e056b8b5e705d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 8 Mar 2021 10:50:12 +0100 Subject: [PATCH 0177/1379] pwm: atmel-tcb: Implement .apply callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is just pushing down the core's compat code down into the driver using the legacy callback nearly unchanged. The call to .enable() was just dropped from .config() because .apply() calls it unconditionally. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel-tcb.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-atmel-tcb.c b/drivers/pwm/pwm-atmel-tcb.c index ee70a615532b..4d2253f3048c 100644 --- a/drivers/pwm/pwm-atmel-tcb.c +++ b/drivers/pwm/pwm-atmel-tcb.c @@ -362,20 +362,37 @@ static int atmel_tcb_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, tcbpwm->div = i; tcbpwm->duty = duty; - /* If the PWM is enabled, call enable to apply the new conf */ - if (pwm_is_enabled(pwm)) - atmel_tcb_pwm_enable(chip, pwm); - return 0; } +static int atmel_tcb_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int duty_cycle, period; + int ret; + + /* This function only sets a flag in driver data */ + atmel_tcb_pwm_set_polarity(chip, pwm, state->polarity); + + if (!state->enabled) { + atmel_tcb_pwm_disable(chip, pwm); + return 0; + } + + period = state->period < INT_MAX ? state->period : INT_MAX; + duty_cycle = state->duty_cycle < INT_MAX ? state->duty_cycle : INT_MAX; + + ret = atmel_tcb_pwm_config(chip, pwm, duty_cycle, period); + if (ret) + return ret; + + return atmel_tcb_pwm_enable(chip, pwm); +} + static const struct pwm_ops atmel_tcb_pwm_ops = { .request = atmel_tcb_pwm_request, .free = atmel_tcb_pwm_free, - .config = atmel_tcb_pwm_config, - .set_polarity = atmel_tcb_pwm_set_polarity, - .enable = atmel_tcb_pwm_enable, - .disable = atmel_tcb_pwm_disable, + .apply = atmel_tcb_pwm_apply, .owner = THIS_MODULE, }; From c77e99f434c29d79505bd740cfead9648dfe0795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 8 Mar 2021 10:51:50 +0100 Subject: [PATCH 0178/1379] pwm: atmel-tcb: Only free resources after pwm_chip_remove() returned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise the PWM stops working before the PWM core and its consumers are aware the device is going away. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel-tcb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-atmel-tcb.c b/drivers/pwm/pwm-atmel-tcb.c index 4d2253f3048c..8451d3e846be 100644 --- a/drivers/pwm/pwm-atmel-tcb.c +++ b/drivers/pwm/pwm-atmel-tcb.c @@ -507,14 +507,14 @@ static int atmel_tcb_pwm_remove(struct platform_device *pdev) struct atmel_tcb_pwm_chip *tcbpwm = platform_get_drvdata(pdev); int err; - clk_disable_unprepare(tcbpwm->slow_clk); - clk_put(tcbpwm->slow_clk); - clk_put(tcbpwm->clk); - err = pwmchip_remove(&tcbpwm->chip); if (err < 0) return err; + clk_disable_unprepare(tcbpwm->slow_clk); + clk_put(tcbpwm->slow_clk); + clk_put(tcbpwm->clk); + return 0; } From 09081c9ba6c22bd63b6ce681e60d71a95acbc115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 12 Mar 2021 09:59:16 +0100 Subject: [PATCH 0179/1379] pwm: sprd: Refuse requests with unsupported polarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver only supports normal polarity and so should refuse requests for inversed polarity. Signed-off-by: Uwe Kleine-König Acked-by: Chunyan Zhang Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sprd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pwm/pwm-sprd.c b/drivers/pwm/pwm-sprd.c index 108cbec88667..98c479dfae31 100644 --- a/drivers/pwm/pwm-sprd.c +++ b/drivers/pwm/pwm-sprd.c @@ -164,6 +164,9 @@ static int sprd_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_state *cstate = &pwm->state; int ret; + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + if (state->enabled) { if (!cstate->enabled) { /* From 9f0f6107e07289c99f599d4e4ad9c62dec4abfd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 12 Mar 2021 10:00:58 +0100 Subject: [PATCH 0180/1379] pwm: cros-ec: Refuse requests with unsupported polarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver only supports normal polarity and so should refuse requests for inversed polarity. Signed-off-by: Uwe Kleine-König Acked-by: Enric Balletbo i Serra Signed-off-by: Thierry Reding --- drivers/pwm/pwm-cros-ec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c index d3115cb0e058..9fffb566af5f 100644 --- a/drivers/pwm/pwm-cros-ec.c +++ b/drivers/pwm/pwm-cros-ec.c @@ -124,6 +124,9 @@ static int cros_ec_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, if (state->period != EC_PWM_MAX_DUTY) return -EINVAL; + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + /* * EC doesn't separate the concept of duty cycle and enabled, but * kernel does. Translate. From fc423f29f718a963a2775edba8ac258e762ea989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 7 Dec 2020 14:45:54 +0100 Subject: [PATCH 0181/1379] pwm: bcm-kona: Use pwmchip_add() instead of pwmchip_add_with_polarity() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only side effect of this change is that pwm->state.polarity is initialized to PWM_POLARITY_NORMAL instead of PWM_POLARITY_INVERSED. However all other members of pwm->state are uninitialized and consumers are expected to provide the right polarity (either by setting it explicitly or by using a helper like pwm_init_state() that overwrites .polarity anyhow with a value independent of the initial value). The eventual goal is to remove pwmchip_add_with_polarity() and so simplify the data flow in the PWM core. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-bcm-kona.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-bcm-kona.c b/drivers/pwm/pwm-bcm-kona.c index 2e8aede0318c..f09a31042859 100644 --- a/drivers/pwm/pwm-bcm-kona.c +++ b/drivers/pwm/pwm-bcm-kona.c @@ -300,7 +300,7 @@ static int kona_pwmc_probe(struct platform_device *pdev) clk_disable_unprepare(kp->clk); - ret = pwmchip_add_with_polarity(&kp->chip, PWM_POLARITY_INVERSED); + ret = pwmchip_add(&kp->chip); if (ret < 0) dev_err(&pdev->dev, "failed to add PWM chip: %d\n", ret); From 965ebe39c953a8248a45413c25833621529da03c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 7 Dec 2020 14:45:55 +0100 Subject: [PATCH 0182/1379] pwm: atmel-hlcdc: Use pwmchip_add() instead of pwmchip_add_with_polarity() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only side effect of this change is that pwm->state.polarity is initialized to PWM_POLARITY_NORMAL instead of PWM_POLARITY_INVERSED. However all other members of pwm->state are uninitialized and consumers are expected to provide the right polarity (either by setting it explicitly or by using a helper like pwm_init_state() that overwrites .polarity anyhow with a value independent of the initial value). Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel-hlcdc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-atmel-hlcdc.c b/drivers/pwm/pwm-atmel-hlcdc.c index 538dbafe3e75..6ab597e54005 100644 --- a/drivers/pwm/pwm-atmel-hlcdc.c +++ b/drivers/pwm/pwm-atmel-hlcdc.c @@ -269,7 +269,7 @@ static int atmel_hlcdc_pwm_probe(struct platform_device *pdev) chip->chip.of_xlate = of_pwm_xlate_with_flags; chip->chip.of_pwm_n_cells = 3; - ret = pwmchip_add_with_polarity(&chip->chip, PWM_POLARITY_INVERSED); + ret = pwmchip_add(&chip->chip); if (ret) { clk_disable_unprepare(hlcdc->periph_clk); return ret; From 9666cec380d60808eb86d3be4caf84faeebe3081 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 7 Dec 2020 14:45:56 +0100 Subject: [PATCH 0183/1379] pwm: Drop function pwmchip_add_with_polarity() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pwmchip_add() only calls pwmchip_add_with_polarity() and nothing else. All other users of pwmchip_add_with_polarity() are gone. So drop pwmchip_add_with_polarity() and move the code instead to pwmchip_add(). The initial assignment to pwm->state.polarity is dropped. In every correct usage of the PWM API this value is overwritten later anyhow. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 25 +++---------------------- include/linux/pwm.h | 2 -- 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 25ee06a14bb3..c4d5c0667137 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -250,17 +250,14 @@ static bool pwm_ops_check(const struct pwm_chip *chip) } /** - * pwmchip_add_with_polarity() - register a new PWM chip + * pwmchip_add() - register a new PWM chip * @chip: the PWM chip to add - * @polarity: initial polarity of PWM channels * - * Register a new PWM chip. The initial polarity for all channels is specified - * by the @polarity parameter. + * Register a new PWM chip. * * Returns: 0 on success or a negative error code on failure. */ -int pwmchip_add_with_polarity(struct pwm_chip *chip, - enum pwm_polarity polarity) +int pwmchip_add(struct pwm_chip *chip) { struct pwm_device *pwm; unsigned int i; @@ -292,7 +289,6 @@ int pwmchip_add_with_polarity(struct pwm_chip *chip, pwm->chip = chip; pwm->pwm = chip->base + i; pwm->hwpwm = i; - pwm->state.polarity = polarity; radix_tree_insert(&pwm_tree, pwm->pwm, pwm); } @@ -315,21 +311,6 @@ out: return ret; } -EXPORT_SYMBOL_GPL(pwmchip_add_with_polarity); - -/** - * pwmchip_add() - register a new PWM chip - * @chip: the PWM chip to add - * - * Register a new PWM chip. If chip->base < 0 then a dynamically assigned base - * will be used. The initial polarity for all channels is normal. - * - * Returns: 0 on success or a negative error code on failure. - */ -int pwmchip_add(struct pwm_chip *chip) -{ - return pwmchip_add_with_polarity(chip, PWM_POLARITY_NORMAL); -} EXPORT_SYMBOL_GPL(pwmchip_add); /** diff --git a/include/linux/pwm.h b/include/linux/pwm.h index e4d84d4db293..8f4eefd129aa 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -392,8 +392,6 @@ int pwm_capture(struct pwm_device *pwm, struct pwm_capture *result, int pwm_set_chip_data(struct pwm_device *pwm, void *data); void *pwm_get_chip_data(struct pwm_device *pwm); -int pwmchip_add_with_polarity(struct pwm_chip *chip, - enum pwm_polarity polarity); int pwmchip_add(struct pwm_chip *chip); int pwmchip_remove(struct pwm_chip *chip); struct pwm_device *pwm_request_from_chip(struct pwm_chip *chip, From d895ce703098fe4939b081026d77afccddec44df Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sun, 7 Feb 2021 23:16:04 +0100 Subject: [PATCH 0184/1379] PCI: tegra: Constify static structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only usage of them is to assign their address to the 'ops' field in the pcie_port and the dw_pcie_ep structs, both which are pointers to const. Make them const to allow the compiler to put them in read-only memory. Link: https://lore.kernel.org/r/20210207221604.48910-1-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński --- drivers/pci/controller/dwc/pcie-tegra194.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 6fa216e52d14..18acd48e8e9b 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -1019,7 +1019,7 @@ static const struct dw_pcie_ops tegra_dw_pcie_ops = { .stop_link = tegra_pcie_dw_stop_link, }; -static struct dw_pcie_host_ops tegra_pcie_dw_host_ops = { +static const struct dw_pcie_host_ops tegra_pcie_dw_host_ops = { .host_init = tegra_pcie_dw_host_init, }; @@ -1881,7 +1881,7 @@ tegra_pcie_ep_get_features(struct dw_pcie_ep *ep) return &tegra_pcie_epc_features; } -static struct dw_pcie_ep_ops pcie_ep_ops = { +static const struct dw_pcie_ep_ops pcie_ep_ops = { .raise_irq = tegra_pcie_ep_raise_irq, .get_features = tegra_pcie_ep_get_features, }; From 9b4a824b889e1cc5e0430b80e40cfe9838c5b5f0 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 10 Feb 2021 09:13:14 -0700 Subject: [PATCH 0185/1379] iommu/vt-d: Use Real PCI DMA device for IRTE VMD retransmits child device MSI-X with the VMD endpoint's requester-id. In order to support direct interrupt remapping of VMD child devices, ensure that the IRTE is programmed with the VMD endpoint's requester-id using pci_real_dma_dev(). Link: https://lore.kernel.org/r/20210210161315.316097-2-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Acked-by: Lu Baolu Acked-by: Joerg Roedel --- drivers/iommu/intel/irq_remapping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 611ef5243cb6..75429a5373ec 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1280,7 +1280,8 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, break; case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - set_msi_sid(irte, msi_desc_to_pci_dev(info->desc)); + set_msi_sid(irte, + pci_real_dma_dev(msi_desc_to_pci_dev(info->desc))); break; default: BUG_ON(1); From ee81ee84f8739e584c9ccf113ba3c796187b7080 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Wed, 10 Feb 2021 09:13:15 -0700 Subject: [PATCH 0186/1379] PCI: vmd: Disable MSI-X remapping when possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VMD will retransmit child device MSI-X using its own MSI-X table and requester-id. This limits the number of MSI-X available to the whole child device domain to the number of VMD MSI-X interrupts. Some VMD devices have a mode where this remapping can be disabled, allowing child device interrupts to bypass processing with the VMD MSI-X domain interrupt handler and going straight the child device interrupt handler, allowing for better performance and scaling. The requester-id still gets changed to the VMD endpoint's requester-id, and the interrupt remapping handlers have been updated to properly set IRTE for child device interrupts to the VMD endpoint's context. Some VMD platforms have existing production BIOS which rely on MSI-X remapping and won't explicitly program the MSI-X remapping bit. This re-enables MSI-X remapping on unload. Link: https://lore.kernel.org/r/20210210161315.316097-3-jonathan.derrick@intel.com Signed-off-by: Jon Derrick Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński Acked-by: Joerg Roedel --- drivers/pci/controller/vmd.c | 63 +++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 5e80f28f0119..e3fcdfec58b3 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -28,6 +28,7 @@ #define BUS_RESTRICT_CAP(vmcap) (vmcap & 0x1) #define PCI_REG_VMCONFIG 0x44 #define BUS_RESTRICT_CFG(vmcfg) ((vmcfg >> 8) & 0x3) +#define VMCONFIG_MSI_REMAP 0x2 #define PCI_REG_VMLOCK 0x70 #define MB2_SHADOW_EN(vmlock) (vmlock & 0x2) @@ -59,6 +60,13 @@ enum vmd_features { * be used for MSI remapping */ VMD_FEAT_OFFSET_FIRST_VECTOR = (1 << 3), + + /* + * Device can bypass remapping MSI-X transactions into its MSI-X table, + * avoiding the requirement of a VMD MSI domain for child device + * interrupt handling. + */ + VMD_FEAT_CAN_BYPASS_MSI_REMAP = (1 << 4), }; /* @@ -306,6 +314,16 @@ static struct msi_domain_info vmd_msi_domain_info = { .chip = &vmd_msi_controller, }; +static void vmd_set_msi_remapping(struct vmd_dev *vmd, bool enable) +{ + u16 reg; + + pci_read_config_word(vmd->dev, PCI_REG_VMCONFIG, ®); + reg = enable ? (reg & ~VMCONFIG_MSI_REMAP) : + (reg | VMCONFIG_MSI_REMAP); + pci_write_config_word(vmd->dev, PCI_REG_VMCONFIG, reg); +} + static int vmd_create_irq_domain(struct vmd_dev *vmd) { struct fwnode_handle *fn; @@ -325,6 +343,13 @@ static int vmd_create_irq_domain(struct vmd_dev *vmd) static void vmd_remove_irq_domain(struct vmd_dev *vmd) { + /* + * Some production BIOS won't enable remapping between soft reboots. + * Ensure remapping is restored before unloading the driver. + */ + if (!vmd->msix_count) + vmd_set_msi_remapping(vmd, true); + if (vmd->irq_domain) { struct fwnode_handle *fn = vmd->irq_domain->fwnode; @@ -679,15 +704,32 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features) sd->node = pcibus_to_node(vmd->dev->bus); - ret = vmd_create_irq_domain(vmd); - if (ret) - return ret; - /* - * Override the irq domain bus token so the domain can be distinguished - * from a regular PCI/MSI domain. + * Currently MSI remapping must be enabled in guest passthrough mode + * due to some missing interrupt remapping plumbing. This is probably + * acceptable because the guest is usually CPU-limited and MSI + * remapping doesn't become a performance bottleneck. */ - irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI); + if (!(features & VMD_FEAT_CAN_BYPASS_MSI_REMAP) || + offset[0] || offset[1]) { + ret = vmd_alloc_irqs(vmd); + if (ret) + return ret; + + vmd_set_msi_remapping(vmd, true); + + ret = vmd_create_irq_domain(vmd); + if (ret) + return ret; + + /* + * Override the IRQ domain bus token so the domain can be + * distinguished from a regular PCI/MSI domain. + */ + irq_domain_update_bus_token(vmd->irq_domain, DOMAIN_BUS_VMD_MSI); + } else { + vmd_set_msi_remapping(vmd, false); + } pci_add_resource(&resources, &vmd->resources[0]); pci_add_resource_offset(&resources, &vmd->resources[1], offset[0]); @@ -753,10 +795,6 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) if (features & VMD_FEAT_OFFSET_FIRST_VECTOR) vmd->first_vec = 1; - err = vmd_alloc_irqs(vmd); - if (err) - return err; - spin_lock_init(&vmd->cfg_lock); pci_set_drvdata(dev, vmd); err = vmd_enable_domain(vmd, features); @@ -825,7 +863,8 @@ static const struct pci_device_id vmd_ids[] = { .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_VMD_28C0), .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW | - VMD_FEAT_HAS_BUS_RESTRICTIONS,}, + VMD_FEAT_HAS_BUS_RESTRICTIONS | + VMD_FEAT_CAN_BYPASS_MSI_REMAP,}, {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x467f), .driver_data = VMD_FEAT_HAS_MEMBAR_SHADOW_VSCAP | VMD_FEAT_HAS_BUS_RESTRICTIONS | From 1e83130f01b04c16579ed5a5e03d729bcffc4c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Wed, 3 Mar 2021 15:22:02 +0100 Subject: [PATCH 0187/1379] PCI: iproc: Fix return value of iproc_msi_irq_domain_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IRQ domain alloc function should return zero on success. Non-zero value indicates failure. Link: https://lore.kernel.org/r/20210303142202.25780-1-pali@kernel.org Fixes: fc54bae28818 ("PCI: iproc: Allow allocation of multiple MSIs") Signed-off-by: Pali Rohár Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński Acked-by: Ray Jui Acked-by: Marc Zyngier --- drivers/pci/controller/pcie-iproc-msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-iproc-msi.c b/drivers/pci/controller/pcie-iproc-msi.c index 908475d27e0e..eede4e8f3f75 100644 --- a/drivers/pci/controller/pcie-iproc-msi.c +++ b/drivers/pci/controller/pcie-iproc-msi.c @@ -271,7 +271,7 @@ static int iproc_msi_irq_domain_alloc(struct irq_domain *domain, NULL, NULL); } - return hwirq; + return 0; } static void iproc_msi_irq_domain_free(struct irq_domain *domain, From 2c61f32124b0c8868ac8579bb626ea579f2e08bb Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 8 Mar 2021 09:48:42 +0000 Subject: [PATCH 0188/1379] PCI: microchip: Make some symbols static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sparse tool complains as follows: drivers/pci/controller/pcie-microchip-host.c:304:18: warning: symbol 'pcie_event_to_event' was not declared. Should it be static? drivers/pci/controller/pcie-microchip-host.c:310:18: warning: symbol 'sec_error_to_event' was not declared. Should it be static? drivers/pci/controller/pcie-microchip-host.c:317:18: warning: symbol 'ded_error_to_event' was not declared. Should it be static? drivers/pci/controller/pcie-microchip-host.c:324:18: warning: symbol 'local_status_to_event' was not declared. Should it be static? Those symbols are not used outside of pcie-microchip-host.c, so this commit marks them static. Link: https://lore.kernel.org/r/20210308094842.3588847-1-weiyongjun1@huawei.com Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński --- drivers/pci/controller/pcie-microchip-host.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/pcie-microchip-host.c b/drivers/pci/controller/pcie-microchip-host.c index 04c19ff81aff..132631cfe4b6 100644 --- a/drivers/pci/controller/pcie-microchip-host.c +++ b/drivers/pci/controller/pcie-microchip-host.c @@ -301,27 +301,27 @@ static const struct cause event_cause[NUM_EVENTS] = { LOCAL_EVENT_CAUSE(PM_MSI_INT_SYS_ERR, "system error"), }; -struct event_map pcie_event_to_event[] = { +static struct event_map pcie_event_to_event[] = { PCIE_EVENT_TO_EVENT_MAP(L2_EXIT), PCIE_EVENT_TO_EVENT_MAP(HOTRST_EXIT), PCIE_EVENT_TO_EVENT_MAP(DLUP_EXIT), }; -struct event_map sec_error_to_event[] = { +static struct event_map sec_error_to_event[] = { SEC_ERROR_TO_EVENT_MAP(TX_RAM_SEC_ERR), SEC_ERROR_TO_EVENT_MAP(RX_RAM_SEC_ERR), SEC_ERROR_TO_EVENT_MAP(PCIE2AXI_RAM_SEC_ERR), SEC_ERROR_TO_EVENT_MAP(AXI2PCIE_RAM_SEC_ERR), }; -struct event_map ded_error_to_event[] = { +static struct event_map ded_error_to_event[] = { DED_ERROR_TO_EVENT_MAP(TX_RAM_DED_ERR), DED_ERROR_TO_EVENT_MAP(RX_RAM_DED_ERR), DED_ERROR_TO_EVENT_MAP(PCIE2AXI_RAM_DED_ERR), DED_ERROR_TO_EVENT_MAP(AXI2PCIE_RAM_DED_ERR), }; -struct event_map local_status_to_event[] = { +static struct event_map local_status_to_event[] = { LOCAL_STATUS_TO_EVENT_MAP(DMA_END_ENGINE_0), LOCAL_STATUS_TO_EVENT_MAP(DMA_END_ENGINE_1), LOCAL_STATUS_TO_EVENT_MAP(DMA_ERROR_ENGINE_0), From 6e7628c8c3c1af74ea31e8da85b641a50fe3a86c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Wed, 10 Mar 2021 13:19:13 +0000 Subject: [PATCH 0189/1379] PCI: microchip: Remove dev_err() when handing an error from platform_get_irq() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to call the dev_err() function directly to print a custom message when handling an error from either the platform_get_irq() or platform_get_irq_byname() functions as both are going to display an appropriate error message in case of a failure. This change is as per suggestions from Coccinelle, e.g., drivers/pci/controller/pcie-microchip-host.c:1027:2-9: line 1027 is redundant because platform_get_irq() already prints an error Related commit caecb05c8000 ("PCI: Remove dev_err() when handing an error from platform_get_irq()"). Link: https://lore.kernel.org/r/20210310131913.2802385-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pcie-microchip-host.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pci/controller/pcie-microchip-host.c b/drivers/pci/controller/pcie-microchip-host.c index 132631cfe4b6..89c68c56d93b 100644 --- a/drivers/pci/controller/pcie-microchip-host.c +++ b/drivers/pci/controller/pcie-microchip-host.c @@ -1023,10 +1023,8 @@ static int mc_platform_init(struct pci_config_window *cfg) } irq = platform_get_irq(pdev, 0); - if (irq < 0) { - dev_err(dev, "unable to request IRQ%d\n", irq); + if (irq < 0) return -ENODEV; - } for (i = 0; i < NUM_EVENTS; i++) { event_irq = irq_create_mapping(port->event_domain, i); From 3d0b2a3a87ce5ae85de46c4241afd52ab8b566fe Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Wed, 17 Mar 2021 18:45:18 +0530 Subject: [PATCH 0190/1379] PCI: keystone: Let AM65 use the pci_ops defined in pcie-designware-host.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both TI's AM65x (K3) and TI's K2 PCIe driver are implemented in pci-keystone. However Only K2 PCIe driver should use it's own pci_ops for configuration space accesses. But commit 10a797c6e54a ("PCI: dwc: keystone: Use pci_ops for config space accessors") used custom pci_ops for both AM65x and K2. This breaks configuration space access for AM65x platform. Fix it here. Link: https://lore.kernel.org/r/20210317131518.11040-1-kishon@ti.com Fixes: 10a797c6e54a ("PCI: dwc: keystone: Use pci_ops for config space accessors") Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński Cc: # v5.10 --- drivers/pci/controller/dwc/pci-keystone.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pci-keystone.c b/drivers/pci/controller/dwc/pci-keystone.c index 53aa35cb3a49..a59ecbec601f 100644 --- a/drivers/pci/controller/dwc/pci-keystone.c +++ b/drivers/pci/controller/dwc/pci-keystone.c @@ -798,7 +798,8 @@ static int __init ks_pcie_host_init(struct pcie_port *pp) int ret; pp->bridge->ops = &ks_pcie_ops; - pp->bridge->child_ops = &ks_child_pcie_ops; + if (!ks_pcie->is_am6) + pp->bridge->child_ops = &ks_child_pcie_ops; ret = ks_pcie_config_legacy_irq(ks_pcie); if (ret) From 1b7996a528b3f81bb8dac6d29a957db1d33546d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Thu, 11 Mar 2021 03:37:45 +0000 Subject: [PATCH 0191/1379] PCI: layerscape: Correct syntax by changing comma to semicolon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace command with a semicolon to correct syntax and to prevent potential unspecified behaviour and/or unintended side effects. Related: https://lore.kernel.org/linux-pci/20201216131944.14990-1-zhengyongjun3@huawei.com/ Co-authored-by: Zheng Yongjun Link: https://lore.kernel.org/r/20210311033745.1547044-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Lorenzo Pieralisi Acked-by: Roy Zang --- drivers/pci/controller/dwc/pci-layerscape-ep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index 39fe2ed5a6a2..39f4664bd84c 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -154,7 +154,7 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) pci->dev = dev; pci->ops = pcie->drvdata->dw_pcie_ops; - ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4), + ls_epc->bar_fixed_64bit = (1 << BAR_2) | (1 << BAR_4); pcie->pci = pci; pcie->ls_epc = ls_epc; From b5d9209d50838474fc1b2901d0e11bba59906428 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 8 Mar 2021 13:56:19 +0000 Subject: [PATCH 0192/1379] PCI: brcmstb: Fix error return code in brcm_pcie_probe() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix to return negative error code -ENODEV from the unsupported revision error handling case instead of 0, as done elsewhere in this function. Link: https://lore.kernel.org/r/20210308135619.19133-1-weiyongjun1@huawei.com Fixes: 0cdfaceb9889 ("PCI: brcmstb: support BCM4908 with external PERST# signal controller") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński Acked-by: Florian Fainelli --- drivers/pci/controller/pcie-brcmstb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index e330e6811f0b..69c999222cc8 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -1296,6 +1296,7 @@ static int brcm_pcie_probe(struct platform_device *pdev) pcie->hw_rev = readl(pcie->base + PCIE_MISC_REVISION); if (pcie->type == BCM4908 && pcie->hw_rev >= BRCM_PCIE_HW_REV_3_20) { dev_err(pcie->dev, "hardware revision with unsupported PERST# setup\n"); + ret = -ENODEV; goto fail; } From 10739e2a5e83ecac6a7d2422369c5fe8a1a72b04 Mon Sep 17 00:00:00 2001 From: Wesley Sheng Date: Thu, 31 Dec 2020 11:25:39 +0800 Subject: [PATCH 0193/1379] PCI: tegra: Fix typo for PCIe endpoint mode in Tegra194 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In config PCIE_TEGRA194_EP the mode incorrectly is referred to as host mode. Fix it. Link: https://lore.kernel.org/r/20201231032539.22322-1-wesley.sheng@amd.com Signed-off-by: Wesley Sheng Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński Acked-by: Vidya Sagar --- drivers/pci/controller/dwc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 22c5529e9a65..c26572e1a7ef 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -280,7 +280,7 @@ config PCIE_TEGRA194_EP select PCIE_TEGRA194 help Enables support for the PCIe controller in the NVIDIA Tegra194 SoC to - work in host mode. There are two instances of PCIe controllers in + work in endpoint mode. There are two instances of PCIe controllers in Tegra194. This controller can work either as EP or RC. In order to enable host-specific features PCIE_TEGRA194_HOST must be selected and in order to enable device-specific features PCIE_TEGRA194_EP must be From 2cf3af7aa6df0e173f2bff57b73427bb05b30ba0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 22 Mar 2021 17:45:22 -0400 Subject: [PATCH 0194/1379] scripts/recordmcount.pl: Make indent spacing consistent Emacs by default will have perl files have 4 space indents, where 8 spaces are represented with a single tab. There are some places in recordmcount.pl that has 8 spaces where a tab should be used. Replace them to make the file consistent. No functional changes. Cc: "John (Warthog9) Hawley" Signed-off-by: Steven Rostedt (VMware) --- scripts/recordmcount.pl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 867860ea57da..5652da5e345e 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -266,9 +266,9 @@ if ($arch eq "x86_64") { # force flags for this arch $ld .= " -m shlelf_linux"; if ($endian eq "big") { - $objcopy .= " -O elf32-shbig-linux"; + $objcopy .= " -O elf32-shbig-linux"; } else { - $objcopy .= " -O elf32-sh-linux"; + $objcopy .= " -O elf32-sh-linux"; } } elsif ($arch eq "powerpc") { @@ -289,12 +289,12 @@ if ($arch eq "x86_64") { $ldemulation = "lppc" } if ($bits == 64) { - $type = ".quad"; - $cc .= " -m64 "; - $ld .= " -m elf64".$ldemulation." "; + $type = ".quad"; + $cc .= " -m64 "; + $ld .= " -m elf64".$ldemulation." "; } else { - $cc .= " -m32 "; - $ld .= " -m elf32".$ldemulation." "; + $cc .= " -m32 "; + $ld .= " -m elf32".$ldemulation." "; } } elsif ($arch eq "arm") { @@ -313,7 +313,7 @@ if ($arch eq "x86_64") { $type = "data8"; if ($is_module eq "0") { - $cc .= " -mconstant-gp"; + $cc .= " -mconstant-gp"; } } elsif ($arch eq "sparc64") { # In the objdump output there are giblets like: @@ -530,10 +530,10 @@ while () { $read_function = defined($text_sections{$1}); if (!$read_function) { foreach my $prefix (keys %text_section_prefixes) { - if (substr($1, 0, length $prefix) eq $prefix) { - $read_function = 1; - last; - } + if (substr($1, 0, length $prefix) eq $prefix) { + $read_function = 1; + last; + } } } # print out any recorded offsets From b700fc3a63f16d6e130433fdcbe3f5f223c7662c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 22 Mar 2021 17:47:37 -0400 Subject: [PATCH 0195/1379] scripts/recordmcount.pl: Make vim and emacs indent the same By default, emacs indents Perl files with 4 spaces, but will use tabs where 8 spaces are used. Add a vim command of softtabstop=4, to make vim behave the same. This should remove the issue of developers using vim having causing different indentation. "John (Warthog9) Hawley" Signed-off-by: Steven Rostedt (VMware) --- scripts/recordmcount.pl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 5652da5e345e..a5429b3dac24 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -642,3 +642,5 @@ if ($#converts >= 0) { `$rm $mcount_o $mcount_s`; exit(0); + +# vim: softtabstop=4 From 021a90fe60ea08262ad01f9c9d0514d63462b4a7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 8 Feb 2021 15:23:01 +0100 Subject: [PATCH 0196/1379] PCI: mobiveil: Improve PCIE_LAYERSCAPE_GEN4 dependencies - Drop the dependency on PCI, as this is implied by the dependency on PCI_MSI_IRQ_DOMAIN, - Drop the dependencies on OF and ARM64, as the driver compiles fine without OF and/or on other architectures, - The Freescale Layerscape PCIe Gen4 controller is present only on Freescale Layerscape SoCs. Hence depend on ARCH_LAYERSCAPE, to prevent asking the user about this driver when configuring a kernel without Freescale Layerscape support, unless compile-testing. Link: https://lore.kernel.org/r/20210208142301.413582-1-geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/mobiveil/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/controller/mobiveil/Kconfig b/drivers/pci/controller/mobiveil/Kconfig index a62d247018cf..e4643fb94e78 100644 --- a/drivers/pci/controller/mobiveil/Kconfig +++ b/drivers/pci/controller/mobiveil/Kconfig @@ -24,8 +24,7 @@ config PCIE_MOBIVEIL_PLAT config PCIE_LAYERSCAPE_GEN4 bool "Freescale Layerscape PCIe Gen4 controller" - depends on PCI - depends on OF && (ARM64 || ARCH_LAYERSCAPE) + depends on ARCH_LAYERSCAPE || COMPILE_TEST depends on PCI_MSI_IRQ_DOMAIN select PCIE_MOBIVEIL_HOST help From f9875d1a3630de80240a40c180f0871bee40298a Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 8 Mar 2021 12:05:47 +0530 Subject: [PATCH 0197/1379] dt-bindings: PCI: ti,j721e: Add binding to represent refclk to the connector Add binding to represent refclk to the PCIe connector. Link: https://lore.kernel.org/r/20210308063550.6227-2-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/ti,j721e-pci-host.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml index 0880a613ece6..963f90816645 100644 --- a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml +++ b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml @@ -46,12 +46,17 @@ properties: maxItems: 1 clocks: - maxItems: 1 - description: clock-specifier to represent input to the PCIe + minItems: 1 + maxItems: 2 + description: |+ + clock-specifier to represent input to the PCIe for 1 item. + 2nd item if present represents reference clock to the connector. clock-names: + minItems: 1 items: - const: fck + - const: pcie_refclk vendor-id: const: 0x104c From 3201f355e9a92114d440f05c0c4410173ef7042c Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 8 Mar 2021 12:05:48 +0530 Subject: [PATCH 0198/1379] dt-bindings: PCI: ti,j721e: Add host mode dt-bindings for TI's AM64 SoC Add host mode dt-bindings for TI's AM64 SoC. This is the same IP used in J7200, however AM64 is a non-coherent architecture. Link: https://lore.kernel.org/r/20210308063550.6227-3-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/ti,j721e-pci-host.yaml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml index 963f90816645..cc900202df29 100644 --- a/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml +++ b/Documentation/devicetree/bindings/pci/ti,j721e-pci-host.yaml @@ -16,13 +16,15 @@ allOf: properties: compatible: oneOf: + - const: ti,j721e-pcie-host + - description: PCIe controller in AM64 + items: + - const: ti,am64-pcie-host + - const: ti,j721e-pcie-host - description: PCIe controller in J7200 items: - const: ti,j7200-pcie-host - const: ti,j721e-pcie-host - - description: PCIe controller in J721E - items: - - const: ti,j721e-pcie-host reg: maxItems: 4 @@ -67,6 +69,8 @@ properties: - const: 0xb00d - items: - const: 0xb00f + - items: + - const: 0xb010 msi-map: true @@ -83,7 +87,6 @@ required: - vendor-id - device-id - msi-map - - dma-coherent - dma-ranges - ranges - reset-gpios From 6b7d5394c21d0f0a98a9a9ecc11c8e0f264e9e4b Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 8 Mar 2021 12:05:49 +0530 Subject: [PATCH 0199/1379] dt-bindings: PCI: ti,j721e: Add endpoint mode dt-bindings for TI's AM64 SoC Add endpoint mode dt-bindings for TI's AM64 SoC. This is the same IP used in J7200, however AM64 is a non-coherent architecture. Link: https://lore.kernel.org/r/20210308063550.6227-4-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../devicetree/bindings/pci/ti,j721e-pci-ep.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml b/Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml index d06f0c4464c6..aed437dac363 100644 --- a/Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml +++ b/Documentation/devicetree/bindings/pci/ti,j721e-pci-ep.yaml @@ -16,13 +16,15 @@ allOf: properties: compatible: oneOf: + - const: ti,j721e-pcie-ep + - description: PCIe EP controller in AM64 + items: + - const: ti,am64-pcie-ep + - const: ti,j721e-pcie-ep - description: PCIe EP controller in J7200 items: - const: ti,j7200-pcie-ep - const: ti,j721e-pcie-ep - - description: PCIe EP controller in J721E - items: - - const: ti,j721e-pcie-ep reg: maxItems: 4 @@ -66,7 +68,6 @@ required: - power-domains - clocks - clock-names - - dma-coherent - max-functions - phys - phy-names From 49e0efdce791256fef94c3940aea77a0a6b0622e Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Mon, 8 Mar 2021 12:05:50 +0530 Subject: [PATCH 0200/1379] PCI: j721e: Add support to provide refclk to PCIe connector Add support to provide refclk to PCIe connector. Link: https://lore.kernel.org/r/20210308063550.6227-5-kishon@ti.com Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/cadence/pci-j721e.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/cadence/pci-j721e.c b/drivers/pci/controller/cadence/pci-j721e.c index 849f1e416ea5..9c102efc306a 100644 --- a/drivers/pci/controller/cadence/pci-j721e.c +++ b/drivers/pci/controller/cadence/pci-j721e.c @@ -6,6 +6,7 @@ * Author: Kishon Vijay Abraham I */ +#include #include #include #include @@ -50,6 +51,7 @@ enum link_status { struct j721e_pcie { struct device *dev; + struct clk *refclk; u32 mode; u32 num_lanes; struct cdns_pcie *cdns_pcie; @@ -312,6 +314,7 @@ static int j721e_pcie_probe(struct platform_device *pdev) struct cdns_pcie_ep *ep; struct gpio_desc *gpiod; void __iomem *base; + struct clk *clk; u32 num_lanes; u32 mode; int ret; @@ -411,6 +414,20 @@ static int j721e_pcie_probe(struct platform_device *pdev) goto err_get_sync; } + clk = devm_clk_get_optional(dev, "pcie_refclk"); + if (IS_ERR(clk)) { + ret = PTR_ERR(clk); + dev_err(dev, "failed to get pcie_refclk\n"); + goto err_pcie_setup; + } + + ret = clk_prepare_enable(clk); + if (ret) { + dev_err(dev, "failed to enable pcie_refclk\n"); + goto err_get_sync; + } + pcie->refclk = clk; + /* * "Power Sequencing and Reset Signal Timings" table in * PCI EXPRESS CARD ELECTROMECHANICAL SPECIFICATION, REV. 3.0 @@ -425,8 +442,10 @@ static int j721e_pcie_probe(struct platform_device *pdev) } ret = cdns_pcie_host_setup(rc); - if (ret < 0) + if (ret < 0) { + clk_disable_unprepare(pcie->refclk); goto err_pcie_setup; + } break; case PCI_MODE_EP: @@ -479,6 +498,7 @@ static int j721e_pcie_remove(struct platform_device *pdev) struct cdns_pcie *cdns_pcie = pcie->cdns_pcie; struct device *dev = &pdev->dev; + clk_disable_unprepare(pcie->refclk); cdns_pcie_disable_phy(cdns_pcie); pm_runtime_put(dev); pm_runtime_disable(dev); From c99e755a4a4c165cad6effb39faffd0f3377c02d Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Mon, 25 Jan 2021 02:28:26 +0300 Subject: [PATCH 0201/1379] PCI: Release OF node in pci_scan_device()'s error path In pci_scan_device(), if pci_setup_device() fails for any reason, the code will not release device's of_node by calling pci_release_of_node(). Fix that by calling the release function. Fixes: 98d9f30c820d ("pci/of: Match PCI devices to OF nodes dynamically") Link: https://lore.kernel.org/r/20210124232826.1879-1-dmitry.baryshkov@linaro.org Signed-off-by: Dmitry Baryshkov Signed-off-by: Bjorn Helgaas Reviewed-by: Leon Romanovsky --- drivers/pci/probe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 953f15abc850..be51670572fa 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2353,6 +2353,7 @@ static struct pci_dev *pci_scan_device(struct pci_bus *bus, int devfn) pci_set_of_node(dev); if (pci_setup_device(dev)) { + pci_release_of_node(dev); pci_bus_put(dev->bus); kfree(dev); return NULL; From e1175f02291141bbd924fc578299305fcde35855 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 5 Mar 2021 17:56:01 +0800 Subject: [PATCH 0202/1379] f2fs: fix to align to section for fallocate() on pinned file Now, fallocate() on a pinned file only allocates blocks which aligns to segment rather than section, so GC may try to migrate pinned file's block, and after several times of failure, pinned file's block could be migrated to other place, however user won't be aware of such condition, and then old obsolete block address may be readed/written incorrectly. To avoid such condition, let's try to allocate pinned file's blocks with section alignment. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 19 +++++++++---------- fs/f2fs/segment.c | 34 ++++++++++++++++++++++++++-------- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fe9e0d4d3920..ac57072d73cf 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3389,7 +3389,7 @@ void f2fs_get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1863944f4073..bd5a77091d23 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1646,27 +1646,26 @@ static int expand_inode_data(struct inode *inode, loff_t offset, return 0; if (f2fs_is_pinned_file(inode)) { - block_t len = (map.m_len >> sbi->log_blocks_per_seg) << - sbi->log_blocks_per_seg; + block_t sec_blks = BLKS_PER_SEC(sbi); + block_t sec_len = roundup(map.m_len, sec_blks); block_t done = 0; - if (map.m_len % sbi->blocks_per_seg) - len += sbi->blocks_per_seg; - - map.m_len = sbi->blocks_per_seg; + map.m_len = sec_blks; next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) + if (err && err != -ENODATA && err != -EAGAIN) { + map.m_len = done; goto out_err; + } } down_write(&sbi->pin_sem); f2fs_lock_op(sbi); - f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED); f2fs_unlock_op(sbi); map.m_seg_type = CURSEG_COLD_DATA_PINNED; @@ -1675,9 +1674,9 @@ next_alloc: up_write(&sbi->pin_sem); done += map.m_len; - len -= map.m_len; + sec_len -= map.m_len; map.m_lblk += map.m_len; - if (!err && len) + if (!err && sec_len) goto next_alloc; map.m_len = done; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b5a40a39a03f..f2b22da8c134 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2899,7 +2899,8 @@ unlock: up_read(&SM_I(sbi)->curseg_lock); } -static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, + bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; @@ -2907,10 +2908,22 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) if (!curseg->inited) goto alloc; - if (!curseg->next_blkoff && - !get_valid_blocks(sbi, curseg->segno, false) && - !get_ckpt_valid_blocks(sbi, curseg->segno)) - return; + if (curseg->next_blkoff || + get_valid_blocks(sbi, curseg->segno, new_sec)) + goto alloc; + + if (new_sec) { + unsigned int segno = START_SEGNO(curseg->segno); + int i; + + for (i = 0; i < sbi->segs_per_sec; i++, segno++) { + if (get_ckpt_valid_blocks(sbi, segno)) + goto alloc; + } + } else { + if (!get_ckpt_valid_blocks(sbi, curseg->segno)) + return; + } alloc: old_segno = curseg->segno; @@ -2918,10 +2931,15 @@ alloc: locate_dirty_segment(sbi, old_segno); } -void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_section(struct f2fs_sb_info *sbi, int type) +{ + __allocate_new_segment(sbi, type, true); +} + +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type) { down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_segment(sbi, type); + __allocate_new_section(sbi, type); up_write(&SIT_I(sbi)->sentry_lock); } @@ -2931,7 +2949,7 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segment(sbi, i); + __allocate_new_segment(sbi, i, false); up_write(&SIT_I(sbi)->sentry_lock); } From f2cc020d7876de7583feb52ec939a32419cf9468 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Mar 2021 18:49:35 +0100 Subject: [PATCH 0203/1379] tracing: Fix various typos in comments Fix ~59 single-word typos in the tracing code comments, and fix the grammar in a handful of places. Link: https://lore.kernel.org/r/20210322224546.GA1981273@gmail.com Link: https://lkml.kernel.org/r/20210323174935.GA4176821@gmail.com Reviewed-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Steven Rostedt (VMware) --- arch/microblaze/include/asm/ftrace.h | 2 +- arch/nds32/kernel/ftrace.c | 2 +- arch/powerpc/include/asm/ftrace.h | 4 ++-- arch/sh/kernel/ftrace.c | 2 +- arch/sparc/include/asm/ftrace.h | 2 +- fs/tracefs/inode.c | 2 +- include/linux/ftrace.h | 4 ++-- include/linux/trace_events.h | 2 +- include/linux/tracepoint.h | 2 +- include/trace/events/io_uring.h | 2 +- include/trace/events/rcu.h | 2 +- include/trace/events/sched.h | 2 +- include/trace/events/timer.h | 2 +- kernel/trace/bpf_trace.c | 5 +++-- kernel/trace/fgraph.c | 4 ++-- kernel/trace/ftrace.c | 8 ++++---- kernel/trace/ring_buffer.c | 2 +- kernel/trace/synth_event_gen_test.c | 2 +- kernel/trace/trace.c | 18 +++++++++--------- kernel/trace/trace.h | 4 ++-- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 4 ++-- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_events_synth.c | 2 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_hwlat.c | 4 ++-- kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 6 +++--- kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_probe_tmpl.h | 2 +- kernel/trace/trace_selftest.c | 4 ++-- kernel/trace/trace_seq.c | 12 ++++++------ 32 files changed, 60 insertions(+), 59 deletions(-) diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h index 5db7f4489f05..6a92bed37794 100644 --- a/arch/microblaze/include/asm/ftrace.h +++ b/arch/microblaze/include/asm/ftrace.h @@ -13,7 +13,7 @@ extern void ftrace_call_graph(void); #endif #ifdef CONFIG_DYNAMIC_FTRACE -/* reloction of mcount call site is the same as the address */ +/* relocation of mcount call site is the same as the address */ static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 414f8a780cc3..0e23e3a8df6b 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -236,7 +236,7 @@ void __naked return_to_handler(void) "bal ftrace_return_to_handler\n\t" "move $lp, $r0 \n\t" - /* restore state nedded by the ABI */ + /* restore state needed by the ABI */ "lmw.bim $r0,[$sp],$r1,#0x0 \n\t"); } diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index bc76970b6ee5..debe8c4f7062 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -12,7 +12,7 @@ #ifdef __ASSEMBLY__ -/* Based off of objdump optput from glibc */ +/* Based off of objdump output from glibc */ #define MCOUNT_SAVE_FRAME \ stwu r1,-48(r1); \ @@ -52,7 +52,7 @@ extern void _mcount(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { - /* reloction of mcount call site is the same as the address */ + /* relocation of mcount call site is the same as the address */ return addr; } diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 0646c5961846..295c43315bbe 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -67,7 +67,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) * Modifying code must take extra care. On an SMP machine, if * the code being modified is also being executed on another CPU * that CPU will have undefined results and possibly take a GPF. - * We use kstop_machine to stop other CPUS from exectuing code. + * We use kstop_machine to stop other CPUS from executing code. * But this does not stop NMIs from happening. We still need * to protect against that. We separate out the modification of * the code to take care of this. diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index d3aa1a524431..e284394cb3aa 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -17,7 +17,7 @@ void _mcount(void); #endif #ifdef CONFIG_DYNAMIC_FTRACE -/* reloction of mcount call site is the same as the address */ +/* relocation of mcount call site is the same as the address */ static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 4b83cbded559..1261e8b41edb 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -477,7 +477,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) * * The instances directory is special as it allows for mkdir and rmdir to * to be done by userspace. When a mkdir or rmdir is performed, the inode - * locks are released and the methhods passed in (@mkdir and @rmdir) are + * locks are released and the methods passed in (@mkdir and @rmdir) are * called without locks and with the name of the directory being created * within the instances directory. * diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 86e5028bfa20..a69f363b61bf 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -33,7 +33,7 @@ /* * If the arch's mcount caller does not support all of ftrace's * features, then it must call an indirect function that - * does. Or at least does enough to prevent any unwelcomed side effects. + * does. Or at least does enough to prevent any unwelcome side effects. */ #if !ARCH_SUPPORTS_FTRACE_OPS # define FTRACE_FORCE_LIST_FUNC 1 @@ -389,7 +389,7 @@ DECLARE_PER_CPU(int, disable_stack_tracer); */ static inline void stack_tracer_disable(void) { - /* Preemption or interupts must be disabled */ + /* Preemption or interrupts must be disabled */ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); this_cpu_inc(disable_stack_tracer); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8cba64ce23a4..36e27c1f42e0 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -206,7 +206,7 @@ static inline unsigned int tracing_gen_ctx_dec(void) trace_ctx = tracing_gen_ctx(); /* - * Subtract one from the preeption counter if preemption is enabled, + * Subtract one from the preemption counter if preemption is enabled, * see trace_event_buffer_reserve()for details. */ if (IS_ENABLED(CONFIG_PREEMPTION)) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 9cfb099da58f..13f65420f188 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -465,7 +465,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * * * * The declared 'local variable' is called '__entry' * * - * * __field(pid_t, prev_prid) is equivalent to a standard declariton: + * * __field(pid_t, prev_prid) is equivalent to a standard declaration: * * * * pid_t prev_pid; * * diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 9f0d3b7d56b0..ba78a5602cd1 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -49,7 +49,7 @@ TRACE_EVENT(io_uring_create, ); /** - * io_uring_register - called after a buffer/file/eventfd was succesfully + * io_uring_register - called after a buffer/file/eventfd was successfully * registered for a ring * * @ctx: pointer to a ring context structure diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 5fc29400e1a2..97177c10bf64 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -48,7 +48,7 @@ TRACE_EVENT(rcu_utilization, * RCU flavor, the grace-period number, and a string identifying the * grace-period-related event as follows: * - * "AccReadyCB": CPU acclerates new callbacks to RCU_NEXT_READY_TAIL. + * "AccReadyCB": CPU accelerates new callbacks to RCU_NEXT_READY_TAIL. * "AccWaitCB": CPU accelerates new callbacks to RCU_WAIT_TAIL. * "newreq": Request a new grace period. * "start": Start a grace period. diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index cbe3e152d24c..1eca2305ca42 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -174,7 +174,7 @@ DEFINE_EVENT(sched_wakeup_template, sched_waking, TP_ARGS(p)); /* - * Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG. + * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING. * It is not always called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 19abb6c3eb73..6ad031c71be7 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -119,7 +119,7 @@ TRACE_EVENT(timer_expire_entry, * When used in combination with the timer_expire_entry tracepoint we can * determine the runtime of the timer callback function. * - * NOTE: Do NOT derefernce timer in TP_fast_assign. The pointer might + * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might * be invalid. We solely track the pointer. */ DEFINE_EVENT(timer_class, timer_expire_exit, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0c45d923f0f..49f7ddb36149 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -658,7 +658,7 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ i++; - /* skip optional "[0 +-][num]" width formating field */ + /* skip optional "[0 +-][num]" width formatting field */ while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || fmt[i] == ' ') i++; @@ -748,7 +748,8 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, fmt_cnt++; } - /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + /* + * The maximum we can have is MAX_SEQ_PRINTF_VARARGS parameters, so just give * all of them to seq_printf(). */ seq_printf(m, fmt, params[0], params[1], params[2], params[3], diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 29a6ebeebc9e..b8a0d1d564fb 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -42,7 +42,7 @@ bool ftrace_graph_is_dead(void) } /** - * ftrace_graph_stop - set to permanently disable function graph tracincg + * ftrace_graph_stop - set to permanently disable function graph tracing * * In case of an error int function graph tracing, this is called * to try to keep function graph tracing from causing any more harm. @@ -117,7 +117,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, /* * Skip graph tracing if the return location is served by direct trampoline, - * since call sequence and return addresses is unpredicatable anymore. + * since call sequence and return addresses are unpredictable anyway. * Ex: BPF trampoline may call original function and may skip frame * depending on type of BPF programs attached. */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d8e35575549..eb7d677cb466 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1045,7 +1045,7 @@ struct ftrace_ops global_ops = { }; /* - * Used by the stack undwinder to know about dynamic ftrace trampolines. + * Used by the stack unwinder to know about dynamic ftrace trampolines. */ struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { @@ -3000,7 +3000,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command) * When the kernel is preemptive, tasks can be preempted * while on a ftrace trampoline. Just scheduling a task on * a CPU is not good enough to flush them. Calling - * synchornize_rcu_tasks() will wait for those tasks to + * synchronize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ if (IS_ENABLED(CONFIG_PREEMPTION)) @@ -5373,7 +5373,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct); * @reset - non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled - * If @ip is NULL, it failes to update filter. + * If @ip is NULL, it fails to update filter. */ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset) @@ -6325,7 +6325,7 @@ clear_mod_from_hash(struct ftrace_page *pg, struct ftrace_hash *hash) } } -/* Clear any records from hashs */ +/* Clear any records from hashes */ static void clear_mod_from_hashes(struct ftrace_page *pg) { struct trace_array *tr; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1c61a8cd7b99..f4216df58e31 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3154,7 +3154,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * is called before preempt_count() is updated, since the check will * be on the NORMAL bit, the TRANSITION bit will then be set. If an * NMI then comes in, it will set the NMI bit, but when the NMI code - * does the trace_recursive_unlock() it will clear the TRANSTION bit + * does the trace_recursive_unlock() it will clear the TRANSITION bit * and leave the NMI bit set. But this is fine, because the interrupt * code that set the TRANSITION bit will then clear the NMI bit when it * calls trace_recursive_unlock(). If another NMI comes in, it will diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index a4b4bbf8c3bf..0b15e975d2c2 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Test module for in-kernel sythetic event creation and generation. + * Test module for in-kernel synthetic event creation and generation. * * Copyright (C) 2019 Tom Zanussi */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e32f5a49f1cf..c8e54b674d3e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -514,7 +514,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list) * @filtered_pids: The list of pids to check * @search_pid: The PID to find in @filtered_pids * - * Returns true if @search_pid is fonud in @filtered_pids, and false otherwis. + * Returns true if @search_pid is found in @filtered_pids, and false otherwise. */ bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) @@ -545,7 +545,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { /* - * If filterd_no_pids is not empty, and the task's pid is listed + * If filtered_no_pids is not empty, and the task's pid is listed * in filtered_no_pids, then return true. * Otherwise, if filtered_pids is empty, that means we can * trace all tasks. If it has content, then only trace pids @@ -612,7 +612,7 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) (*pos)++; - /* pid already is +1 of the actual prevous bit */ + /* pid already is +1 of the actual previous bit */ pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); /* Return pid + 1 to allow zero to be represented */ @@ -834,7 +834,7 @@ DEFINE_MUTEX(trace_types_lock); * The content of events may become garbage if we allow other process consumes * these events concurrently: * A) the page of the consumed events may become a normal page - * (not reader page) in ring buffer, and this page will be rewrited + * (not reader page) in ring buffer, and this page will be rewritten * by events producer. * B) The page of the consumed events may become a page for splice_read, * and this page will be returned to system. @@ -1520,7 +1520,7 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) #undef C #define C(a, b) b -/* These must match the bit postions in trace_iterator_flags */ +/* These must match the bit positions in trace_iterator_flags */ static const char *trace_options[] = { TRACE_FLAGS NULL @@ -3368,7 +3368,7 @@ int trace_array_vprintk(struct trace_array *tr, * buffer (use trace_printk() for that), as writing into the top level * buffer should only have events that can be individually disabled. * trace_printk() is only used for debugging a kernel, and should not - * be ever encorporated in normal use. + * be ever incorporated in normal use. * * trace_array_printk() can be used, as it will not add noise to the * top level tracing buffer. @@ -7533,11 +7533,11 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) * @cmd: The tracing command that caused the error * @str: The string to position the caret at within @cmd * - * Finds the position of the first occurence of @str within @cmd. The + * Finds the position of the first occurrence of @str within @cmd. The * return value can be passed to tracing_log_err() for caret placement * within @cmd. * - * Returns the index within @cmd of the first occurence of @str or 0 + * Returns the index within @cmd of the first occurrence of @str or 0 * if @str was not found. */ unsigned int err_pos(char *cmd, const char *str) @@ -9320,7 +9320,7 @@ int tracing_init_dentry(void) * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount * the tracefs file system there, so older tools still - * work with the newer kerenl. + * work with the newer kernel. */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2952bd92bc62..5506424eae2a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1347,7 +1347,7 @@ __event_trigger_test_discard(struct trace_event_file *file, /** * event_trigger_unlock_commit - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event + * @file: The file pointer associated with the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself @@ -1374,7 +1374,7 @@ event_trigger_unlock_commit(struct trace_event_file *file, /** * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer assoctiated to the event + * @file: The file pointer associated with the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 288ad2c274fb..03be4435d103 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -16,7 +16,7 @@ static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses - * suprises + * surprises */ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) perf_trace_t; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f58106eaf8cb..80e96989770e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2646,7 +2646,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) } /* - * Since calls are grouped by systems, the likelyhood that the + * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the * previous call is high. As an optimization, we skip searching * for a map[] that matches the call's system if the last call @@ -2706,7 +2706,7 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr) } /* - * Just create a decriptor for early init. A descriptor is required + * Just create a descriptor for early init. A descriptor is required * for enabling events at boot. We want to enable events before * the filesystem is initialized. */ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e91259f6a722..9730acf3c03e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -296,7 +296,7 @@ enum { * and "FALSE" the program entry after that, we are now done with the first * pass. * - * Making the above "a || b && c" have a progam of: + * Making the above "a || b && c" have a program of: * prog[0] = { "a", 1, 2 } * prog[1] = { "b", 0, 2 } * prog[2] = { "c", 0, 3 } @@ -390,7 +390,7 @@ enum { * F: return FALSE * * As "r = a; if (!r) goto n5;" is obviously the same as - * "if (!a) goto n5;" without doing anything we can interperate the + * "if (!a) goto n5;" without doing anything we can interpret the * program as: * n1: if (!a) goto n5; * n2: if (!b) goto n5; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8d71e6c83f10..2ac75eb6aa86 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1385,7 +1385,7 @@ static int destroy_synth_event(struct synth_event *se) /** * synth_event_delete - Delete a synthetic event - * @event_name: The name of the new sythetic event + * @event_name: The name of the new synthetic event * * Delete a synthetic event that was created with synth_event_create(). * diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0aa6e6faa943..0de6837722da 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -764,7 +764,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, * - we are inside irq code * - we just entered irq code * - * retunns 0 if + * returns 0 if * - funcgraph-interrupts option is set * - we are not inside irq code */ diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 34dc1a712dcb..632ef88131a9 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -83,7 +83,7 @@ struct hwlat_sample { u64 nmi_total_ts; /* Total time spent in NMIs */ struct timespec64 timestamp; /* wall time */ int nmi_count; /* # NMIs during this sample */ - int count; /* # of iteratons over threash */ + int count; /* # of iterations over thresh */ }; /* keep the global state somewhere. */ @@ -389,7 +389,7 @@ static int start_kthread(struct trace_array *tr) } /** - * stop_kthread - Inform the hardware latency samping/detector kthread to stop + * stop_kthread - Inform the hardware latency sampling/detector kthread to stop * * This kicks the running hardware latency sampling/detector kernel thread and * tells it to stop sampling now. Use this on unload and at system shutdown. diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6fe770d86dc3..ea6178cb5e33 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1748,7 +1748,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) if (trace_probe_test_flag(&tk->tp, TP_FLAG_PROFILE)) kretprobe_perf_func(tk, ri, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return 0; /* We don't tweak kernel, so just return 0 */ } NOKPROBE_SYMBOL(kretprobe_dispatcher); diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ec589a4612df..15413ad7cef2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -168,7 +168,7 @@ void __trace_probe_log_err(int offset, int err_type) if (!trace_probe_log.argv) return; - /* Recalcurate the length and allocate buffer */ + /* Recalculate the length and allocate buffer */ for (i = 0; i < trace_probe_log.argc; i++) { if (i == trace_probe_log.index) pos = len; @@ -182,7 +182,7 @@ void __trace_probe_log_err(int offset, int err_type) /** * Set the error position is next to the last arg + space. * Note that len includes the terminal null and the cursor - * appaers at pos + 1. + * appears at pos + 1. */ pos = len; offset = 0; @@ -592,7 +592,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } /* - * Since $comm and immediate string can not be dereferred, + * Since $comm and immediate string can not be dereferenced, * we can find those by strcmp. */ if (strcmp(arg, "$comm") == 0 || strncmp(arg, "\\\"", 2) == 0) { diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 7ce4027089ee..227d518e5ba5 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -134,7 +134,7 @@ struct fetch_type { size_t size; /* Byte size of type */ int is_signed; /* Signed flag */ print_type_func_t print; /* Print functions */ - const char *fmt; /* Fromat string */ + const char *fmt; /* Format string */ const char *fmttype; /* Name in format file */ }; diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index e5282828f4a6..f003c5d02a3a 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -167,7 +167,7 @@ array: return code->op == FETCH_OP_END ? ret : -EILSEQ; } -/* Sum up total data length for dynamic arraies (strings) */ +/* Sum up total data length for dynamic arrays (strings) */ static nokprobe_inline int __get_data_size(struct trace_probe *tp, struct pt_regs *regs) { diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 73ef12092250..adf7ef194005 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -878,7 +878,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) int ret; /* - * Now that the big kernel lock is no longer preemptable, + * Now that the big kernel lock is no longer preemptible, * and this is called with the BKL held, it will always * fail. If preemption is already disabled, simply * pass the test. When the BKL is removed, or becomes @@ -940,7 +940,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * int ret; /* - * Now that the big kernel lock is no longer preemptable, + * Now that the big kernel lock is no longer preemptible, * and this is called with the BKL held, it will always * fail. If preemption is already disabled, simply * pass the test. When the BKL is removed, or becomes diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 1d84fcc78e3e..9c90b3a7dce2 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -16,7 +16,7 @@ * The buffer size is currently PAGE_SIZE, although it may become dynamic * in the future. * - * A write to the buffer will either succed or fail. That is, unlike + * A write to the buffer will either succeed or fail. That is, unlike * sprintf() there will not be a partial write (well it may write into * the buffer but it wont update the pointers). This allows users to * try to write something into the trace_seq buffer and if it fails @@ -73,7 +73,7 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) * @fmt: printf format string * * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace + * copy to user routines. To simplify formatting of a trace * trace_seq_printf() is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. @@ -133,7 +133,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * @fmt: printf format string * * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace + * copy to user routines. To simplify formatting of a trace * trace_seq_printf is used to store strings into a special * buffer (@s). Then the output may be either used by * the sequencer or pulled into another buffer. @@ -226,7 +226,7 @@ EXPORT_SYMBOL_GPL(trace_seq_puts); * @c: simple character to record * * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple charater + * copy to user routines. This function records a simple character * into a special buffer (@s) for later retrieval by a sequencer * or other mechanism. */ @@ -348,7 +348,7 @@ int trace_seq_path(struct trace_seq *s, const struct path *path) EXPORT_SYMBOL_GPL(trace_seq_path); /** - * trace_seq_to_user - copy the squence buffer to user space + * trace_seq_to_user - copy the sequence buffer to user space * @s: trace sequence descriptor * @ubuf: The userspace memory location to copy to * @cnt: The amount to copy @@ -363,7 +363,7 @@ EXPORT_SYMBOL_GPL(trace_seq_path); * * On failure it returns -EBUSY if all of the content in the * sequence has been already read, which includes nothing in the - * sequenc (@s->len == @s->readpos). + * sequence (@s->len == @s->readpos). * * Returns -EFAULT if the copy to userspace fails. */ From e43d5c7c3c3459b428431754672052503c5db9c8 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 11 Mar 2021 16:40:56 -0700 Subject: [PATCH 0204/1379] dt-bindings: i3c: Fix silvaco,i3c-master-v1 compatible string The example for the Silvaco I3C master doesn't match the schema's compatible string. Fix it. Cc: Miquel Raynal Cc: Conor Culhane Cc: Alexandre Belloni Cc: linux-i3c@lists.infradead.org Signed-off-by: Rob Herring Reviewed-by: Miquel Raynal Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210311234056.1588751-1-robh@kernel.org --- Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml b/Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml index adb5165505aa..62f3ca66274f 100644 --- a/Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml +++ b/Documentation/devicetree/bindings/i3c/silvaco,i3c-master.yaml @@ -49,7 +49,7 @@ additionalProperties: true examples: - | i3c-master@a0000000 { - compatible = "silvaco,i3c-master"; + compatible = "silvaco,i3c-master-v1"; clocks = <&zynqmp_clk 71>, <&fclk>, <&sclk>; clock-names = "pclk", "fast_clk", "slow_clk"; interrupt-parent = <&gic>; From 8d69f62fddf6c1a8c7745120c4d6aab9322b001a Mon Sep 17 00:00:00 2001 From: Johannes Hahn Date: Wed, 17 Mar 2021 08:52:27 +0100 Subject: [PATCH 0205/1379] rtc: rx6110: add ACPI bindings to I2C This allows the RX6110 driver to be automatically assigned to the right device on the I2C bus. Signed-off-by: Johannes Hahn Co-developed-by: Claudius Heine Signed-off-by: Claudius Heine Reviewed-by: Andy Shevchenko Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210317075228.683184-1-ch@denx.de --- drivers/rtc/rtc-rx6110.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/rtc/rtc-rx6110.c b/drivers/rtc/rtc-rx6110.c index 79161d4c6ce4..f4d425002f7f 100644 --- a/drivers/rtc/rtc-rx6110.c +++ b/drivers/rtc/rtc-rx6110.c @@ -447,6 +447,12 @@ static int rx6110_i2c_probe(struct i2c_client *client, return rx6110_probe(rx6110, &client->dev); } +static const struct acpi_device_id rx6110_i2c_acpi_match[] = { + { "SECC6110" }, + { } +}; +MODULE_DEVICE_TABLE(acpi, rx6110_i2c_acpi_match); + static const struct i2c_device_id rx6110_i2c_id[] = { { "rx6110", 0 }, { } @@ -456,6 +462,7 @@ MODULE_DEVICE_TABLE(i2c, rx6110_i2c_id); static struct i2c_driver rx6110_i2c_driver = { .driver = { .name = RX6110_DRIVER_NAME, + .acpi_match_table = rx6110_i2c_acpi_match, }, .probe = rx6110_i2c_probe, .id_table = rx6110_i2c_id, From 4613bdcc122e9e60e0763c5851337470d25d7e40 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Wed, 17 Mar 2021 15:24:01 +0530 Subject: [PATCH 0206/1379] kernel: trace: Mundane typo fixes in the file trace_events_filter.c s/callin/calling/ Link: https://lkml.kernel.org/r/20210317095401.1854544-1-unixbhaskar@gmail.com Acked-by: Randy Dunlap Signed-off-by: Bhaskar Chowdhury [ Other fixes already done by Ingo Molnar ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9730acf3c03e..49de3e21e9bc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -256,7 +256,7 @@ enum { * is "&&" we don't call update_preds(). Instead continue to "c". As the * next token after "c" is not "&&" but the end of input, we first process the * "&&" by calling update_preds() for the "&&" then we process the "||" by - * callin updates_preds() with the values for processing "||". + * calling updates_preds() with the values for processing "||". * * What does that mean? What update_preds() does is to first save the "target" * of the program entry indexed by the current program entry's "target" From e9d54be9ad5ee2eed3056d8901ac4b3b115d95a5 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 16 Feb 2021 20:14:46 +0100 Subject: [PATCH 0207/1379] ARM: 9061/1: kprobes: fix UNPREDICTABLE warnings GNU as warns twice for this file: Warning: using r15 results in unpredictable behaviour via the Arm ARM: K1.1.1 Overview of the constraints on Armv7 UNPREDICTABLE behaviors The term UNPREDICTABLE describes a number of cases where the architecture has a feature that software must not use. Ard notes: These are selftests that aim to ensure that kprobe never attempts to replace the opcodes in question with a probe, but they are not actually executed, or expected to occur in real code. Link: https://github.com/ClangBuiltLinux/linux/issues/1271 Link: https://reviews.llvm.org/D95586 Reported-by: kernelci.org bot Suggested-by: Peter Smith Suggested-by: Renato Golin Suggested-by: David Spickett Acked-by: Ard Biesheuvel Signed-off-by: Nick Desaulniers Signed-off-by: Russell King --- arch/arm/probes/kprobes/test-arm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/probes/kprobes/test-arm.c b/arch/arm/probes/kprobes/test-arm.c index 977369f1aa48..2543106a203e 100644 --- a/arch/arm/probes/kprobes/test-arm.c +++ b/arch/arm/probes/kprobes/test-arm.c @@ -166,10 +166,10 @@ void kprobe_arm_test_cases(void) /* Data-processing with PC as a target and status registers updated */ TEST_UNSUPPORTED("movs pc, r1") - TEST_UNSUPPORTED("movs pc, r1, lsl r2") + TEST_UNSUPPORTED(__inst_arm(0xe1b0f211) " @movs pc, r1, lsl r2") TEST_UNSUPPORTED("movs pc, #0x10000") TEST_UNSUPPORTED("adds pc, lr, r1") - TEST_UNSUPPORTED("adds pc, lr, r1, lsl r2") + TEST_UNSUPPORTED(__inst_arm(0xe09ef211) " @adds pc, lr, r1, lsl r2") TEST_UNSUPPORTED("adds pc, lr, #4") /* Data-processing with SP as target */ From 7c182ebab9f338d07571c7f86d4d64c385ad1b9c Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 16 Feb 2021 20:16:13 +0100 Subject: [PATCH 0208/1379] ARM: 9062/1: kprobes: rewrite test-arm.c in UAL Clang's integrated assembler only accepts UAL syntax, rewrite the instructions that were changed by RVCTv2.1. The document "Assembly language changes after RVCTv2.1" was very helpful. .syntax unified directive is added, since -masm-syntax-unified is unreliable for older but supported versions of GCC. See also: commit fe09d9c641f2 ("ARM: 8852/1: uaccess: use unified assembler language syntax") Link: https://developer.arm.com/documentation/dui0473/c/writing-arm-assembly-language/assembly-language-changes-after-rvctv2-1 Link: https://github.com/ClangBuiltLinux/linux/issues/1271 Reported-by: Arnd Bergmann Acked-by: Ard Biesheuvel Reviewed-by: Nathan Chancellor Signed-off-by: Nick Desaulniers Signed-off-by: Russell King --- arch/arm/probes/kprobes/test-arm.c | 290 ++++++++++++++-------------- arch/arm/probes/kprobes/test-core.h | 1 + 2 files changed, 146 insertions(+), 145 deletions(-) diff --git a/arch/arm/probes/kprobes/test-arm.c b/arch/arm/probes/kprobes/test-arm.c index 2543106a203e..a0dae35ffacd 100644 --- a/arch/arm/probes/kprobes/test-arm.c +++ b/arch/arm/probes/kprobes/test-arm.c @@ -55,25 +55,25 @@ void kprobe_arm_test_cases(void) TEST_GROUP("Data-processing (register), (register-shifted register), (immediate)") #define _DATA_PROCESSING_DNM(op,s,val) \ - TEST_RR( op "eq" s " r0, r",1, VAL1,", r",2, val, "") \ - TEST_RR( op "ne" s " r1, r",1, VAL1,", r",2, val, ", lsl #3") \ - TEST_RR( op "cs" s " r2, r",3, VAL1,", r",2, val, ", lsr #4") \ - TEST_RR( op "cc" s " r3, r",3, VAL1,", r",2, val, ", asr #5") \ - TEST_RR( op "mi" s " r4, r",5, VAL1,", r",2, N(val),", asr #6") \ - TEST_RR( op "pl" s " r5, r",5, VAL1,", r",2, val, ", ror #7") \ - TEST_RR( op "vs" s " r6, r",7, VAL1,", r",2, val, ", rrx") \ - TEST_R( op "vc" s " r6, r",7, VAL1,", pc, lsl #3") \ - TEST_R( op "vc" s " r6, r",7, VAL1,", sp, lsr #4") \ - TEST_R( op "vc" s " r6, pc, r",7, VAL1,", asr #5") \ - TEST_R( op "vc" s " r6, sp, r",7, VAL1,", ror #6") \ - TEST_RRR( op "hi" s " r8, r",9, VAL1,", r",14,val, ", lsl r",0, 3,"")\ - TEST_RRR( op "ls" s " r9, r",9, VAL1,", r",14,val, ", lsr r",7, 4,"")\ - TEST_RRR( op "ge" s " r10, r",11,VAL1,", r",14,val, ", asr r",7, 5,"")\ - TEST_RRR( op "lt" s " r11, r",11,VAL1,", r",14,N(val),", asr r",7, 6,"")\ - TEST_RR( op "gt" s " r12, r13" ", r",14,val, ", ror r",14,7,"")\ - TEST_RR( op "le" s " r14, r",0, val, ", r13" ", lsl r",14,8,"")\ - TEST_R( op "eq" s " r0, r",11,VAL1,", #0xf5") \ - TEST_R( op "ne" s " r11, r",0, VAL1,", #0xf5000000") \ + TEST_RR( op s "eq r0, r",1, VAL1,", r",2, val, "") \ + TEST_RR( op s "ne r1, r",1, VAL1,", r",2, val, ", lsl #3") \ + TEST_RR( op s "cs r2, r",3, VAL1,", r",2, val, ", lsr #4") \ + TEST_RR( op s "cc r3, r",3, VAL1,", r",2, val, ", asr #5") \ + TEST_RR( op s "mi r4, r",5, VAL1,", r",2, N(val),", asr #6") \ + TEST_RR( op s "pl r5, r",5, VAL1,", r",2, val, ", ror #7") \ + TEST_RR( op s "vs r6, r",7, VAL1,", r",2, val, ", rrx") \ + TEST_R( op s "vc r6, r",7, VAL1,", pc, lsl #3") \ + TEST_R( op s "vc r6, r",7, VAL1,", sp, lsr #4") \ + TEST_R( op s "vc r6, pc, r",7, VAL1,", asr #5") \ + TEST_R( op s "vc r6, sp, r",7, VAL1,", ror #6") \ + TEST_RRR( op s "hi r8, r",9, VAL1,", r",14,val, ", lsl r",0, 3,"")\ + TEST_RRR( op s "ls r9, r",9, VAL1,", r",14,val, ", lsr r",7, 4,"")\ + TEST_RRR( op s "ge r10, r",11,VAL1,", r",14,val, ", asr r",7, 5,"")\ + TEST_RRR( op s "lt r11, r",11,VAL1,", r",14,N(val),", asr r",7, 6,"")\ + TEST_RR( op s "gt r12, r13" ", r",14,val, ", ror r",14,7,"")\ + TEST_RR( op s "le r14, r",0, val, ", r13" ", lsl r",14,8,"")\ + TEST_R( op s "eq r0, r",11,VAL1,", #0xf5") \ + TEST_R( op s "ne r11, r",0, VAL1,", #0xf5000000") \ TEST_R( op s " r7, r",8, VAL2,", #0x000af000") \ TEST( op s " r4, pc" ", #0x00005a00") @@ -104,23 +104,23 @@ void kprobe_arm_test_cases(void) TEST_R( op " r",8, VAL2,", #0x000af000") #define _DATA_PROCESSING_DM(op,s,val) \ - TEST_R( op "eq" s " r0, r",1, val, "") \ - TEST_R( op "ne" s " r1, r",1, val, ", lsl #3") \ - TEST_R( op "cs" s " r2, r",3, val, ", lsr #4") \ - TEST_R( op "cc" s " r3, r",3, val, ", asr #5") \ - TEST_R( op "mi" s " r4, r",5, N(val),", asr #6") \ - TEST_R( op "pl" s " r5, r",5, val, ", ror #7") \ - TEST_R( op "vs" s " r6, r",10,val, ", rrx") \ - TEST( op "vs" s " r7, pc, lsl #3") \ - TEST( op "vs" s " r7, sp, lsr #4") \ - TEST_RR( op "vc" s " r8, r",7, val, ", lsl r",0, 3,"") \ - TEST_RR( op "hi" s " r9, r",9, val, ", lsr r",7, 4,"") \ - TEST_RR( op "ls" s " r10, r",9, val, ", asr r",7, 5,"") \ - TEST_RR( op "ge" s " r11, r",11,N(val),", asr r",7, 6,"") \ - TEST_RR( op "lt" s " r12, r",11,val, ", ror r",14,7,"") \ - TEST_R( op "gt" s " r14, r13" ", lsl r",14,8,"") \ - TEST( op "eq" s " r0, #0xf5") \ - TEST( op "ne" s " r11, #0xf5000000") \ + TEST_R( op s "eq r0, r",1, val, "") \ + TEST_R( op s "ne r1, r",1, val, ", lsl #3") \ + TEST_R( op s "cs r2, r",3, val, ", lsr #4") \ + TEST_R( op s "cc r3, r",3, val, ", asr #5") \ + TEST_R( op s "mi r4, r",5, N(val),", asr #6") \ + TEST_R( op s "pl r5, r",5, val, ", ror #7") \ + TEST_R( op s "vs r6, r",10,val, ", rrx") \ + TEST( op s "vs r7, pc, lsl #3") \ + TEST( op s "vs r7, sp, lsr #4") \ + TEST_RR( op s "vc r8, r",7, val, ", lsl r",0, 3,"") \ + TEST_RR( op s "hi r9, r",9, val, ", lsr r",7, 4,"") \ + TEST_RR( op s "ls r10, r",9, val, ", asr r",7, 5,"") \ + TEST_RR( op s "ge r11, r",11,N(val),", asr r",7, 6,"") \ + TEST_RR( op s "lt r12, r",11,val, ", ror r",14,7,"") \ + TEST_R( op s "gt r14, r13" ", lsl r",14,8,"") \ + TEST( op s "eq r0, #0xf5") \ + TEST( op s "ne r11, #0xf5000000") \ TEST( op s " r7, #0x000af000") \ TEST( op s " r4, #0x00005a00") @@ -352,7 +352,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe000029f) " @ mul r0, pc, r2") TEST_UNSUPPORTED(__inst_arm(0xe0000f91) " @ mul r0, r1, pc") TEST_RR( "muls r0, r",1, VAL1,", r",2, VAL2,"") - TEST_RR( "mullss r7, r",8, VAL2,", r",9, VAL2,"") + TEST_RR( "mulsls r7, r",8, VAL2,", r",9, VAL2,"") TEST_R( "muls lr, r",4, VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe01f0291) " @ muls pc, r1, r2") @@ -361,7 +361,7 @@ void kprobe_arm_test_cases(void) TEST_RR( "mla lr, r",1, VAL2,", r",2, VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe02f3291) " @ mla pc, r1, r2, r3") TEST_RRR( "mlas r0, r",1, VAL1,", r",2, VAL2,", r",3, VAL3,"") - TEST_RRR( "mlahis r7, r",8, VAL3,", r",9, VAL1,", r",10, VAL2,"") + TEST_RRR( "mlashi r7, r",8, VAL3,", r",9, VAL1,", r",10, VAL2,"") TEST_RR( "mlas lr, r",1, VAL2,", r",2, VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe03f3291) " @ mlas pc, r1, r2, r3") @@ -394,7 +394,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe081f392) " @ umull pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe08f1392) " @ umull r1, pc, r2, r3") TEST_RR( "umulls r0, r1, r",2, VAL1,", r",3, VAL2,"") - TEST_RR( "umulllss r7, r8, r",9, VAL2,", r",10, VAL1,"") + TEST_RR( "umullsls r7, r8, r",9, VAL2,", r",10, VAL1,"") TEST_R( "umulls lr, r12, r",11,VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe091f392) " @ umulls pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe09f1392) " @ umulls r1, pc, r2, r3") @@ -405,7 +405,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0af1392) " @ umlal pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0a1f392) " @ umlal r1, pc, r2, r3") TEST_RRRR( "umlals r",0, VAL1,", r",1, VAL2,", r",2, VAL3,", r",3, VAL4) - TEST_RRRR( "umlalles r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3) + TEST_RRRR( "umlalsle r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3) TEST_RRR( "umlals r",14,VAL3,", r",7, VAL4,", r",5, VAL1,", r13") TEST_UNSUPPORTED(__inst_arm(0xe0bf1392) " @ umlals pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0b1f392) " @ umlals r1, pc, r2, r3") @@ -416,7 +416,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0c1f392) " @ smull pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0cf1392) " @ smull r1, pc, r2, r3") TEST_RR( "smulls r0, r1, r",2, VAL1,", r",3, VAL2,"") - TEST_RR( "smulllss r7, r8, r",9, VAL2,", r",10, VAL1,"") + TEST_RR( "smullsls r7, r8, r",9, VAL2,", r",10, VAL1,"") TEST_R( "smulls lr, r12, r",11,VAL3,", r13") TEST_UNSUPPORTED(__inst_arm(0xe0d1f392) " @ smulls pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0df1392) " @ smulls r1, pc, r2, r3") @@ -427,7 +427,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0ef1392) " @ smlal pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0e1f392) " @ smlal r1, pc, r2, r3") TEST_RRRR( "smlals r",0, VAL1,", r",1, VAL2,", r",2, VAL3,", r",3, VAL4) - TEST_RRRR( "smlalles r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3) + TEST_RRRR( "smlalsle r",8, VAL4,", r",9, VAL1,", r",10,VAL2,", r",11,VAL3) TEST_RRR( "smlals r",14,VAL3,", r",7, VAL4,", r",5, VAL1,", r13") TEST_UNSUPPORTED(__inst_arm(0xe0ff1392) " @ smlals pc, r1, r2, r3") TEST_UNSUPPORTED(__inst_arm(0xe0f0f392) " @ smlals r0, pc, r2, r3") @@ -450,7 +450,7 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe10f0091) " @ swp r0, r1, [pc]") #if __LINUX_ARM_ARCH__ < 6 TEST_RP("swpb lr, r",7,VAL2,", [r",8,0,"]") - TEST_R( "swpvsb r0, r",1,VAL1,", [sp]") + TEST_R( "swpbvs r0, r",1,VAL1,", [sp]") #else TEST_UNSUPPORTED(__inst_arm(0xe148e097) " @ swpb lr, r7, [r8]") TEST_UNSUPPORTED(__inst_arm(0x614d0091) " @ swpvsb r0, r1, [sp]") @@ -477,11 +477,11 @@ void kprobe_arm_test_cases(void) TEST_GROUP("Extra load/store instructions") TEST_RPR( "strh r",0, VAL1,", [r",1, 48,", -r",2, 24,"]") - TEST_RPR( "streqh r",14,VAL2,", [r",11,0, ", r",12, 48,"]") - TEST_UNSUPPORTED( "streqh r14, [r13, r12]") - TEST_UNSUPPORTED( "streqh r14, [r12, r13]") + TEST_RPR( "strheq r",14,VAL2,", [r",11,0, ", r",12, 48,"]") + TEST_UNSUPPORTED( "strheq r14, [r13, r12]") + TEST_UNSUPPORTED( "strheq r14, [r12, r13]") TEST_RPR( "strh r",1, VAL1,", [r",2, 24,", r",3, 48,"]!") - TEST_RPR( "strneh r",12,VAL2,", [r",11,48,", -r",10,24,"]!") + TEST_RPR( "strhne r",12,VAL2,", [r",11,48,", -r",10,24,"]!") TEST_RPR( "strh r",2, VAL1,", [r",3, 24,"], r",4, 48,"") TEST_RPR( "strh r",10,VAL2,", [r",9, 48,"], -r",11,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1afc0ba) " @ strh r12, [pc, r10]!") @@ -489,9 +489,9 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe089a0bf) " @ strh r10, [r9], pc") TEST_PR( "ldrh r0, [r",0, 48,", -r",2, 24,"]") - TEST_PR( "ldrcsh r14, [r",13,0, ", r",12, 48,"]") + TEST_PR( "ldrhcs r14, [r",13,0, ", r",12, 48,"]") TEST_PR( "ldrh r1, [r",2, 24,", r",3, 48,"]!") - TEST_PR( "ldrcch r12, [r",11,48,", -r",10,24,"]!") + TEST_PR( "ldrhcc r12, [r",11,48,", -r",10,24,"]!") TEST_PR( "ldrh r2, [r",3, 24,"], r",4, 48,"") TEST_PR( "ldrh r10, [r",9, 48,"], -r",11,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1bfc0ba) " @ ldrh r12, [pc, r10]!") @@ -499,9 +499,9 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe099a0bf) " @ ldrh r10, [r9], pc") TEST_RP( "strh r",0, VAL1,", [r",1, 24,", #-2]") - TEST_RP( "strmih r",14,VAL2,", [r",13,0, ", #2]") + TEST_RP( "strhmi r",14,VAL2,", [r",13,0, ", #2]") TEST_RP( "strh r",1, VAL1,", [r",2, 24,", #4]!") - TEST_RP( "strplh r",12,VAL2,", [r",11,24,", #-4]!") + TEST_RP( "strhpl r",12,VAL2,", [r",11,24,", #-4]!") TEST_RP( "strh r",2, VAL1,", [r",3, 24,"], #48") TEST_RP( "strh r",10,VAL2,", [r",9, 64,"], #-48") TEST_RP( "strh r",3, VAL1,", [r",13,TEST_MEMORY_SIZE,", #-"__stringify(MAX_STACK_SIZE)"]!") @@ -511,9 +511,9 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0c9f3b0) " @ strh pc, [r9], #48") TEST_P( "ldrh r0, [r",0, 24,", #-2]") - TEST_P( "ldrvsh r14, [r",13,0, ", #2]") + TEST_P( "ldrhvs r14, [r",13,0, ", #2]") TEST_P( "ldrh r1, [r",2, 24,", #4]!") - TEST_P( "ldrvch r12, [r",11,24,", #-4]!") + TEST_P( "ldrhvc r12, [r",11,24,", #-4]!") TEST_P( "ldrh r2, [r",3, 24,"], #48") TEST_P( "ldrh r10, [r",9, 64,"], #-48") TEST( "ldrh r0, [pc, #0]") @@ -521,18 +521,18 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0d9f3b0) " @ ldrh pc, [r9], #48") TEST_PR( "ldrsb r0, [r",0, 48,", -r",2, 24,"]") - TEST_PR( "ldrhisb r14, [r",13,0,", r",12, 48,"]") + TEST_PR( "ldrsbhi r14, [r",13,0,", r",12, 48,"]") TEST_PR( "ldrsb r1, [r",2, 24,", r",3, 48,"]!") - TEST_PR( "ldrlssb r12, [r",11,48,", -r",10,24,"]!") + TEST_PR( "ldrsbls r12, [r",11,48,", -r",10,24,"]!") TEST_PR( "ldrsb r2, [r",3, 24,"], r",4, 48,"") TEST_PR( "ldrsb r10, [r",9, 48,"], -r",11,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1bfc0da) " @ ldrsb r12, [pc, r10]!") TEST_UNSUPPORTED(__inst_arm(0xe099f0db) " @ ldrsb pc, [r9], r11") TEST_P( "ldrsb r0, [r",0, 24,", #-1]") - TEST_P( "ldrgesb r14, [r",13,0, ", #1]") + TEST_P( "ldrsbge r14, [r",13,0, ", #1]") TEST_P( "ldrsb r1, [r",2, 24,", #4]!") - TEST_P( "ldrltsb r12, [r",11,24,", #-4]!") + TEST_P( "ldrsblt r12, [r",11,24,", #-4]!") TEST_P( "ldrsb r2, [r",3, 24,"], #48") TEST_P( "ldrsb r10, [r",9, 64,"], #-48") TEST( "ldrsb r0, [pc, #0]") @@ -540,18 +540,18 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe0d9f3d0) " @ ldrsb pc, [r9], #48") TEST_PR( "ldrsh r0, [r",0, 48,", -r",2, 24,"]") - TEST_PR( "ldrgtsh r14, [r",13,0, ", r",12, 48,"]") + TEST_PR( "ldrshgt r14, [r",13,0, ", r",12, 48,"]") TEST_PR( "ldrsh r1, [r",2, 24,", r",3, 48,"]!") - TEST_PR( "ldrlesh r12, [r",11,48,", -r",10,24,"]!") + TEST_PR( "ldrshle r12, [r",11,48,", -r",10,24,"]!") TEST_PR( "ldrsh r2, [r",3, 24,"], r",4, 48,"") TEST_PR( "ldrsh r10, [r",9, 48,"], -r",11,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1bfc0fa) " @ ldrsh r12, [pc, r10]!") TEST_UNSUPPORTED(__inst_arm(0xe099f0fb) " @ ldrsh pc, [r9], r11") TEST_P( "ldrsh r0, [r",0, 24,", #-1]") - TEST_P( "ldreqsh r14, [r",13,0 ,", #1]") + TEST_P( "ldrsheq r14, [r",13,0 ,", #1]") TEST_P( "ldrsh r1, [r",2, 24,", #4]!") - TEST_P( "ldrnesh r12, [r",11,24,", #-4]!") + TEST_P( "ldrshne r12, [r",11,24,", #-4]!") TEST_P( "ldrsh r2, [r",3, 24,"], #48") TEST_P( "ldrsh r10, [r",9, 64,"], #-48") TEST( "ldrsh r0, [pc, #0]") @@ -571,30 +571,30 @@ void kprobe_arm_test_cases(void) #if __LINUX_ARM_ARCH__ >= 5 TEST_RPR( "strd r",0, VAL1,", [r",1, 48,", -r",2,24,"]") - TEST_RPR( "strccd r",8, VAL2,", [r",11,0, ", r",12,48,"]") - TEST_UNSUPPORTED( "strccd r8, [r13, r12]") - TEST_UNSUPPORTED( "strccd r8, [r12, r13]") + TEST_RPR( "strdcc r",8, VAL2,", [r",11,0, ", r",12,48,"]") + TEST_UNSUPPORTED( "strdcc r8, [r13, r12]") + TEST_UNSUPPORTED( "strdcc r8, [r12, r13]") TEST_RPR( "strd r",4, VAL1,", [r",2, 24,", r",3, 48,"]!") - TEST_RPR( "strcsd r",12,VAL2,", [r",11,48,", -r",10,24,"]!") - TEST_RPR( "strd r",2, VAL1,", [r",5, 24,"], r",4,48,"") - TEST_RPR( "strd r",10,VAL2,", [r",9, 48,"], -r",7,24,"") + TEST_RPR( "strdcs r",12,VAL2,", r13, [r",11,48,", -r",10,24,"]!") + TEST_RPR( "strd r",2, VAL1,", r3, [r",5, 24,"], r",4,48,"") + TEST_RPR( "strd r",10,VAL2,", r11, [r",9, 48,"], -r",7,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1afc0fa) " @ strd r12, [pc, r10]!") TEST_PR( "ldrd r0, [r",0, 48,", -r",2,24,"]") - TEST_PR( "ldrmid r8, [r",13,0, ", r",12,48,"]") + TEST_PR( "ldrdmi r8, [r",13,0, ", r",12,48,"]") TEST_PR( "ldrd r4, [r",2, 24,", r",3, 48,"]!") - TEST_PR( "ldrpld r6, [r",11,48,", -r",10,24,"]!") - TEST_PR( "ldrd r2, [r",5, 24,"], r",4,48,"") - TEST_PR( "ldrd r10, [r",9,48,"], -r",7,24,"") + TEST_PR( "ldrdpl r6, [r",11,48,", -r",10,24,"]!") + TEST_PR( "ldrd r2, r3, [r",5, 24,"], r",4,48,"") + TEST_PR( "ldrd r10, r11, [r",9,48,"], -r",7,24,"") TEST_UNSUPPORTED(__inst_arm(0xe1afc0da) " @ ldrd r12, [pc, r10]!") TEST_UNSUPPORTED(__inst_arm(0xe089f0db) " @ ldrd pc, [r9], r11") TEST_UNSUPPORTED(__inst_arm(0xe089e0db) " @ ldrd lr, [r9], r11") TEST_UNSUPPORTED(__inst_arm(0xe089c0df) " @ ldrd r12, [r9], pc") TEST_RP( "strd r",0, VAL1,", [r",1, 24,", #-8]") - TEST_RP( "strvsd r",8, VAL2,", [r",13,0, ", #8]") + TEST_RP( "strdvs r",8, VAL2,", [r",13,0, ", #8]") TEST_RP( "strd r",4, VAL1,", [r",2, 24,", #16]!") - TEST_RP( "strvcd r",12,VAL2,", [r",11,24,", #-16]!") + TEST_RP( "strdvc r",12,VAL2,", r13, [r",11,24,", #-16]!") TEST_RP( "strd r",2, VAL1,", [r",4, 24,"], #48") TEST_RP( "strd r",10,VAL2,", [r",9, 64,"], #-48") TEST_RP( "strd r",6, VAL1,", [r",13,TEST_MEMORY_SIZE,", #-"__stringify(MAX_STACK_SIZE)"]!") @@ -603,9 +603,9 @@ void kprobe_arm_test_cases(void) TEST_UNSUPPORTED(__inst_arm(0xe1efc3f0) " @ strd r12, [pc, #48]!") TEST_P( "ldrd r0, [r",0, 24,", #-8]") - TEST_P( "ldrhid r8, [r",13,0, ", #8]") + TEST_P( "ldrdhi r8, [r",13,0, ", #8]") TEST_P( "ldrd r4, [r",2, 24,", #16]!") - TEST_P( "ldrlsd r6, [r",11,24,", #-16]!") + TEST_P( "ldrdls r6, [r",11,24,", #-16]!") TEST_P( "ldrd r2, [r",5, 24,"], #48") TEST_P( "ldrd r10, [r",9,6,"], #-48") TEST_UNSUPPORTED(__inst_arm(0xe1efc3d0) " @ ldrd r12, [pc, #48]!") @@ -1084,63 +1084,63 @@ void kprobe_arm_test_cases(void) TEST_GROUP("Branch, branch with link, and block data transfer") TEST_P( "stmda r",0, 16*4,", {r0}") - TEST_P( "stmeqda r",4, 16*4,", {r0-r15}") - TEST_P( "stmneda r",8, 16*4,"!, {r8-r15}") + TEST_P( "stmdaeq r",4, 16*4,", {r0-r15}") + TEST_P( "stmdane r",8, 16*4,"!, {r8-r15}") TEST_P( "stmda r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_P( "stmda r",13,0, "!, {pc}") TEST_P( "ldmda r",0, 16*4,", {r0}") - TEST_BF_P("ldmcsda r",4, 15*4,", {r0-r15}") - TEST_BF_P("ldmccda r",7, 15*4,"!, {r8-r15}") + TEST_BF_P("ldmdacs r",4, 15*4,", {r0-r15}") + TEST_BF_P("ldmdacc r",7, 15*4,"!, {r8-r15}") TEST_P( "ldmda r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_BF_P("ldmda r",14,15*4,"!, {pc}") TEST_P( "stmia r",0, 16*4,", {r0}") - TEST_P( "stmmiia r",4, 16*4,", {r0-r15}") - TEST_P( "stmplia r",8, 16*4,"!, {r8-r15}") + TEST_P( "stmiami r",4, 16*4,", {r0-r15}") + TEST_P( "stmiapl r",8, 16*4,"!, {r8-r15}") TEST_P( "stmia r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_P( "stmia r",14,0, "!, {pc}") TEST_P( "ldmia r",0, 16*4,", {r0}") - TEST_BF_P("ldmvsia r",4, 0, ", {r0-r15}") - TEST_BF_P("ldmvcia r",7, 8*4, "!, {r8-r15}") + TEST_BF_P("ldmiavs r",4, 0, ", {r0-r15}") + TEST_BF_P("ldmiavc r",7, 8*4, "!, {r8-r15}") TEST_P( "ldmia r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_BF_P("ldmia r",14,15*4,"!, {pc}") TEST_P( "stmdb r",0, 16*4,", {r0}") - TEST_P( "stmhidb r",4, 16*4,", {r0-r15}") - TEST_P( "stmlsdb r",8, 16*4,"!, {r8-r15}") + TEST_P( "stmdbhi r",4, 16*4,", {r0-r15}") + TEST_P( "stmdbls r",8, 16*4,"!, {r8-r15}") TEST_P( "stmdb r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_P( "stmdb r",13,4, "!, {pc}") TEST_P( "ldmdb r",0, 16*4,", {r0}") - TEST_BF_P("ldmgedb r",4, 16*4,", {r0-r15}") - TEST_BF_P("ldmltdb r",7, 16*4,"!, {r8-r15}") + TEST_BF_P("ldmdbge r",4, 16*4,", {r0-r15}") + TEST_BF_P("ldmdblt r",7, 16*4,"!, {r8-r15}") TEST_P( "ldmdb r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_BF_P("ldmdb r",14,16*4,"!, {pc}") TEST_P( "stmib r",0, 16*4,", {r0}") - TEST_P( "stmgtib r",4, 16*4,", {r0-r15}") - TEST_P( "stmleib r",8, 16*4,"!, {r8-r15}") + TEST_P( "stmibgt r",4, 16*4,", {r0-r15}") + TEST_P( "stmible r",8, 16*4,"!, {r8-r15}") TEST_P( "stmib r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_P( "stmib r",13,-4, "!, {pc}") TEST_P( "ldmib r",0, 16*4,", {r0}") - TEST_BF_P("ldmeqib r",4, -4,", {r0-r15}") - TEST_BF_P("ldmneib r",7, 7*4,"!, {r8-r15}") + TEST_BF_P("ldmibeq r",4, -4,", {r0-r15}") + TEST_BF_P("ldmibne r",7, 7*4,"!, {r8-r15}") TEST_P( "ldmib r",12,16*4,"!, {r1,r3,r5,r7,r8-r11,r14}") TEST_BF_P("ldmib r",14,14*4,"!, {pc}") TEST_P( "stmdb r",13,16*4,"!, {r3-r12,lr}") - TEST_P( "stmeqdb r",13,16*4,"!, {r3-r12}") - TEST_P( "stmnedb r",2, 16*4,", {r3-r12,lr}") + TEST_P( "stmdbeq r",13,16*4,"!, {r3-r12}") + TEST_P( "stmdbne r",2, 16*4,", {r3-r12,lr}") TEST_P( "stmdb r",13,16*4,"!, {r2-r12,lr}") TEST_P( "stmdb r",0, 16*4,", {r0-r12}") TEST_P( "stmdb r",0, 16*4,", {r0-r12,lr}") TEST_BF_P("ldmia r",13,5*4, "!, {r3-r12,pc}") - TEST_P( "ldmccia r",13,5*4, "!, {r3-r12}") - TEST_BF_P("ldmcsia r",2, 5*4, "!, {r3-r12,pc}") + TEST_P( "ldmiacc r",13,5*4, "!, {r3-r12}") + TEST_BF_P("ldmiacs r",2, 5*4, "!, {r3-r12,pc}") TEST_BF_P("ldmia r",13,4*4, "!, {r2-r12,pc}") TEST_P( "ldmia r",0, 16*4,", {r0-r12}") TEST_P( "ldmia r",0, 16*4,", {r0-r12,lr}") @@ -1174,80 +1174,80 @@ void kprobe_arm_test_cases(void) #define TEST_COPROCESSOR(code) TEST_UNSUPPORTED(code) #define COPROCESSOR_INSTRUCTIONS_ST_LD(two,cc) \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13, #4]") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13, #-4]") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13, #4]!") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13, #-4]!") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13], #4") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13], #-4") \ - TEST_COPROCESSOR("stc"two" 0, cr0, [r13], {1}") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13, #4]") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13, #-4]") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13, #4]!") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13, #-4]!") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13], #4") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13], #-4") \ - TEST_COPROCESSOR("stc"two"l 0, cr0, [r13], {1}") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13, #4]") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13, #-4]") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13, #4]!") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13, #-4]!") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13], #4") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13], #-4") \ - TEST_COPROCESSOR("ldc"two" 0, cr0, [r13], {1}") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13, #4]") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13, #-4]") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13, #4]!") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13, #-4]!") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13], #4") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13], #-4") \ - TEST_COPROCESSOR("ldc"two"l 0, cr0, [r13], {1}") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13, #4]") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13, #-4]") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13, #4]!") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13, #-4]!") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13], #4") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13], #-4") \ + TEST_COPROCESSOR("stc"two" p0, cr0, [r13], {1}") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13, #4]") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13, #-4]") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13, #4]!") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13, #-4]!") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13], #4") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13], #-4") \ + TEST_COPROCESSOR("stc"two"l p0, cr0, [r13], {1}") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13, #4]") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13, #-4]") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13, #4]!") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13, #-4]!") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13], #4") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13], #-4") \ + TEST_COPROCESSOR("ldc"two" p0, cr0, [r13], {1}") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13, #4]") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13, #-4]") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13, #4]!") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13, #-4]!") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13], #4") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13], #-4") \ + TEST_COPROCESSOR("ldc"two"l p0, cr0, [r13], {1}") \ \ - TEST_COPROCESSOR( "stc"two" 0, cr0, [r15, #4]") \ - TEST_COPROCESSOR( "stc"two" 0, cr0, [r15, #-4]") \ + TEST_COPROCESSOR( "stc"two" p0, cr0, [r15, #4]") \ + TEST_COPROCESSOR( "stc"two" p0, cr0, [r15, #-4]") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##daf0001) " @ stc"two" 0, cr0, [r15, #4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##d2f0001) " @ stc"two" 0, cr0, [r15, #-4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##caf0001) " @ stc"two" 0, cr0, [r15], #4") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c2f0001) " @ stc"two" 0, cr0, [r15], #-4") \ - TEST_COPROCESSOR( "stc"two" 0, cr0, [r15], {1}") \ - TEST_COPROCESSOR( "stc"two"l 0, cr0, [r15, #4]") \ - TEST_COPROCESSOR( "stc"two"l 0, cr0, [r15, #-4]") \ + TEST_COPROCESSOR( "stc"two" p0, cr0, [r15], {1}") \ + TEST_COPROCESSOR( "stc"two"l p0, cr0, [r15, #4]") \ + TEST_COPROCESSOR( "stc"two"l p0, cr0, [r15, #-4]") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##def0001) " @ stc"two"l 0, cr0, [r15, #4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##d6f0001) " @ stc"two"l 0, cr0, [r15, #-4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##cef0001) " @ stc"two"l 0, cr0, [r15], #4") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c6f0001) " @ stc"two"l 0, cr0, [r15], #-4") \ - TEST_COPROCESSOR( "stc"two"l 0, cr0, [r15], {1}") \ - TEST_COPROCESSOR( "ldc"two" 0, cr0, [r15, #4]") \ - TEST_COPROCESSOR( "ldc"two" 0, cr0, [r15, #-4]") \ + TEST_COPROCESSOR( "stc"two"l p0, cr0, [r15], {1}") \ + TEST_COPROCESSOR( "ldc"two" p0, cr0, [r15, #4]") \ + TEST_COPROCESSOR( "ldc"two" p0, cr0, [r15, #-4]") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##dbf0001) " @ ldc"two" 0, cr0, [r15, #4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##d3f0001) " @ ldc"two" 0, cr0, [r15, #-4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##cbf0001) " @ ldc"two" 0, cr0, [r15], #4") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c3f0001) " @ ldc"two" 0, cr0, [r15], #-4") \ - TEST_COPROCESSOR( "ldc"two" 0, cr0, [r15], {1}") \ - TEST_COPROCESSOR( "ldc"two"l 0, cr0, [r15, #4]") \ - TEST_COPROCESSOR( "ldc"two"l 0, cr0, [r15, #-4]") \ + TEST_COPROCESSOR( "ldc"two" p0, cr0, [r15], {1}") \ + TEST_COPROCESSOR( "ldc"two"l p0, cr0, [r15, #4]") \ + TEST_COPROCESSOR( "ldc"two"l p0, cr0, [r15, #-4]") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##dff0001) " @ ldc"two"l 0, cr0, [r15, #4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##d7f0001) " @ ldc"two"l 0, cr0, [r15, #-4]!") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##cff0001) " @ ldc"two"l 0, cr0, [r15], #4") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c7f0001) " @ ldc"two"l 0, cr0, [r15], #-4") \ - TEST_COPROCESSOR( "ldc"two"l 0, cr0, [r15], {1}") + TEST_COPROCESSOR( "ldc"two"l p0, cr0, [r15], {1}") #define COPROCESSOR_INSTRUCTIONS_MC_MR(two,cc) \ \ - TEST_COPROCESSOR( "mcrr"two" 0, 15, r0, r14, cr0") \ - TEST_COPROCESSOR( "mcrr"two" 15, 0, r14, r0, cr15") \ + TEST_COPROCESSOR( "mcrr"two" p0, 15, r0, r14, cr0") \ + TEST_COPROCESSOR( "mcrr"two" p15, 0, r14, r0, cr15") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c4f00f0) " @ mcrr"two" 0, 15, r0, r15, cr0") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c40ff0f) " @ mcrr"two" 15, 0, r15, r0, cr15") \ - TEST_COPROCESSOR( "mrrc"two" 0, 15, r0, r14, cr0") \ - TEST_COPROCESSOR( "mrrc"two" 15, 0, r14, r0, cr15") \ + TEST_COPROCESSOR( "mrrc"two" p0, 15, r0, r14, cr0") \ + TEST_COPROCESSOR( "mrrc"two" p15, 0, r14, r0, cr15") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c5f00f0) " @ mrrc"two" 0, 15, r0, r15, cr0") \ TEST_UNSUPPORTED(__inst_arm(0x##cc##c50ff0f) " @ mrrc"two" 15, 0, r15, r0, cr15") \ - TEST_COPROCESSOR( "cdp"two" 15, 15, cr15, cr15, cr15, 7") \ - TEST_COPROCESSOR( "cdp"two" 0, 0, cr0, cr0, cr0, 0") \ - TEST_COPROCESSOR( "mcr"two" 15, 7, r15, cr15, cr15, 7") \ - TEST_COPROCESSOR( "mcr"two" 0, 0, r0, cr0, cr0, 0") \ - TEST_COPROCESSOR( "mrc"two" 15, 7, r15, cr15, cr15, 7") \ - TEST_COPROCESSOR( "mrc"two" 0, 0, r0, cr0, cr0, 0") + TEST_COPROCESSOR( "cdp"two" p15, 15, cr15, cr15, cr15, 7") \ + TEST_COPROCESSOR( "cdp"two" p0, 0, cr0, cr0, cr0, 0") \ + TEST_COPROCESSOR( "mcr"two" p15, 7, r15, cr15, cr15, 7") \ + TEST_COPROCESSOR( "mcr"two" p0, 0, r0, cr0, cr0, 0") \ + TEST_COPROCESSOR( "mrc"two" p15, 7, r14, cr15, cr15, 7") \ + TEST_COPROCESSOR( "mrc"two" p0, 0, r0, cr0, cr0, 0") COPROCESSOR_INSTRUCTIONS_ST_LD("",e) #if __LINUX_ARM_ARCH__ >= 5 diff --git a/arch/arm/probes/kprobes/test-core.h b/arch/arm/probes/kprobes/test-core.h index 19a5b2add41e..f1d5583e7bbb 100644 --- a/arch/arm/probes/kprobes/test-core.h +++ b/arch/arm/probes/kprobes/test-core.h @@ -108,6 +108,7 @@ struct test_arg_end { #define TESTCASE_START(title) \ __asm__ __volatile__ ( \ + ".syntax unified \n\t" \ "bl __kprobes_test_case_start \n\t" \ ".pushsection .rodata \n\t" \ "10: \n\t" \ From a506bd5756290821a4314f502b4bafc2afcf5260 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 18 Feb 2021 03:00:05 +0100 Subject: [PATCH 0209/1379] ARM: 9064/1: hw_breakpoint: Do not directly check the event's overflow_handler hook The commit 1879445dfa7b ("perf/core: Set event's default ::overflow_handler()") set a default event->overflow_handler in perf_event_alloc(), and replace the check event->overflow_handler with is_default_overflow_handler(), but one is missing. Currently, the bp->overflow_handler can not be NULL. As a result, enable_single_step() is always not invoked. Comments from Zhen Lei: https://patchwork.kernel.org/project/linux-arm-kernel/patch/20210207105934.2001-1-thunder.leizhen@huawei.com/ Fixes: 1879445dfa7b ("perf/core: Set event's default ::overflow_handler()") Signed-off-by: Zhen Lei Cc: Wang Nan Acked-by: Will Deacon Signed-off-by: Russell King --- arch/arm/kernel/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 08660ae9dcbc..b1423fb130ea 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs) info->trigger = addr; pr_debug("breakpoint fired: address = 0x%x\n", addr); perf_bp_event(bp, regs); - if (!bp->overflow_handler) + if (is_default_overflow_handler(bp)) enable_single_step(bp, addr); goto unlock; } From 8252ca87c7a2111502ee13994956f8c309faad7f Mon Sep 17 00:00:00 2001 From: "louis.wang" Date: Wed, 24 Feb 2021 13:25:53 +0100 Subject: [PATCH 0210/1379] ARM: 9066/1: ftrace: pause/unpause function graph tracer in cpu_suspend() Enabling function_graph tracer on ARM causes kernel panic, because the function graph tracer updates the "return address" of a function in order to insert a trace callback on function exit, it saves the function's original return address in a return trace stack, but cpu_suspend() may not return through the normal return path. cpu_suspend() will resume directly via the cpu_resume path, but the return trace stack has been set-up by the subfunctions of cpu_suspend(), which makes the "return address" inconsistent with cpu_suspend(). This patch refers to Commit de818bd4522c40ea02a81b387d2fa86f989c9623 ("arm64: kernel: pause/unpause function graph tracer in cpu_suspend()"), fixes the issue by pausing/resuming the function graph tracer on the thread executing cpu_suspend(), so that the function graph tracer state is kept consistent across functions that enter power down states and never return by effectively disabling graph tracer while they are executing. Signed-off-by: louis.wang Signed-off-by: Russell King --- arch/arm/kernel/suspend.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c index 24bd20564be7..43f0a3ebf390 100644 --- a/arch/arm/kernel/suspend.c +++ b/arch/arm/kernel/suspend.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include @@ -25,6 +26,13 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) if (!idmap_pgd) return -EINVAL; + /* + * Function graph tracer state gets incosistent when the kernel + * calls functions that never return (aka suspend finishers) hence + * disable graph tracing during their execution. + */ + pause_graph_tracing(); + /* * Provide a temporary page table with an identity mapping for * the MMU-enable code, required for resuming. On successful @@ -32,6 +40,9 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) * back to the correct page tables. */ ret = __cpu_suspend(arg, fn, __mpidr); + + unpause_graph_tracing(); + if (ret == 0) { cpu_switch_mm(mm->pgd, mm); local_flush_bp_all(); @@ -45,7 +56,13 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) { u32 __mpidr = cpu_logical_map(smp_processor_id()); - return __cpu_suspend(arg, fn, __mpidr); + int ret; + + pause_graph_tracing(); + ret = __cpu_suspend(arg, fn, __mpidr); + unpause_graph_tracing(); + + return ret; } #define idmap_pgd NULL #endif From a7ed7150f351177e46409cca15874101f95370cb Mon Sep 17 00:00:00 2001 From: Rahul Tanwar Date: Thu, 25 Mar 2021 11:35:10 +0100 Subject: [PATCH 0211/1379] leds: lgm: Improve Kconfig help Remove unnecessary Kconfig symbol LEDS_BLINK Improve Kconfig help text to make it more useful. Signed-off-by: Rahul Tanwar Signed-off-by: Pavel Machek --- drivers/leds/Kconfig | 5 ++--- drivers/leds/Makefile | 2 +- drivers/leds/blink/Kconfig | 36 +++++++++++++++++------------------- drivers/leds/blink/Makefile | 2 +- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index b6742b4231bf..4ca8cd594518 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -928,13 +928,12 @@ config LEDS_ACER_A500 This option enables support for the Power Button LED of Acer Iconia Tab A500. +source "drivers/leds/blink/Kconfig" + comment "Flash and Torch LED drivers" source "drivers/leds/flash/Kconfig" comment "LED Triggers" source "drivers/leds/trigger/Kconfig" -comment "LED Blink" -source "drivers/leds/blink/Kconfig" - endif # NEW_LEDS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index 2a698df9da57..7e604d3028c8 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -110,4 +110,4 @@ obj-$(CONFIG_LEDS_CLASS_FLASH) += flash/ obj-$(CONFIG_LEDS_TRIGGERS) += trigger/ # LED Blink -obj-$(CONFIG_LEDS_BLINK) += blink/ +obj-y += blink/ diff --git a/drivers/leds/blink/Kconfig b/drivers/leds/blink/Kconfig index 6dedc58c47b3..964c5037d9a5 100644 --- a/drivers/leds/blink/Kconfig +++ b/drivers/leds/blink/Kconfig @@ -1,21 +1,19 @@ -menuconfig LEDS_BLINK - bool "LED Blink support" - depends on LEDS_CLASS - help - This option enables blink support for the leds class. - If unsure, say Y. +config LEDS_LGM + tristate "LED support for LGM SoC series" + depends on GPIOLIB + depends on LEDS_CLASS + depends on MFD_SYSCON + depends on OF + help + This option enables support for LEDs connected to GPIO lines on + Lightning Mountain (LGM) SoC. Lightning Mountain is a AnyWAN + gateway-on-a-chip SoC to be shipped on mid and high end home + gateways and routers. -if LEDS_BLINK + These LEDs are driven by a Serial Shift Output (SSO) controller. + The driver supports hardware blinking and the LEDs can be configured + to be triggered by software/CPU or by hardware. -config LEDS_BLINK_LGM - tristate "LED support for Intel LGM SoC series" - depends on GPIOLIB - depends on LEDS_CLASS - depends on MFD_SYSCON - depends on OF - help - Parallel to serial conversion, which is also called SSO controller, - can drive external shift register for LED outputs. - This enables LED support for Serial Shift Output controller(SSO). - -endif # LEDS_BLINK + Say 'Y' here if you are working on LGM SoC based platform. Otherwise, + say 'N'. To compile this driver as a module, choose M here: the module + will be called leds-lgm-sso. diff --git a/drivers/leds/blink/Makefile b/drivers/leds/blink/Makefile index 2fa6c7b7b67e..fa5d04dccf13 100644 --- a/drivers/leds/blink/Makefile +++ b/drivers/leds/blink/Makefile @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_LEDS_BLINK_LGM) += leds-lgm-sso.o +obj-$(CONFIG_LEDS_LGM) += leds-lgm-sso.o From 0047eb9f0905f797df6dd33b8bcbff9c6b116eda Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Mar 2021 15:28:34 +0100 Subject: [PATCH 0212/1379] ARM: 9068/1: syscalls: switch to generic syscalltbl.sh Many architectures duplicate similar shell scripts. This commit converts ARM to use scripts/syscalltbl.sh. Signed-off-by: Masahiro Yamada Acked-by: Linus Walleij Signed-off-by: Russell King --- arch/arm/kernel/entry-common.S | 8 ++++---- arch/arm/tools/Makefile | 9 ++++----- arch/arm/tools/syscalltbl.sh | 22 ---------------------- 3 files changed, 8 insertions(+), 31 deletions(-) delete mode 100644 arch/arm/tools/syscalltbl.sh diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index e0d7833a1827..7f0b7aba1498 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -344,20 +344,19 @@ ENTRY(\sym) .size \sym, . - \sym .endm -#define NATIVE(nr, func) syscall nr, func +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) +#define __SYSCALL(nr, func) syscall nr, func /* * This is the syscall table declaration for native ABI syscalls. * With EABI a couple syscalls are obsolete and defined as sys_ni_syscall. */ syscall_table_start sys_call_table -#define COMPAT(nr, native, compat) syscall nr, native #ifdef CONFIG_AEABI #include #else #include #endif -#undef COMPAT syscall_table_end sys_call_table /*============================================================================ @@ -455,7 +454,8 @@ ENDPROC(sys_oabi_readahead) * using the compatibility syscall entries. */ syscall_table_start sys_oabi_call_table -#define COMPAT(nr, native, compat) syscall nr, compat +#undef __SYSCALL_WITH_COMPAT +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) #include syscall_table_end sys_oabi_call_table diff --git a/arch/arm/tools/Makefile b/arch/arm/tools/Makefile index 3654f979851b..d8be90f2bb41 100644 --- a/arch/arm/tools/Makefile +++ b/arch/arm/tools/Makefile @@ -10,7 +10,7 @@ kapi := $(gen)/asm uapi := $(gen)/uapi/asm syshdr := $(srctree)/$(src)/syscallhdr.sh sysnr := $(srctree)/$(src)/syscallnr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +systbl := $(srctree)/scripts/syscalltbl.sh syscall := $(src)/syscall.tbl gen-y := $(gen)/calls-oabi.S @@ -47,8 +47,7 @@ quiet_cmd_syshdr = SYSHDR $@ '__NR_SYSCALL_BASE' quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ - '$(systbl_abi_$(basetarget))' + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ quiet_cmd_sysnr = SYSNR $@ cmd_sysnr = $(CONFIG_SHELL) '$(sysnr)' '$<' '$@' \ @@ -70,10 +69,10 @@ sysnr_abi_unistd-nr := common,oabi,eabi,compat $(kapi)/unistd-nr.h: $(syscall) $(sysnr) FORCE $(call if_changed,sysnr) -systbl_abi_calls-oabi := common,oabi +$(gen)/calls-oabi.S: abis := common,oabi $(gen)/calls-oabi.S: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abi_calls-eabi := common,eabi +$(gen)/calls-eabi.S: abis := common,eabi $(gen)/calls-eabi.S: $(syscall) $(systbl) FORCE $(call if_changed,systbl) diff --git a/arch/arm/tools/syscalltbl.sh b/arch/arm/tools/syscalltbl.sh deleted file mode 100644 index ae7e93cfbfd3..000000000000 --- a/arch/arm/tools/syscalltbl.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` - -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - while read nr abi name entry compat; do - if [ "$abi" = "eabi" -a -n "$compat" ]; then - echo "$in: error: a compat entry for an EABI syscall ($name) makes no sense" >&2 - exit 1 - fi - - if [ -n "$entry" ]; then - if [ -z "$compat" ]; then - echo "NATIVE($nr, $entry)" - else - echo "COMPAT($nr, $entry, $compat)" - fi - fi - done -) > "$out" From 32e9a0d5ffaffe035f99188602d328665c43f38f Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Mar 2021 15:29:03 +0100 Subject: [PATCH 0213/1379] ARM: 9067/1: syscalls: switch to generic syscallhdr.sh Many architectures duplicate similar shell scripts. This commit converts ARM to use scripts/syscallhdr.sh, and also collects OABI/OEBI syscalls into unistd-eabi.h/unistd-oabi.h, removing unistd-common.h. Signed-off-by: Masahiro Yamada Signed-off-by: Russell King --- arch/arm/include/uapi/asm/Kbuild | 1 - arch/arm/include/uapi/asm/unistd.h | 1 - arch/arm/tools/Makefile | 17 +++++----------- arch/arm/tools/syscallhdr.sh | 31 ------------------------------ 4 files changed, 5 insertions(+), 45 deletions(-) delete mode 100644 arch/arm/tools/syscallhdr.sh diff --git a/arch/arm/include/uapi/asm/Kbuild b/arch/arm/include/uapi/asm/Kbuild index ce8573157774..63748af8bc9d 100644 --- a/arch/arm/include/uapi/asm/Kbuild +++ b/arch/arm/include/uapi/asm/Kbuild @@ -1,6 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -generated-y += unistd-common.h generated-y += unistd-oabi.h generated-y += unistd-eabi.h generic-y += kvm_para.h diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index 93ecf8aa4fe5..ae7749e15726 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -24,7 +24,6 @@ #include #endif -#include #define __NR_sync_file_range2 __NR_arm_sync_file_range /* diff --git a/arch/arm/tools/Makefile b/arch/arm/tools/Makefile index d8be90f2bb41..87de1f63f649 100644 --- a/arch/arm/tools/Makefile +++ b/arch/arm/tools/Makefile @@ -8,7 +8,7 @@ gen := arch/$(ARCH)/include/generated kapi := $(gen)/asm uapi := $(gen)/uapi/asm -syshdr := $(srctree)/$(src)/syscallhdr.sh +syshdr := $(srctree)/scripts/syscallhdr.sh sysnr := $(srctree)/$(src)/syscallnr.sh systbl := $(srctree)/scripts/syscalltbl.sh syscall := $(src)/syscall.tbl @@ -17,7 +17,6 @@ gen-y := $(gen)/calls-oabi.S gen-y += $(gen)/calls-eabi.S kapi-hdrs-y := $(kapi)/unistd-nr.h kapi-hdrs-y += $(kapi)/mach-types.h -uapi-hdrs-y := $(uapi)/unistd-common.h uapi-hdrs-y += $(uapi)/unistd-oabi.h uapi-hdrs-y += $(uapi)/unistd-eabi.h @@ -41,10 +40,8 @@ $(kapi)/mach-types.h: $(src)/gen-mach-types $(src)/mach-types FORCE $(call if_changed,gen_mach) quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abi_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '__NR_SYSCALL_BASE' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --abis $(abis) \ + --offset __NR_SYSCALL_BASE $< $@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ @@ -53,15 +50,11 @@ quiet_cmd_sysnr = SYSNR $@ cmd_sysnr = $(CONFIG_SHELL) '$(sysnr)' '$<' '$@' \ '$(syshdr_abi_$(basetarget))' -syshdr_abi_unistd-common := common -$(uapi)/unistd-common.h: $(syscall) $(syshdr) FORCE - $(call if_changed,syshdr) - -syshdr_abi_unistd-oabi := oabi +$(uapi)/unistd-oabi.h: abis := common,oabi $(uapi)/unistd-oabi.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abi_unistd-eabi := eabi +$(uapi)/unistd-eabi.h: abis := common,eabi $(uapi)/unistd-eabi.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) diff --git a/arch/arm/tools/syscallhdr.sh b/arch/arm/tools/syscallhdr.sh deleted file mode 100644 index 6b2f25cdd721..000000000000 --- a/arch/arm/tools/syscallhdr.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_ASM_ARM_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -if echo $out | grep -q uapi; then - fileguard="_UAPI$fileguard" -fi -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - echo "#ifndef ${fileguard}" - echo "#define ${fileguard} 1" - echo "" - - while read nr abi name entry ; do - if [ -z "$offset" ]; then - echo "#define __NR_${prefix}${name} $nr" - else - echo "#define __NR_${prefix}${name} ($offset + $nr)" - fi - done - - echo "" - echo "#endif /* ${fileguard} */" -) > "$out" From 8f50743feedd9a4d322322ef1d91426401e98e10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sun, 21 Mar 2021 15:24:12 -0700 Subject: [PATCH 0214/1379] Input: max8997 - simplify open coding of a division using up to 64 divisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The for loop is just a complicate way to express a division. Replace it by the actual division which is both simpler to understand for a human and more efficient for a CPU to calculate. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20210316212233.50765-1-uwe@kleine-koenig.org Signed-off-by: Dmitry Torokhov --- drivers/input/misc/max8997_haptic.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/input/misc/max8997_haptic.c b/drivers/input/misc/max8997_haptic.c index 20ff087b8a44..cd5e99ec1d3c 100644 --- a/drivers/input/misc/max8997_haptic.c +++ b/drivers/input/misc/max8997_haptic.c @@ -61,15 +61,10 @@ static int max8997_haptic_set_duty_cycle(struct max8997_haptic *chip) unsigned int duty = chip->pwm_period * chip->level / 100; ret = pwm_config(chip->pwm, duty, chip->pwm_period); } else { - int i; u8 duty_index = 0; - for (i = 0; i <= 64; i++) { - if (chip->level <= i * 100 / 64) { - duty_index = i; - break; - } - } + duty_index = DIV_ROUND_UP(chip->level * 64, 100); + switch (chip->internal_mode_pattern) { case 0: max8997_write_reg(chip->client, From 5b0e6fd8c505ec8a01e0ea5d78f2b707c91cf2c4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 21 Mar 2021 15:44:44 -0700 Subject: [PATCH 0215/1379] dt-bindings: input: atmel,maxtouch: add wakeup-source The touchscreen can be a wake up source and it's being used in DTS: arch/arm/boot/dts/exynos5250-spring.dt.yaml: trackpad@4b: 'wakeup-source' does not match any of the regexes: 'pinctrl-[0-9]+' Signed-off-by: Krzysztof Kozlowski Reviewed-by: Linus Walleij Acked-by: Rob Herring Link: https://lore.kernel.org/r/20210212163806.69996-1-krzk@kernel.org Signed-off-by: Dmitry Torokhov --- Documentation/devicetree/bindings/input/atmel,maxtouch.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/input/atmel,maxtouch.yaml b/Documentation/devicetree/bindings/input/atmel,maxtouch.yaml index e6b03a1e7c30..3ec579d63570 100644 --- a/Documentation/devicetree/bindings/input/atmel,maxtouch.yaml +++ b/Documentation/devicetree/bindings/input/atmel,maxtouch.yaml @@ -80,6 +80,9 @@ properties: - 2 # ATMEL_MXT_WAKEUP_GPIO default: 0 + wakeup-source: + type: boolean + required: - compatible - reg From 6484e7581732d2785fc754f598f26fd4239b03c6 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Sun, 21 Mar 2021 16:01:14 -0700 Subject: [PATCH 0216/1379] Input: rotary-encoder - update docs according to the latest API changes The old device property API is about to be removed, so explaing how to use complete software nodes instead. Signed-off-by: Heikki Krogerus Link: https://lore.kernel.org/r/20210304090948.27014-1-heikki.krogerus@linux.intel.com Signed-off-by: Dmitry Torokhov --- Documentation/input/devices/rotary-encoder.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Documentation/input/devices/rotary-encoder.rst b/Documentation/input/devices/rotary-encoder.rst index 810ae02bdaa0..5865748c13b9 100644 --- a/Documentation/input/devices/rotary-encoder.rst +++ b/Documentation/input/devices/rotary-encoder.rst @@ -107,13 +107,17 @@ example below: }, }; - static const struct property_entry rotary_encoder_properties[] __initconst = { + static const struct property_entry rotary_encoder_properties[] = { PROPERTY_ENTRY_U32("rotary-encoder,steps-per-period", 24), PROPERTY_ENTRY_U32("linux,axis", ABS_X), PROPERTY_ENTRY_U32("rotary-encoder,relative_axis", 0), { }, }; + static const struct software_node rotary_encoder_node = { + .properties = rotary_encoder_properties, + }; + static struct platform_device rotary_encoder_device = { .name = "rotary-encoder", .id = 0, @@ -122,7 +126,7 @@ example below: ... gpiod_add_lookup_table(&rotary_encoder_gpios); - device_add_properties(&rotary_encoder_device, rotary_encoder_properties); + device_add_software_node(&rotary_encoder_device.dev, &rotary_encoder_node); platform_device_register(&rotary_encoder_device); ... From 36a8fc6fa230eb936385884391cac80420cd0e6f Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 21 Mar 2021 16:06:23 -0700 Subject: [PATCH 0217/1379] Input: gpio-keys - remove extra call to input_sync The input_sync() function is already called after the loop in gpio_keys_report_state(), so it does not need to be called after each iteration within gpio_keys_gpio_report_event(). Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20210307222240.380583-1-paul@crapouillou.net Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index 77bac4ddf324..7fcb2c35c5cc 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -373,7 +373,6 @@ static void gpio_keys_gpio_report_event(struct gpio_button_data *bdata) } else { input_event(input, type, *bdata->code, state); } - input_sync(input); } static void gpio_keys_gpio_work_func(struct work_struct *work) @@ -382,6 +381,7 @@ static void gpio_keys_gpio_work_func(struct work_struct *work) container_of(work, struct gpio_button_data, work.work); gpio_keys_gpio_report_event(bdata); + input_sync(bdata->input); if (bdata->button->wakeup) pm_relax(bdata->input->dev.parent); From 019002f20cb5b9f78d39360aff244265d035e08a Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 21 Mar 2021 16:06:34 -0700 Subject: [PATCH 0218/1379] Input: gpio-keys - use hrtimer for release timer Dealing with input, timing is important; if the button should be released in one millisecond, then it should be done in one millisecond and not a hundred milliseconds. Therefore, the standard timer API is not really suitable for this task. Convert the gpio-keys driver to use a hrtimer instead of the standard timer to address this issue. Note that by using a hard IRQ for the hrtimer callback, we can get rid of the spin_lock_irqsave() and spin_unlock_irqrestore(). Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20210307222240.380583-2-paul@crapouillou.net Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index 7fcb2c35c5cc..4b92f49decef 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -36,7 +37,7 @@ struct gpio_button_data { unsigned short *code; - struct timer_list release_timer; + struct hrtimer release_timer; unsigned int release_delay; /* in msecs, for IRQ-only buttons */ struct delayed_work work; @@ -146,7 +147,7 @@ static void gpio_keys_disable_button(struct gpio_button_data *bdata) if (bdata->gpiod) cancel_delayed_work_sync(&bdata->work); else - del_timer_sync(&bdata->release_timer); + hrtimer_cancel(&bdata->release_timer); bdata->disabled = true; } @@ -415,19 +416,20 @@ static irqreturn_t gpio_keys_gpio_isr(int irq, void *dev_id) return IRQ_HANDLED; } -static void gpio_keys_irq_timer(struct timer_list *t) +static enum hrtimer_restart gpio_keys_irq_timer(struct hrtimer *t) { - struct gpio_button_data *bdata = from_timer(bdata, t, release_timer); + struct gpio_button_data *bdata = container_of(t, + struct gpio_button_data, + release_timer); struct input_dev *input = bdata->input; - unsigned long flags; - spin_lock_irqsave(&bdata->lock, flags); if (bdata->key_pressed) { input_event(input, EV_KEY, *bdata->code, 0); input_sync(input); bdata->key_pressed = false; } - spin_unlock_irqrestore(&bdata->lock, flags); + + return HRTIMER_NORESTART; } static irqreturn_t gpio_keys_irq_isr(int irq, void *dev_id) @@ -457,8 +459,9 @@ static irqreturn_t gpio_keys_irq_isr(int irq, void *dev_id) } if (bdata->release_delay) - mod_timer(&bdata->release_timer, - jiffies + msecs_to_jiffies(bdata->release_delay)); + hrtimer_start(&bdata->release_timer, + ms_to_ktime(bdata->release_delay), + HRTIMER_MODE_REL_HARD); out: spin_unlock_irqrestore(&bdata->lock, flags); return IRQ_HANDLED; @@ -471,7 +474,7 @@ static void gpio_keys_quiesce_key(void *data) if (bdata->gpiod) cancel_delayed_work_sync(&bdata->work); else - del_timer_sync(&bdata->release_timer); + hrtimer_cancel(&bdata->release_timer); } static int gpio_keys_setup_key(struct platform_device *pdev, @@ -595,7 +598,9 @@ static int gpio_keys_setup_key(struct platform_device *pdev, } bdata->release_delay = button->debounce_interval; - timer_setup(&bdata->release_timer, gpio_keys_irq_timer, 0); + hrtimer_init(&bdata->release_timer, + CLOCK_REALTIME, HRTIMER_MODE_REL_HARD); + bdata->release_timer.function = gpio_keys_irq_timer; isr = gpio_keys_irq_isr; irqflags = 0; From c9efb0ba281e88e2faec6ad919be509b6ab8ead6 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Sun, 21 Mar 2021 16:09:29 -0700 Subject: [PATCH 0219/1379] Input: gpio-keys - use hrtimer for software debounce, if possible We want to be able to report the input event as soon as the debounce delay elapsed. However, the current code does not really ensure that, as it uses the jiffies-based schedule_delayed_work() API. With a small enough HZ value (HZ <= 100), this results in some input events being lost, when a key is quickly pressed then released (on a human's time scale). Switching to hrtimers fixes this issue, and will work even on extremely low HZ values (tested at HZ=24). This is however only possible if reading the GPIO is possible without sleeping. If this condition is not met, the previous approach of using a jiffies-based timer is taken. Signed-off-by: Paul Cercueil Link: https://lore.kernel.org/r/20210307222240.380583-3-paul@crapouillou.net Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys.c | 66 +++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index 4b92f49decef..fe8fc76ee22e 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -41,6 +41,7 @@ struct gpio_button_data { unsigned int release_delay; /* in msecs, for IRQ-only buttons */ struct delayed_work work; + struct hrtimer debounce_timer; unsigned int software_debounce; /* in msecs, for GPIO-driven buttons */ unsigned int irq; @@ -49,6 +50,7 @@ struct gpio_button_data { bool disabled; bool key_pressed; bool suspended; + bool debounce_use_hrtimer; }; struct gpio_keys_drvdata { @@ -144,10 +146,10 @@ static void gpio_keys_disable_button(struct gpio_button_data *bdata) */ disable_irq(bdata->irq); - if (bdata->gpiod) - cancel_delayed_work_sync(&bdata->work); - else + if (bdata->debounce_use_hrtimer) hrtimer_cancel(&bdata->release_timer); + else + cancel_delayed_work_sync(&bdata->work); bdata->disabled = true; } @@ -361,7 +363,9 @@ static void gpio_keys_gpio_report_event(struct gpio_button_data *bdata) unsigned int type = button->type ?: EV_KEY; int state; - state = gpiod_get_value_cansleep(bdata->gpiod); + state = bdata->debounce_use_hrtimer ? + gpiod_get_value(bdata->gpiod) : + gpiod_get_value_cansleep(bdata->gpiod); if (state < 0) { dev_err(input->dev.parent, "failed to get gpio state: %d\n", state); @@ -376,11 +380,8 @@ static void gpio_keys_gpio_report_event(struct gpio_button_data *bdata) } } -static void gpio_keys_gpio_work_func(struct work_struct *work) +static void gpio_keys_debounce_event(struct gpio_button_data *bdata) { - struct gpio_button_data *bdata = - container_of(work, struct gpio_button_data, work.work); - gpio_keys_gpio_report_event(bdata); input_sync(bdata->input); @@ -388,6 +389,24 @@ static void gpio_keys_gpio_work_func(struct work_struct *work) pm_relax(bdata->input->dev.parent); } +static void gpio_keys_gpio_work_func(struct work_struct *work) +{ + struct gpio_button_data *bdata = + container_of(work, struct gpio_button_data, work.work); + + gpio_keys_debounce_event(bdata); +} + +static enum hrtimer_restart gpio_keys_debounce_timer(struct hrtimer *t) +{ + struct gpio_button_data *bdata = + container_of(t, struct gpio_button_data, debounce_timer); + + gpio_keys_debounce_event(bdata); + + return HRTIMER_NORESTART; +} + static irqreturn_t gpio_keys_gpio_isr(int irq, void *dev_id) { struct gpio_button_data *bdata = dev_id; @@ -409,9 +428,15 @@ static irqreturn_t gpio_keys_gpio_isr(int irq, void *dev_id) } } - mod_delayed_work(system_wq, - &bdata->work, - msecs_to_jiffies(bdata->software_debounce)); + if (bdata->debounce_use_hrtimer) { + hrtimer_start(&bdata->debounce_timer, + ms_to_ktime(bdata->software_debounce), + HRTIMER_MODE_REL); + } else { + mod_delayed_work(system_wq, + &bdata->work, + msecs_to_jiffies(bdata->software_debounce)); + } return IRQ_HANDLED; } @@ -471,10 +496,10 @@ static void gpio_keys_quiesce_key(void *data) { struct gpio_button_data *bdata = data; - if (bdata->gpiod) - cancel_delayed_work_sync(&bdata->work); + if (bdata->debounce_use_hrtimer) + hrtimer_cancel(&bdata->debounce_timer); else - hrtimer_cancel(&bdata->release_timer); + cancel_delayed_work_sync(&bdata->work); } static int gpio_keys_setup_key(struct platform_device *pdev, @@ -546,6 +571,14 @@ static int gpio_keys_setup_key(struct platform_device *pdev, if (error < 0) bdata->software_debounce = button->debounce_interval; + + /* + * If reading the GPIO won't sleep, we can use a + * hrtimer instead of a standard timer for the software + * debounce, to reduce the latency as much as possible. + */ + bdata->debounce_use_hrtimer = + !gpiod_cansleep(bdata->gpiod); } if (button->irq) { @@ -564,6 +597,10 @@ static int gpio_keys_setup_key(struct platform_device *pdev, INIT_DELAYED_WORK(&bdata->work, gpio_keys_gpio_work_func); + hrtimer_init(&bdata->debounce_timer, + CLOCK_REALTIME, HRTIMER_MODE_REL); + bdata->debounce_timer.function = gpio_keys_debounce_timer; + isr = gpio_keys_gpio_isr; irqflags = IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING; @@ -598,6 +635,7 @@ static int gpio_keys_setup_key(struct platform_device *pdev, } bdata->release_delay = button->debounce_interval; + bdata->debounce_use_hrtimer = true; hrtimer_init(&bdata->release_timer, CLOCK_REALTIME, HRTIMER_MODE_REL_HARD); bdata->release_timer.function = gpio_keys_irq_timer; From 4c976acb47bd4262ebf469698d26e1b8f4a338b4 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sun, 21 Mar 2021 20:31:36 -0700 Subject: [PATCH 0220/1379] Input: silead - fix a typo s/subsytem/subsystem/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20210322022030.3857089-1-unixbhaskar@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/silead.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/silead.c b/drivers/input/touchscreen/silead.c index 8fa2f3b7cfd8..32725d7422de 100644 --- a/drivers/input/touchscreen/silead.c +++ b/drivers/input/touchscreen/silead.c @@ -486,7 +486,7 @@ static int silead_ts_probe(struct i2c_client *client, silead_ts_read_props(client); - /* We must have the IRQ provided by DT or ACPI subsytem */ + /* We must have the IRQ provided by DT or ACPI subsystem */ if (client->irq <= 0) return -ENODEV; From 0cdd2e906cf321e9a736b94d22e6603f6f515ee8 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Sun, 21 Mar 2021 21:00:24 -0700 Subject: [PATCH 0221/1379] Input: iqs5xx - update vendor's URL Replace 'http' with 'https' and correct the spelling of the nearby word 'datasheet'. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20210313191236.4366-2-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/iqs5xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/iqs5xx.c b/drivers/input/touchscreen/iqs5xx.c index 54f30038dca4..403e251a5e7d 100644 --- a/drivers/input/touchscreen/iqs5xx.c +++ b/drivers/input/touchscreen/iqs5xx.c @@ -8,7 +8,7 @@ * made available by the vendor. Firmware files may be pushed to the device's * nonvolatile memory by writing the filename to the 'fw_file' sysfs control. * - * Link to PC-based configuration tool and data sheet: http://www.azoteq.com/ + * Link to PC-based configuration tool and datasheet: https://www.azoteq.com/ */ #include From 40c3efdc0b77d3f5298c9ce4fcb029da30f887e5 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Sun, 21 Mar 2021 21:01:35 -0700 Subject: [PATCH 0222/1379] Input: iqs5xx - optimize axis definition and validation Set the maximum ABS_MT_PRESSURE value and use the existing U16_MAX definition instead of a magic number to validate ABS_MT_POSITION_X and ABS_MT_POSITION_Y. Also use input_set_abs_params() rather than input_abs_set_max() to avoid having to call input_set_capability() separately. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20210313191236.4366-3-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/iqs5xx.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/input/touchscreen/iqs5xx.c b/drivers/input/touchscreen/iqs5xx.c index 403e251a5e7d..2a4e048f1400 100644 --- a/drivers/input/touchscreen/iqs5xx.c +++ b/drivers/input/touchscreen/iqs5xx.c @@ -32,7 +32,6 @@ #define IQS5XX_NUM_RETRIES 10 #define IQS5XX_NUM_CONTACTS 5 #define IQS5XX_WR_BYTES_MAX 2 -#define IQS5XX_XY_RES_MAX 0xFFFE #define IQS5XX_PROD_NUM_IQS550 40 #define IQS5XX_PROD_NUM_IQS572 58 @@ -504,10 +503,6 @@ static int iqs5xx_axis_init(struct i2c_client *client) input->open = iqs5xx_open; input->close = iqs5xx_close; - input_set_capability(input, EV_ABS, ABS_MT_POSITION_X); - input_set_capability(input, EV_ABS, ABS_MT_POSITION_Y); - input_set_capability(input, EV_ABS, ABS_MT_PRESSURE); - input_set_drvdata(input, iqs5xx); iqs5xx->input = input; } @@ -520,26 +515,29 @@ static int iqs5xx_axis_init(struct i2c_client *client) if (error) return error; - input_abs_set_max(iqs5xx->input, ABS_MT_POSITION_X, max_x); - input_abs_set_max(iqs5xx->input, ABS_MT_POSITION_Y, max_y); + input_set_abs_params(iqs5xx->input, ABS_MT_POSITION_X, 0, max_x, 0, 0); + input_set_abs_params(iqs5xx->input, ABS_MT_POSITION_Y, 0, max_y, 0, 0); + input_set_abs_params(iqs5xx->input, ABS_MT_PRESSURE, 0, U16_MAX, 0, 0); touchscreen_parse_properties(iqs5xx->input, true, prop); - if (prop->max_x > IQS5XX_XY_RES_MAX) { - dev_err(&client->dev, "Invalid maximum x-coordinate: %u > %u\n", - prop->max_x, IQS5XX_XY_RES_MAX); + /* + * The device reserves 0xFFFF for coordinates that correspond to slots + * which are not in a state of touch. + */ + if (prop->max_x >= U16_MAX || prop->max_y >= U16_MAX) { + dev_err(&client->dev, "Invalid touchscreen size: %u*%u\n", + prop->max_x, prop->max_y); return -EINVAL; - } else if (prop->max_x != max_x) { + } + + if (prop->max_x != max_x) { error = iqs5xx_write_word(client, IQS5XX_X_RES, prop->max_x); if (error) return error; } - if (prop->max_y > IQS5XX_XY_RES_MAX) { - dev_err(&client->dev, "Invalid maximum y-coordinate: %u > %u\n", - prop->max_y, IQS5XX_XY_RES_MAX); - return -EINVAL; - } else if (prop->max_y != max_y) { + if (prop->max_y != max_y) { error = iqs5xx_write_word(client, IQS5XX_Y_RES, prop->max_y); if (error) return error; From 509c0083132bdca505a17140bc98a8365bf4e6ca Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Sun, 21 Mar 2021 21:02:11 -0700 Subject: [PATCH 0223/1379] Input: iqs5xx - expose firmware revision to user space Add the read-only 'fw_info' attribute which reports information about the device's firmware in the following format: a.b.c.d:e.f Where: a = Product number (e.g. 40 for IQS550) b = Project number (e.g. 15) c = Firmware revision (major) d = Firmware revision (minor) e = Customer-assigned exported file version (major) f = Customer-assigned exported file version (minor) As part of the corresponding rework to uses of 'bl_status', the IQS5XX_BL_STATUS_RESET definition is dropped with 0 used in its place instead. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20210313191236.4366-4-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/iqs5xx.c | 56 +++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/drivers/input/touchscreen/iqs5xx.c b/drivers/input/touchscreen/iqs5xx.c index 2a4e048f1400..f36d170e14b2 100644 --- a/drivers/input/touchscreen/iqs5xx.c +++ b/drivers/input/touchscreen/iqs5xx.c @@ -63,6 +63,7 @@ #define IQS5XX_SYS_CFG1 0x058F #define IQS5XX_X_RES 0x066E #define IQS5XX_Y_RES 0x0670 +#define IQS5XX_EXP_FILE 0x0677 #define IQS5XX_CHKSM 0x83C0 #define IQS5XX_APP 0x8400 #define IQS5XX_CSTM 0xBE00 @@ -86,22 +87,12 @@ #define IQS5XX_BL_CMD_CRC 0x03 #define IQS5XX_BL_BLK_LEN_MAX 64 #define IQS5XX_BL_ID 0x0200 -#define IQS5XX_BL_STATUS_RESET 0x00 #define IQS5XX_BL_STATUS_AVAIL 0xA5 #define IQS5XX_BL_STATUS_NONE 0xEE #define IQS5XX_BL_CRC_PASS 0x00 #define IQS5XX_BL_CRC_FAIL 0x01 #define IQS5XX_BL_ATTEMPTS 3 -struct iqs5xx_private { - struct i2c_client *client; - struct input_dev *input; - struct gpio_desc *reset_gpio; - struct touchscreen_properties prop; - struct mutex lock; - u8 bl_status; -}; - struct iqs5xx_dev_id_info { __be16 prod_num; __be16 proj_num; @@ -133,6 +124,16 @@ struct iqs5xx_status { struct iqs5xx_touch_data touch_data[IQS5XX_NUM_CONTACTS]; } __packed; +struct iqs5xx_private { + struct i2c_client *client; + struct input_dev *input; + struct gpio_desc *reset_gpio; + struct touchscreen_properties prop; + struct mutex lock; + struct iqs5xx_dev_id_info dev_id_info; + u8 exp_file[2]; +}; + static int iqs5xx_read_burst(struct i2c_client *client, u16 reg, void *val, u16 len) { @@ -445,7 +446,7 @@ static int iqs5xx_set_state(struct i2c_client *client, u8 state) struct iqs5xx_private *iqs5xx = i2c_get_clientdata(client); int error1, error2; - if (iqs5xx->bl_status == IQS5XX_BL_STATUS_RESET) + if (!iqs5xx->dev_id_info.bl_status) return 0; mutex_lock(&iqs5xx->lock); @@ -615,6 +616,11 @@ static int iqs5xx_dev_init(struct i2c_client *client) return -EINVAL; } + error = iqs5xx_read_burst(client, IQS5XX_EXP_FILE, + iqs5xx->exp_file, sizeof(iqs5xx->exp_file)); + if (error) + return error; + error = iqs5xx_axis_init(client); if (error) return error; @@ -638,7 +644,7 @@ static int iqs5xx_dev_init(struct i2c_client *client) if (error) return error; - iqs5xx->bl_status = dev_id_info->bl_status; + iqs5xx->dev_id_info = *dev_id_info; /* * The following delay allows ATI to complete before the open and close @@ -664,7 +670,7 @@ static irqreturn_t iqs5xx_irq(int irq, void *data) * RDY output during bootloader mode. If the device operates outside of * bootloader mode, the input device is guaranteed to be allocated. */ - if (iqs5xx->bl_status == IQS5XX_BL_STATUS_RESET) + if (!iqs5xx->dev_id_info.bl_status) return IRQ_NONE; error = iqs5xx_read_burst(client, IQS5XX_SYS_INFO0, @@ -853,7 +859,7 @@ static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file) int error, error_bl = 0; u8 *pmap; - if (iqs5xx->bl_status == IQS5XX_BL_STATUS_NONE) + if (iqs5xx->dev_id_info.bl_status == IQS5XX_BL_STATUS_NONE) return -EPERM; pmap = kzalloc(IQS5XX_PMAP_LEN, GFP_KERNEL); @@ -873,7 +879,7 @@ static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file) */ disable_irq(client->irq); - iqs5xx->bl_status = IQS5XX_BL_STATUS_RESET; + iqs5xx->dev_id_info.bl_status = 0; error = iqs5xx_bl_cmd(client, IQS5XX_BL_CMD_VER, 0); if (error) { @@ -906,7 +912,7 @@ err_reset: error_bl = error; error = iqs5xx_dev_init(client); - if (!error && iqs5xx->bl_status == IQS5XX_BL_STATUS_RESET) + if (!error && !iqs5xx->dev_id_info.bl_status) error = -EINVAL; enable_irq(client->irq); @@ -966,10 +972,28 @@ static ssize_t fw_file_store(struct device *dev, return count; } +static ssize_t fw_info_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct iqs5xx_private *iqs5xx = dev_get_drvdata(dev); + + if (!iqs5xx->dev_id_info.bl_status) + return -ENODATA; + + return scnprintf(buf, PAGE_SIZE, "%u.%u.%u.%u:%u.%u\n", + be16_to_cpu(iqs5xx->dev_id_info.prod_num), + be16_to_cpu(iqs5xx->dev_id_info.proj_num), + iqs5xx->dev_id_info.major_ver, + iqs5xx->dev_id_info.minor_ver, + iqs5xx->exp_file[0], iqs5xx->exp_file[1]); +} + static DEVICE_ATTR_WO(fw_file); +static DEVICE_ATTR_RO(fw_info); static struct attribute *iqs5xx_attrs[] = { &dev_attr_fw_file.attr, + &dev_attr_fw_info.attr, NULL, }; From e7d8e88aec888d4053f4b2be573ab63a39313f83 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Sun, 21 Mar 2021 21:03:16 -0700 Subject: [PATCH 0224/1379] Input: iqs5xx - remove superfluous revision validation The vendor-assigned firmware project number is restricted to the generic project number (15); however the vendor may assign other project numbers to specific applications and customers. These custom project numbers may be based on forwards-compatible firmware revision 1.x. However, the driver unnecessarily rejects anything older than firmware revision 2.0. To support other applications, remove these unnecessarily strict checks and enter the bootloader only for truly incompatible A000 devices. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20210313191236.4366-5-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/iqs5xx.c | 40 ++++++------------------------ 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/drivers/input/touchscreen/iqs5xx.c b/drivers/input/touchscreen/iqs5xx.c index f36d170e14b2..0920516124c7 100644 --- a/drivers/input/touchscreen/iqs5xx.c +++ b/drivers/input/touchscreen/iqs5xx.c @@ -36,9 +36,6 @@ #define IQS5XX_PROD_NUM_IQS550 40 #define IQS5XX_PROD_NUM_IQS572 58 #define IQS5XX_PROD_NUM_IQS525 52 -#define IQS5XX_PROJ_NUM_A000 0 -#define IQS5XX_PROJ_NUM_B000 15 -#define IQS5XX_MAJOR_VER_MIN 2 #define IQS5XX_SHOW_RESET BIT(7) #define IQS5XX_ACK_RESET BIT(7) @@ -87,7 +84,6 @@ #define IQS5XX_BL_CMD_CRC 0x03 #define IQS5XX_BL_BLK_LEN_MAX 64 #define IQS5XX_BL_ID 0x0200 -#define IQS5XX_BL_STATUS_AVAIL 0xA5 #define IQS5XX_BL_STATUS_NONE 0xEE #define IQS5XX_BL_CRC_PASS 0x00 #define IQS5XX_BL_CRC_FAIL 0x01 @@ -573,7 +569,7 @@ static int iqs5xx_dev_init(struct i2c_client *client) * the missing zero is prepended). */ buf[0] = 0; - dev_id_info = (struct iqs5xx_dev_id_info *)&buf[(buf[1] > 0) ? 0 : 1]; + dev_id_info = (struct iqs5xx_dev_id_info *)&buf[buf[1] ? 0 : 1]; switch (be16_to_cpu(dev_id_info->prod_num)) { case IQS5XX_PROD_NUM_IQS550: @@ -586,34 +582,14 @@ static int iqs5xx_dev_init(struct i2c_client *client) return -EINVAL; } - switch (be16_to_cpu(dev_id_info->proj_num)) { - case IQS5XX_PROJ_NUM_A000: - dev_err(&client->dev, "Unsupported project number: %u\n", - be16_to_cpu(dev_id_info->proj_num)); + /* + * With the product number recognized yet shifted by one byte, open the + * bootloader and wait for user space to convert the A000 device into a + * B000 device via new firmware. + */ + if (buf[1]) { + dev_err(&client->dev, "Opening bootloader for A000 device\n"); return iqs5xx_bl_open(client); - case IQS5XX_PROJ_NUM_B000: - break; - default: - dev_err(&client->dev, "Unrecognized project number: %u\n", - be16_to_cpu(dev_id_info->proj_num)); - return -EINVAL; - } - - if (dev_id_info->major_ver < IQS5XX_MAJOR_VER_MIN) { - dev_err(&client->dev, "Unsupported major version: %u\n", - dev_id_info->major_ver); - return iqs5xx_bl_open(client); - } - - switch (dev_id_info->bl_status) { - case IQS5XX_BL_STATUS_AVAIL: - case IQS5XX_BL_STATUS_NONE: - break; - default: - dev_err(&client->dev, - "Unrecognized bootloader status: 0x%02X\n", - dev_id_info->bl_status); - return -EINVAL; } error = iqs5xx_read_burst(client, IQS5XX_EXP_FILE, From 95a6d961401d7e7e4cdd15c5c454b335d71dd0b5 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Sun, 21 Mar 2021 21:04:17 -0700 Subject: [PATCH 0225/1379] Input: iqs5xx - close bootloader using hardware reset The bootloader can be closed using the 'execute' command (0x02) or hardware reset. Rather than using the former option for successful firmware update procedures and reserving the latter for recovering the device upon failure, simply use hardware reset for all cases. The post-bootloader initialization delay increases marginally when triggered by a hardware reset, so increase the wait time to ensure the device does not subsequently fail to respond. As part of this change, refactor the return path to avoid an extra assignment and to make the logic a bit smaller. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20210313191236.4366-6-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/iqs5xx.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/drivers/input/touchscreen/iqs5xx.c b/drivers/input/touchscreen/iqs5xx.c index 0920516124c7..a990c176abf7 100644 --- a/drivers/input/touchscreen/iqs5xx.c +++ b/drivers/input/touchscreen/iqs5xx.c @@ -832,7 +832,7 @@ static int iqs5xx_fw_file_parse(struct i2c_client *client, static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file) { struct iqs5xx_private *iqs5xx = i2c_get_clientdata(client); - int error, error_bl = 0; + int error, error_init = 0; u8 *pmap; if (iqs5xx->dev_id_info.bl_status == IQS5XX_BL_STATUS_NONE) @@ -875,21 +875,14 @@ static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file) error = iqs5xx_bl_verify(client, IQS5XX_CSTM, pmap + IQS5XX_CHKSM_LEN + IQS5XX_APP_LEN, IQS5XX_CSTM_LEN); - if (error) - goto err_reset; - - error = iqs5xx_bl_cmd(client, IQS5XX_BL_CMD_EXEC, 0); err_reset: - if (error) { - iqs5xx_reset(client); - usleep_range(10000, 10100); - } + iqs5xx_reset(client); + usleep_range(15000, 15100); - error_bl = error; - error = iqs5xx_dev_init(client); - if (!error && !iqs5xx->dev_id_info.bl_status) - error = -EINVAL; + error_init = iqs5xx_dev_init(client); + if (!iqs5xx->dev_id_info.bl_status) + error_init = error_init ? : -EINVAL; enable_irq(client->irq); @@ -898,10 +891,7 @@ err_reset: err_kfree: kfree(pmap); - if (error_bl) - return error_bl; - - return error; + return error ? : error_init; } static ssize_t fw_file_store(struct device *dev, From b6621f72cc88ef5ed8341bea8104a0f5a72d07a2 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 22 Mar 2021 14:55:05 -0700 Subject: [PATCH 0226/1379] Input: wacom_i2c - do not force interrupt trigger Instead of forcing interrupt trigger to "level low" rely on the platform to set it up according to how it is wired on the given board. Reviewed-by: Alistair Francis Link: https://lore.kernel.org/r/20210321220043.318239-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/wacom_i2c.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/input/touchscreen/wacom_i2c.c b/drivers/input/touchscreen/wacom_i2c.c index 1afc6bde2891..609ff84e7693 100644 --- a/drivers/input/touchscreen/wacom_i2c.c +++ b/drivers/input/touchscreen/wacom_i2c.c @@ -195,8 +195,7 @@ static int wacom_i2c_probe(struct i2c_client *client, input_set_drvdata(input, wac_i2c); error = request_threaded_irq(client->irq, NULL, wacom_i2c_irq, - IRQF_TRIGGER_LOW | IRQF_ONESHOT, - "wacom_i2c", wac_i2c); + IRQF_ONESHOT, "wacom_i2c", wac_i2c); if (error) { dev_err(&client->dev, "Failed to enable IRQ, error: %d\n", error); From c75cf86201e37c2dd6b8077ed6de2776471f5be5 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 22 Mar 2021 14:55:28 -0700 Subject: [PATCH 0227/1379] Input: wacom_i2c - switch to using managed resources This simplifies error unwinding path and allows us to get rid of remove() method. Reviewed-by: Alistair Francis Link: https://lore.kernel.org/r/20210321220043.318239-2-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/wacom_i2c.c | 55 +++++++++------------------ 1 file changed, 17 insertions(+), 38 deletions(-) diff --git a/drivers/input/touchscreen/wacom_i2c.c b/drivers/input/touchscreen/wacom_i2c.c index 609ff84e7693..22826c387da5 100644 --- a/drivers/input/touchscreen/wacom_i2c.c +++ b/drivers/input/touchscreen/wacom_i2c.c @@ -145,15 +145,16 @@ static void wacom_i2c_close(struct input_dev *dev) } static int wacom_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id) + const struct i2c_device_id *id) { + struct device *dev = &client->dev; struct wacom_i2c *wac_i2c; struct input_dev *input; struct wacom_features features = { 0 }; int error; if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { - dev_err(&client->dev, "i2c_check_functionality error\n"); + dev_err(dev, "i2c_check_functionality error\n"); return -EIO; } @@ -161,21 +162,22 @@ static int wacom_i2c_probe(struct i2c_client *client, if (error) return error; - wac_i2c = kzalloc(sizeof(*wac_i2c), GFP_KERNEL); - input = input_allocate_device(); - if (!wac_i2c || !input) { - error = -ENOMEM; - goto err_free_mem; - } + wac_i2c = devm_kzalloc(dev, sizeof(*wac_i2c), GFP_KERNEL); + if (!wac_i2c) + return -ENOMEM; wac_i2c->client = client; + + input = devm_input_allocate_device(dev); + if (!input) + return -ENOMEM; + wac_i2c->input = input; input->name = "Wacom I2C Digitizer"; input->id.bustype = BUS_I2C; input->id.vendor = 0x56a; input->id.version = features.fw_version; - input->dev.parent = &client->dev; input->open = wacom_i2c_open; input->close = wacom_i2c_close; @@ -194,12 +196,11 @@ static int wacom_i2c_probe(struct i2c_client *client, input_set_drvdata(input, wac_i2c); - error = request_threaded_irq(client->irq, NULL, wacom_i2c_irq, - IRQF_ONESHOT, "wacom_i2c", wac_i2c); + error = devm_request_threaded_irq(dev, client->irq, NULL, wacom_i2c_irq, + IRQF_ONESHOT, "wacom_i2c", wac_i2c); if (error) { - dev_err(&client->dev, - "Failed to enable IRQ, error: %d\n", error); - goto err_free_mem; + dev_err(dev, "Failed to request IRQ: %d\n", error); + return error; } /* Disable the IRQ, we'll enable it in wac_i2c_open() */ @@ -207,31 +208,10 @@ static int wacom_i2c_probe(struct i2c_client *client, error = input_register_device(wac_i2c->input); if (error) { - dev_err(&client->dev, - "Failed to register input device, error: %d\n", error); - goto err_free_irq; + dev_err(dev, "Failed to register input device: %d\n", error); + return error; } - i2c_set_clientdata(client, wac_i2c); - return 0; - -err_free_irq: - free_irq(client->irq, wac_i2c); -err_free_mem: - input_free_device(input); - kfree(wac_i2c); - - return error; -} - -static int wacom_i2c_remove(struct i2c_client *client) -{ - struct wacom_i2c *wac_i2c = i2c_get_clientdata(client); - - free_irq(client->irq, wac_i2c); - input_unregister_device(wac_i2c->input); - kfree(wac_i2c); - return 0; } @@ -268,7 +248,6 @@ static struct i2c_driver wacom_i2c_driver = { }, .probe = wacom_i2c_probe, - .remove = wacom_i2c_remove, .id_table = wacom_i2c_id, }; module_i2c_driver(wacom_i2c_driver); From e28b5c8d0aaee116a0dd42c602fd667f8ffe2629 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 22 Mar 2021 16:42:25 -0700 Subject: [PATCH 0228/1379] Input: touchscreen - move helper functions to core Some devices outside of drivers/input/touchscreen/ can still make use of the touchscreen helper functions. Therefore, it was agreed in [1] to move them outside of drivers/input/touchscreen/ so that other devices can call them without INPUT_TOUCHSCREEN being set. As part of this change, 'of' is dropped from the filename because the helpers no longer actually use OF. No changes are made to the file contents whatsoever. Based on the feedback in [2], the corresponding binding documents (touchscreen.yaml and touchscreen.txt) are left in their original locations. [1] https://patchwork.kernel.org/patch/11924029/ [2] https://patchwork.kernel.org/patch/12042037/ Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20210301234928.4298-2-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/Makefile | 1 + drivers/input/{touchscreen/of_touchscreen.c => touchscreen.c} | 0 drivers/input/touchscreen/Kconfig | 4 ---- drivers/input/touchscreen/Makefile | 1 - 4 files changed, 1 insertion(+), 5 deletions(-) rename drivers/input/{touchscreen/of_touchscreen.c => touchscreen.c} (100%) diff --git a/drivers/input/Makefile b/drivers/input/Makefile index d8f5310e22ba..037cc595106c 100644 --- a/drivers/input/Makefile +++ b/drivers/input/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_INPUT) += input-core.o input-core-y := input.o input-compat.o input-mt.o input-poller.o ff-core.o +input-core-y += touchscreen.o obj-$(CONFIG_INPUT_FF_MEMLESS) += ff-memless.o obj-$(CONFIG_INPUT_SPARSEKMAP) += sparse-keymap.o diff --git a/drivers/input/touchscreen/of_touchscreen.c b/drivers/input/touchscreen.c similarity index 100% rename from drivers/input/touchscreen/of_touchscreen.c rename to drivers/input/touchscreen.c diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index 529614d364fe..aead3ad6ba6a 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -12,10 +12,6 @@ menuconfig INPUT_TOUCHSCREEN if INPUT_TOUCHSCREEN -config TOUCHSCREEN_PROPERTIES - def_tristate INPUT - depends on INPUT - config TOUCHSCREEN_88PM860X tristate "Marvell 88PM860x touchscreen" depends on MFD_88PM860X diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index 6233541e9173..80cd241b4c1b 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -7,7 +7,6 @@ wm97xx-ts-y := wm97xx-core.o -obj-$(CONFIG_TOUCHSCREEN_PROPERTIES) += of_touchscreen.o obj-$(CONFIG_TOUCHSCREEN_88PM860X) += 88pm860x-ts.o obj-$(CONFIG_TOUCHSCREEN_AD7877) += ad7877.o obj-$(CONFIG_TOUCHSCREEN_AD7879) += ad7879.o From 51e01fc04f1285b0e515a5262fc58682565d859c Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 22 Mar 2021 16:43:00 -0700 Subject: [PATCH 0229/1379] Input: touchscreen - broaden use-cases described in comments Now that the helper functions have been moved to drivers/input/ so that all input devices may use them, the introductory comments can be updated to remove any implication that the helper functions are solely limited to touchscreens. This patch also scrubs any remaining use of 'DT' since there isn't any actual dependency on OF. A minor spelling error is resolved as well ('setups' -> 'sets up'). Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20210301234928.4298-3-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/input/touchscreen.c b/drivers/input/touchscreen.c index 97342e14b4f1..dd18cb917c4d 100644 --- a/drivers/input/touchscreen.c +++ b/drivers/input/touchscreen.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Generic DT helper functions for touchscreen devices + * Generic helper functions for touchscreens and other two-dimensional + * pointing devices * * Copyright (c) 2014 Sebastian Reichel */ @@ -37,7 +38,7 @@ static void touchscreen_set_params(struct input_dev *dev, if (!test_bit(axis, dev->absbit)) { dev_warn(&dev->dev, - "DT specifies parameters but the axis %lu is not set up\n", + "Parameters are specified but the axis %lu is not set up\n", axis); return; } @@ -49,7 +50,7 @@ static void touchscreen_set_params(struct input_dev *dev, } /** - * touchscreen_parse_properties - parse common touchscreen DT properties + * touchscreen_parse_properties - parse common touchscreen properties * @input: input device that should be parsed * @multitouch: specifies whether parsed properties should be applied to * single-touch or multi-touch axes @@ -57,9 +58,9 @@ static void touchscreen_set_params(struct input_dev *dev, * axis swap and invert info for use with touchscreen_report_x_y(); * or %NULL * - * This function parses common DT properties for touchscreens and setups the + * This function parses common properties for touchscreens and sets up the * input device accordingly. The function keeps previously set up default - * values if no value is specified via DT. + * values if no value is specified. */ void touchscreen_parse_properties(struct input_dev *input, bool multitouch, struct touchscreen_properties *prop) @@ -203,4 +204,4 @@ void touchscreen_report_pos(struct input_dev *input, EXPORT_SYMBOL(touchscreen_report_pos); MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("Device-tree helpers functions for touchscreen devices"); +MODULE_DESCRIPTION("Helper functions for touchscreens and other devices"); From a8f1f0dc865cd52e71bf083fb3414d35724d9b48 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 22 Mar 2021 16:43:12 -0700 Subject: [PATCH 0230/1379] dt-bindings: input: Add bindings for Azoteq IQS626A This patch adds device tree bindings for the Azoteq IQS626A capacitive touch controller. Signed-off-by: Jeff LaBundy Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20210301234928.4298-4-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/iqs626a.yaml | 843 ++++++++++++++++++ 1 file changed, 843 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/iqs626a.yaml diff --git a/Documentation/devicetree/bindings/input/iqs626a.yaml b/Documentation/devicetree/bindings/input/iqs626a.yaml new file mode 100644 index 000000000000..0cb736c541c9 --- /dev/null +++ b/Documentation/devicetree/bindings/input/iqs626a.yaml @@ -0,0 +1,843 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/iqs626a.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Azoteq IQS626A Capacitive Touch Controller + +maintainers: + - Jeff LaBundy + +description: | + The Azoteq IQS626A is a 14-channel capacitive touch controller that features + additional Hall-effect and inductive sensing capabilities. + + Link to datasheet: https://www.azoteq.com/ + +allOf: + - $ref: touchscreen/touchscreen.yaml# + +properties: + compatible: + const: azoteq,iqs626a + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + "#address-cells": + const: 1 + + "#size-cells": + const: 0 + + azoteq,suspend-mode: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: | + Specifies the power mode during suspend as follows: + 0: Automatic (same as normal runtime, i.e. suspend/resume disabled) + 1: Low power (all sensing at a reduced reporting rate) + 2: Ultra-low power (ULP channel proximity sensing) + 3: Halt (no sensing) + + azoteq,clk-div: + type: boolean + description: Divides the device's core clock by a factor of 4. + + azoteq,ulp-enable: + type: boolean + description: + Permits the device to automatically enter ultra-low-power mode from low- + power mode. + + azoteq,ulp-update: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3, 4, 5, 6, 7] + default: 3 + description: | + Specifies the rate at which the trackpad, generic and Hall channels are + updated during ultra-low-power mode as follows: + 0: 8 + 1: 13 + 2: 28 + 3: 54 + 4: 89 + 5: 135 + 6: 190 + 7: 256 + + azoteq,ati-band-disable: + type: boolean + description: Disables the ATI band check. + + azoteq,ati-lp-only: + type: boolean + description: Limits automatic ATI to low-power mode. + + azoteq,gpio3-select: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3, 4, 5, 6, 7] + default: 1 + description: | + Selects the channel or group of channels for which the GPIO3 pin + represents touch state as follows: + 0: None + 1: ULP channel + 2: Trackpad + 3: Trackpad + 4: Generic channel 0 + 5: Generic channel 1 + 6: Generic channel 2 + 7: Hall channel + + azoteq,reseed-select: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: | + Specifies the event(s) that prompt the device to reseed (i.e. reset the + long-term average) of an associated channel as follows: + 0: None + 1: Proximity + 2: Proximity or touch + 3: Proximity, touch or deep touch + + azoteq,thresh-extend: + type: boolean + description: Multiplies all touch and deep-touch thresholds by 4. + + azoteq,tracking-enable: + type: boolean + description: + Enables all associated channels to track their respective reference + channels. + + azoteq,reseed-offset: + type: boolean + description: + Applies an 8-count offset to all long-term averages upon either ATI or + reseed events. + + azoteq,rate-np-ms: + minimum: 0 + maximum: 255 + default: 150 + description: Specifies the report rate (in ms) during normal-power mode. + + azoteq,rate-lp-ms: + minimum: 0 + maximum: 255 + default: 150 + description: Specifies the report rate (in ms) during low-power mode. + + azoteq,rate-ulp-ms: + multipleOf: 16 + minimum: 0 + maximum: 4080 + default: 0 + description: Specifies the report rate (in ms) during ultra-low-power mode. + + azoteq,timeout-pwr-ms: + multipleOf: 512 + minimum: 0 + maximum: 130560 + default: 2560 + description: + Specifies the length of time (in ms) to wait for an event before moving + from normal-power mode to low-power mode, or (if 'azoteq,ulp-enable' is + present) from low-power mode to ultra-low-power mode. + + azoteq,timeout-lta-ms: + multipleOf: 512 + minimum: 0 + maximum: 130560 + default: 40960 + description: + Specifies the length of time (in ms) to wait before resetting the long- + term average of all channels. Specify the maximum timeout to disable it + altogether. + + touchscreen-inverted-x: true + touchscreen-inverted-y: true + touchscreen-swapped-x-y: true + +patternProperties: + "^ulp-0|generic-[0-2]|hall$": + type: object + description: + Represents a single sensing channel. A channel is active if defined and + inactive otherwise. + + properties: + azoteq,ati-exclude: + type: boolean + description: + Prevents the channel from participating in an ATI event that is + manually triggered during initialization. + + azoteq,reseed-disable: + type: boolean + description: + Prevents the channel from being reseeded if the long-term average + timeout (defined in 'azoteq,timeout-lta') expires. + + azoteq,meas-cap-decrease: + type: boolean + description: + Decreases the internal measurement capacitance from 60 pF to 15 pF. + + azoteq,rx-inactive: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2] + default: 0 + description: | + Specifies how inactive CRX pins are to be terminated as follows: + 0: VSS + 1: Floating + 2: VREG (generic channels only) + + azoteq,linearize: + type: boolean + description: + Enables linearization of the channel's counts (generic and Hall + channels) or inverts the polarity of the channel's proximity or + touch states (ULP channel). + + azoteq,dual-direction: + type: boolean + description: + Specifies that the channel's long-term average is to freeze in the + presence of either increasing or decreasing counts, thereby permit- + ting events to be reported in either direction. + + azoteq,filt-disable: + type: boolean + description: Disables raw count filtering for the channel. + + azoteq,ati-mode: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + description: | + Specifies the channel's ATI mode as follows: + 0: Disabled + 1: Semi-partial + 2: Partial + 3: Full + + The default value is a function of the channel and the device's reset + user interface (RUI); reference the datasheet for further information + about the available RUI options. + + azoteq,ati-base: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [75, 100, 150, 200] + description: + Specifies the channel's ATI base. The default value is a function + of the channel and the device's RUI. + + azoteq,ati-target: + $ref: /schemas/types.yaml#/definitions/uint32 + multipleOf: 32 + minimum: 0 + maximum: 2016 + description: + Specifies the channel's ATI target. The default value is a function + of the channel and the device's RUI. + + azoteq,cct-increase: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 16 + default: 0 + description: + Specifies the degree to which the channel's charge cycle time is to + be increased, with 0 representing no increase. The maximum value is + limited to 4 in the case of the ULP channel, and the property is un- + available entirely in the case of the Hall channel. + + azoteq,proj-bias: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: | + Specifies the bias current applied during projected-capacitance + sensing as follows: + 0: 2.5 uA + 1: 5 uA + 2: 10 uA + 3: 20 uA + + This property is unavailable in the case of the Hall channel. + + azoteq,sense-freq: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + description: | + Specifies the channel's sensing frequency as follows (parenthesized + numbers represent the frequency if 'azoteq,clk-div' is present): + 0: 4 MHz (1 MHz) + 1: 2 MHz (500 kHz) + 2: 1 MHz (250 kHz) + 3: 500 kHz (125 kHz) + + This property is unavailable in the case of the Hall channel. The + default value is a function of the channel and the device's RUI. + + azoteq,ati-band-tighten: + type: boolean + description: + Tightens the ATI band from 1/8 to 1/16 of the desired target (ULP and + generic channels only). + + azoteq,proj-enable: + type: boolean + description: Enables projected-capacitance sensing (ULP channel only). + + azoteq,filt-str-np-cnt: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: + Specifies the raw count filter strength during normal-power mode (ULP + and generic channels only). + + azoteq,filt-str-lp-cnt: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: + Specifies the raw count filter strength during low-power mode (ULP and + generic channels only). + + azoteq,filt-str-np-lta: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: + Specifies the long-term average filter strength during normal-power + mode (ULP and generic channels only). + + azoteq,filt-str-lp-lta: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: + Specifies the long-term average filter strength during low-power mode + (ULP and generic channels only). + + azoteq,rx-enable: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 1 + maxItems: 8 + items: + minimum: 0 + maximum: 7 + description: + Specifies the CRX pin(s) associated with the channel. + + This property is unavailable in the case of the Hall channel. The + default value is a function of the channel and the device's RUI. + + azoteq,tx-enable: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 1 + maxItems: 8 + items: + minimum: 0 + maximum: 7 + description: + Specifies the TX pin(s) associated with the channel. + + This property is unavailable in the case of the Hall channel. The + default value is a function of the channel and the device's RUI. + + azoteq,local-cap-size: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3, 4] + default: 0 + description: | + Specifies the capacitance to be added to the channel as follows: + 0: 0 pF + 1: 0.5 pF + 2: 1.0 pF + 3: 1.5 pF + 4: 2.0 pF + + This property is unavailable in the case of the ULP or Hall channels. + + azoteq,sense-mode: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 8, 9, 12, 14, 15] + description: | + Specifies the channel's sensing mode as follows: + 0: Self capacitance + 1: Projected capacitance + 8: Self inductance + 9: Mutual inductance + 12: External + 14: Hall effect + 15: Temperature + + This property is unavailable in the case of the ULP or Hall channels. + The default value is a function of the channel and the device's RUI. + + azoteq,tx-freq: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: | + Specifies the inductive sensing excitation frequency as follows + (parenthesized numbers represent the frequency if 'azoteq,clk-div' + is present): + 0: 16 MHz (4 MHz) + 1: 8 MHz (2 MHz) + 2: 4 MHz (1 MHz) + 3: 2 MHz (500 kHz) + + This property is unavailable in the case of the ULP or Hall channels. + + azoteq,invert-enable: + type: boolean + description: + Inverts the polarity of the states reported for proximity, touch and + deep-touch events relative to their respective thresholds (generic + channels only). + + azoteq,comp-disable: + type: boolean + description: + Disables compensation for the channel (generic channels only). + + azoteq,static-enable: + type: boolean + description: + Enables the static front-end for the channel (generic channels only). + + azoteq,assoc-select: + $ref: /schemas/types.yaml#/definitions/string-array + minItems: 1 + maxItems: 6 + items: + enum: + - ulp-0 + - trackpad-3x2 + - trackpad-3x3 + - generic-0 + - generic-1 + - generic-2 + - hall + description: + Specifies the associated channels for which the channel serves as a + reference channel. By default, no channels are selected. This prop- + erty is only available for the generic channels. + + azoteq,assoc-weight: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 255 + default: 0 + description: + Specifies the channel's impact weight if it acts as an associated + channel (0 = 0% impact, 255 = 200% impact). This property is only + available for the generic channels. + + patternProperties: + "^event-(prox|touch|deep)(-alt)?$": + type: object + description: + Represents a proximity, touch or deep-touch event reported by the + channel in response to a decrease in counts. Node names suffixed with + '-alt' instead correspond to an increase in counts. + + By default, the long-term average tracks an increase in counts such + that only events corresponding to a decrease in counts are reported + (refer to the datasheet for more information). + + Specify 'azoteq,dual-direction' to freeze the long-term average when + the counts increase or decrease such that events of either direction + can be reported. Alternatively, specify 'azoteq,invert-enable' to in- + vert the polarity of the states reported by the channel. + + Complementary events (e.g. event-touch and event-touch-alt) can both + be present and specify different key or switch codes, but not differ- + ent thresholds or hysteresis (if applicable). + + Proximity events are unavailable in the case of the Hall channel, and + deep-touch events are only available for the generic channels. Unless + otherwise specified, default values are a function of the channel and + the device's RUI. + + properties: + azoteq,thresh: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 255 + description: Specifies the threshold for the event. + + azoteq,hyst: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 15 + description: + Specifies the hysteresis for the event (touch and deep-touch + events only). + + linux,code: + $ref: /schemas/types.yaml#/definitions/uint32 + description: Numeric key or switch code associated with the event. + + linux,input-type: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [1, 5] + description: + Specifies whether the event is to be interpreted as a key (1) or + a switch (5). By default, Hall-channel events are interpreted as + switches and all others are interpreted as keys. + + dependencies: + linux,input-type: ["linux,code"] + + additionalProperties: false + + dependencies: + azoteq,assoc-weight: ["azoteq,assoc-select"] + + additionalProperties: false + + "^trackpad-3x[2-3]$": + type: object + description: + Represents all channels associated with the trackpad. The channels are + collectively active if the trackpad is defined and inactive otherwise. + + properties: + azoteq,ati-exclude: + type: boolean + description: + Prevents the trackpad channels from participating in an ATI event + that is manually triggered during initialization. + + azoteq,reseed-disable: + type: boolean + description: + Prevents the trackpad channels from being reseeded if the long-term + average timeout (defined in 'azoteq,timeout-lta') expires. + + azoteq,meas-cap-decrease: + type: boolean + description: + Decreases the internal measurement capacitance from 60 pF to 15 pF. + + azoteq,rx-inactive: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1] + default: 0 + description: | + Specifies how inactive CRX pins are to be terminated as follows: + 0: VSS + 1: Floating + + azoteq,linearize: + type: boolean + description: Inverts the polarity of the trackpad's touch state. + + azoteq,dual-direction: + type: boolean + description: + Specifies that the trackpad's long-term averages are to freeze in + the presence of either increasing or decreasing counts, thereby + permitting events to be reported in either direction. + + azoteq,filt-disable: + type: boolean + description: Disables raw count filtering for the trackpad channels. + + azoteq,ati-mode: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: | + Specifies the trackpad's ATI mode as follows: + 0: Disabled + 1: Semi-partial + 2: Partial + 3: Full + + azoteq,ati-base: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 6 + maxItems: 9 + items: + minimum: 45 + maximum: 300 + default: [45, 45, 45, 45, 45, 45, 45, 45, 45] + description: Specifies each individual trackpad channel's ATI base. + + azoteq,ati-target: + $ref: /schemas/types.yaml#/definitions/uint32 + multipleOf: 32 + minimum: 0 + maximum: 2016 + default: 0 + description: Specifies the trackpad's ATI target. + + azoteq,cct-increase: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 4 + default: 0 + description: + Specifies the degree to which the trackpad's charge cycle time is to + be increased, with 0 representing no increase. + + azoteq,proj-bias: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: | + Specifies the bias current applied during projected-capacitance + sensing as follows: + 0: 2.5 uA + 1: 5 uA + 2: 10 uA + 3: 20 uA + + azoteq,sense-freq: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: | + Specifies the trackpad's sensing frequency as follows (parenthesized + numbers represent the frequency if 'azoteq,clk-div' is present): + 0: 4 MHz (1 MHz) + 1: 2 MHz (500 kHz) + 2: 1 MHz (250 kHz) + 3: 500 kHz (125 kHz) + + azoteq,ati-band-tighten: + type: boolean + description: + Tightens the ATI band from 1/8 to 1/16 of the desired target. + + azoteq,thresh: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 6 + maxItems: 9 + items: + minimum: 0 + maximum: 255 + default: [0, 0, 0, 0, 0, 0, 0, 0, 0] + description: + Specifies each individual trackpad channel's touch threshold. + + azoteq,hyst: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 15 + default: 0 + description: Specifies the trackpad's touch hysteresis. + + azoteq,lta-update: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3, 4, 5, 6, 7] + default: 0 + description: | + Specifies the update rate of the trackpad's long-term average during + ultra-low-power mode as follows: + 0: 2 + 1: 4 + 2: 8 + 3: 16 + 4: 32 + 5: 64 + 6: 128 + 7: 255 + + azoteq,filt-str-trackpad: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: Specifies the trackpad coordinate filter strength. + + azoteq,filt-str-np-cnt: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: + Specifies the raw count filter strength during normal-power mode. + + azoteq,filt-str-lp-cnt: + $ref: /schemas/types.yaml#/definitions/uint32 + enum: [0, 1, 2, 3] + default: 0 + description: + Specifies the raw count filter strength during low-power mode. + + linux,keycodes: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 1 + maxItems: 6 + description: | + Specifies the numeric keycodes associated with each available gesture + in the following order (enter 0 for unused gestures): + 0: Positive flick or swipe in X direction + 1: Negative flick or swipe in X direction + 2: Positive flick or swipe in Y direction + 3: Negative flick or swipe in Y direction + 4: Tap + 5: Hold + + azoteq,gesture-swipe: + type: boolean + description: + Directs the device to interpret axial gestures as a swipe (finger + remains on trackpad) instead of a flick (finger leaves trackpad). + + azoteq,timeout-tap-ms: + multipleOf: 16 + minimum: 0 + maximum: 4080 + default: 0 + description: + Specifies the length of time (in ms) within which a trackpad touch + must be released in order to be interpreted as a tap. + + azoteq,timeout-swipe-ms: + multipleOf: 16 + minimum: 0 + maximum: 4080 + default: 0 + description: + Specifies the length of time (in ms) within which an axial gesture + must be completed in order to be interpreted as a flick or swipe. + + azoteq,thresh-swipe: + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 255 + default: 0 + description: + Specifies the number of points across which an axial gesture must + travel in order to be interpreted as a flick or swipe. + + dependencies: + azoteq,gesture-swipe: ["linux,keycodes"] + azoteq,timeout-tap-ms: ["linux,keycodes"] + azoteq,timeout-swipe-ms: ["linux,keycodes"] + azoteq,thresh-swipe: ["linux,keycodes"] + + additionalProperties: false + +required: + - compatible + - reg + - interrupts + - "#address-cells" + - "#size-cells" + +additionalProperties: false + +examples: + - | + #include + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + iqs626a@44 { + #address-cells = <1>; + #size-cells = <0>; + + compatible = "azoteq,iqs626a"; + reg = <0x44>; + interrupt-parent = <&gpio>; + interrupts = <17 IRQ_TYPE_LEVEL_LOW>; + + azoteq,rate-np-ms = <16>; + azoteq,rate-lp-ms = <160>; + + azoteq,timeout-pwr-ms = <2560>; + azoteq,timeout-lta-ms = <32768>; + + ulp-0 { + azoteq,meas-cap-decrease; + + azoteq,ati-base = <75>; + azoteq,ati-target = <1024>; + + azoteq,rx-enable = <2>, <3>, <4>, + <5>, <6>, <7>; + + event-prox { + linux,code = ; + }; + }; + + trackpad-3x3 { + azoteq,filt-str-np-cnt = <1>; + azoteq,filt-str-lp-cnt = <1>; + + azoteq,hyst = <4>; + azoteq,thresh = <35>, <40>, <40>, + <38>, <33>, <38>, + <35>, <35>, <35>; + + azoteq,ati-mode = <3>; + azoteq,ati-base = <195>, <195>, <195>, + <195>, <195>, <195>, + <195>, <195>, <195>; + azoteq,ati-target = <512>; + + azoteq,proj-bias = <1>; + azoteq,sense-freq = <2>; + + linux,keycodes = , + , + , + , + , + ; + + azoteq,gesture-swipe; + azoteq,timeout-swipe-ms = <800>; + azoteq,timeout-tap-ms = <400>; + azoteq,thresh-swipe = <40>; + }; + + /* + * Preserve the default register settings for + * the temperature-tracking channel leveraged + * by reset user interface (RUI) 1. + * + * Scalar properties (e.g. ATI mode) are left + * untouched by simply omitting them; boolean + * properties must be specified explicitly as + * needed. + */ + generic-2 { + azoteq,reseed-disable; + azoteq,meas-cap-decrease; + azoteq,dual-direction; + azoteq,comp-disable; + azoteq,static-enable; + }; + + hall { + azoteq,reseed-disable; + azoteq,meas-cap-decrease; + + event-touch { + linux,code = ; + }; + }; + }; + }; + +... From f1d2809de97adc422967b6de59f0f6199769eb93 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 22 Mar 2021 16:43:58 -0700 Subject: [PATCH 0231/1379] Input: Add support for Azoteq IQS626A This patch adds support for the Azoteq IQS626A capacitive touch controller. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20210301234928.4298-5-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/misc/Kconfig | 11 + drivers/input/misc/Makefile | 1 + drivers/input/misc/iqs626a.c | 1838 ++++++++++++++++++++++++++++++++++ 3 files changed, 1850 insertions(+) create mode 100644 drivers/input/misc/iqs626a.c diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index ad1b6c90bc4d..bbab23a58c59 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -752,6 +752,17 @@ config INPUT_IQS269A To compile this driver as a module, choose M here: the module will be called iqs269a. +config INPUT_IQS626A + tristate "Azoteq IQS626A capacitive touch controller" + depends on I2C + select REGMAP_I2C + help + Say Y to enable support for the Azoteq IQS626A capacitive + touch controller. + + To compile this driver as a module, choose M here: the + module will be called iqs626a. + config INPUT_CMA3000 tristate "VTI CMA3000 Tri-axis accelerometer" help diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index 7f202ba8f775..034c80a7ffa1 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_INPUT_HISI_POWERKEY) += hisi_powerkey.o obj-$(CONFIG_HP_SDC_RTC) += hp_sdc_rtc.o obj-$(CONFIG_INPUT_IMS_PCU) += ims-pcu.o obj-$(CONFIG_INPUT_IQS269A) += iqs269a.o +obj-$(CONFIG_INPUT_IQS626A) += iqs626a.o obj-$(CONFIG_INPUT_IXP4XX_BEEPER) += ixp4xx-beeper.o obj-$(CONFIG_INPUT_KEYSPAN_REMOTE) += keyspan_remote.o obj-$(CONFIG_INPUT_KXTJ9) += kxtj9.o diff --git a/drivers/input/misc/iqs626a.c b/drivers/input/misc/iqs626a.c new file mode 100644 index 000000000000..d57e996732cf --- /dev/null +++ b/drivers/input/misc/iqs626a.c @@ -0,0 +1,1838 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Azoteq IQS626A Capacitive Touch Controller + * + * Copyright (C) 2020 Jeff LaBundy + * + * This driver registers up to 2 input devices: one representing capacitive or + * inductive keys as well as Hall-effect switches, and one for a trackpad that + * can express various gestures. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IQS626_VER_INFO 0x00 +#define IQS626_VER_INFO_PROD_NUM 0x51 + +#define IQS626_SYS_FLAGS 0x02 +#define IQS626_SYS_FLAGS_SHOW_RESET BIT(15) +#define IQS626_SYS_FLAGS_IN_ATI BIT(12) +#define IQS626_SYS_FLAGS_PWR_MODE_MASK GENMASK(9, 8) +#define IQS626_SYS_FLAGS_PWR_MODE_SHIFT 8 + +#define IQS626_HALL_OUTPUT 0x23 + +#define IQS626_SYS_SETTINGS 0x80 +#define IQS626_SYS_SETTINGS_CLK_DIV BIT(15) +#define IQS626_SYS_SETTINGS_ULP_AUTO BIT(14) +#define IQS626_SYS_SETTINGS_DIS_AUTO BIT(13) +#define IQS626_SYS_SETTINGS_PWR_MODE_MASK GENMASK(12, 11) +#define IQS626_SYS_SETTINGS_PWR_MODE_SHIFT 11 +#define IQS626_SYS_SETTINGS_PWR_MODE_MAX 3 +#define IQS626_SYS_SETTINGS_ULP_UPDATE_MASK GENMASK(10, 8) +#define IQS626_SYS_SETTINGS_ULP_UPDATE_SHIFT 8 +#define IQS626_SYS_SETTINGS_ULP_UPDATE_MAX 7 +#define IQS626_SYS_SETTINGS_EVENT_MODE BIT(5) +#define IQS626_SYS_SETTINGS_EVENT_MODE_LP BIT(4) +#define IQS626_SYS_SETTINGS_REDO_ATI BIT(2) +#define IQS626_SYS_SETTINGS_ACK_RESET BIT(0) + +#define IQS626_MISC_A_ATI_BAND_DISABLE BIT(7) +#define IQS626_MISC_A_TPx_LTA_UPDATE_MASK GENMASK(6, 4) +#define IQS626_MISC_A_TPx_LTA_UPDATE_SHIFT 4 +#define IQS626_MISC_A_TPx_LTA_UPDATE_MAX 7 +#define IQS626_MISC_A_ATI_LP_ONLY BIT(3) +#define IQS626_MISC_A_GPIO3_SELECT_MASK GENMASK(2, 0) +#define IQS626_MISC_A_GPIO3_SELECT_MAX 7 + +#define IQS626_EVENT_MASK_SYS BIT(6) +#define IQS626_EVENT_MASK_GESTURE BIT(3) +#define IQS626_EVENT_MASK_DEEP BIT(2) +#define IQS626_EVENT_MASK_TOUCH BIT(1) +#define IQS626_EVENT_MASK_PROX BIT(0) + +#define IQS626_RATE_NP_MS_MAX 255 +#define IQS626_RATE_LP_MS_MAX 255 +#define IQS626_RATE_ULP_MS_MAX 4080 +#define IQS626_TIMEOUT_PWR_MS_MAX 130560 +#define IQS626_TIMEOUT_LTA_MS_MAX 130560 + +#define IQS626_MISC_B_RESEED_UI_SEL_MASK GENMASK(7, 6) +#define IQS626_MISC_B_RESEED_UI_SEL_SHIFT 6 +#define IQS626_MISC_B_RESEED_UI_SEL_MAX 3 +#define IQS626_MISC_B_THRESH_EXTEND BIT(5) +#define IQS626_MISC_B_TRACKING_UI_ENABLE BIT(4) +#define IQS626_MISC_B_TPx_SWIPE BIT(3) +#define IQS626_MISC_B_RESEED_OFFSET BIT(2) +#define IQS626_MISC_B_FILT_STR_TPx GENMASK(1, 0) + +#define IQS626_THRESH_SWIPE_MAX 255 +#define IQS626_TIMEOUT_TAP_MS_MAX 4080 +#define IQS626_TIMEOUT_SWIPE_MS_MAX 4080 + +#define IQS626_CHx_ENG_0_MEAS_CAP_SIZE BIT(7) +#define IQS626_CHx_ENG_0_RX_TERM_VSS BIT(5) +#define IQS626_CHx_ENG_0_LINEARIZE BIT(4) +#define IQS626_CHx_ENG_0_DUAL_DIR BIT(3) +#define IQS626_CHx_ENG_0_FILT_DISABLE BIT(2) +#define IQS626_CHx_ENG_0_ATI_MODE_MASK GENMASK(1, 0) +#define IQS626_CHx_ENG_0_ATI_MODE_MAX 3 + +#define IQS626_CHx_ENG_1_CCT_HIGH_1 BIT(7) +#define IQS626_CHx_ENG_1_CCT_HIGH_0 BIT(6) +#define IQS626_CHx_ENG_1_PROJ_BIAS_MASK GENMASK(5, 4) +#define IQS626_CHx_ENG_1_PROJ_BIAS_SHIFT 4 +#define IQS626_CHx_ENG_1_PROJ_BIAS_MAX 3 +#define IQS626_CHx_ENG_1_CCT_ENABLE BIT(3) +#define IQS626_CHx_ENG_1_SENSE_FREQ_MASK GENMASK(2, 1) +#define IQS626_CHx_ENG_1_SENSE_FREQ_SHIFT 1 +#define IQS626_CHx_ENG_1_SENSE_FREQ_MAX 3 +#define IQS626_CHx_ENG_1_ATI_BAND_TIGHTEN BIT(0) + +#define IQS626_CHx_ENG_2_LOCAL_CAP_MASK GENMASK(7, 6) +#define IQS626_CHx_ENG_2_LOCAL_CAP_SHIFT 6 +#define IQS626_CHx_ENG_2_LOCAL_CAP_MAX 3 +#define IQS626_CHx_ENG_2_LOCAL_CAP_ENABLE BIT(5) +#define IQS626_CHx_ENG_2_SENSE_MODE_MASK GENMASK(3, 0) +#define IQS626_CHx_ENG_2_SENSE_MODE_MAX 15 + +#define IQS626_CHx_ENG_3_TX_FREQ_MASK GENMASK(5, 4) +#define IQS626_CHx_ENG_3_TX_FREQ_SHIFT 4 +#define IQS626_CHx_ENG_3_TX_FREQ_MAX 3 +#define IQS626_CHx_ENG_3_INV_LOGIC BIT(0) + +#define IQS626_CHx_ENG_4_RX_TERM_VREG BIT(6) +#define IQS626_CHx_ENG_4_CCT_LOW_1 BIT(5) +#define IQS626_CHx_ENG_4_CCT_LOW_0 BIT(4) +#define IQS626_CHx_ENG_4_COMP_DISABLE BIT(1) +#define IQS626_CHx_ENG_4_STATIC_ENABLE BIT(0) + +#define IQS626_TPx_ATI_BASE_MIN 45 +#define IQS626_TPx_ATI_BASE_MAX 300 +#define IQS626_CHx_ATI_BASE_MASK GENMASK(7, 6) +#define IQS626_CHx_ATI_BASE_75 0x00 +#define IQS626_CHx_ATI_BASE_100 0x40 +#define IQS626_CHx_ATI_BASE_150 0x80 +#define IQS626_CHx_ATI_BASE_200 0xC0 +#define IQS626_CHx_ATI_TARGET_MASK GENMASK(5, 0) +#define IQS626_CHx_ATI_TARGET_MAX 2016 + +#define IQS626_CHx_THRESH_MAX 255 +#define IQS626_CHx_HYST_DEEP_MASK GENMASK(7, 4) +#define IQS626_CHx_HYST_DEEP_SHIFT 4 +#define IQS626_CHx_HYST_TOUCH_MASK GENMASK(3, 0) +#define IQS626_CHx_HYST_MAX 15 + +#define IQS626_FILT_STR_NP_TPx_MASK GENMASK(7, 6) +#define IQS626_FILT_STR_NP_TPx_SHIFT 6 +#define IQS626_FILT_STR_LP_TPx_MASK GENMASK(5, 4) +#define IQS626_FILT_STR_LP_TPx_SHIFT 4 + +#define IQS626_FILT_STR_NP_CNT_MASK GENMASK(7, 6) +#define IQS626_FILT_STR_NP_CNT_SHIFT 6 +#define IQS626_FILT_STR_LP_CNT_MASK GENMASK(5, 4) +#define IQS626_FILT_STR_LP_CNT_SHIFT 4 +#define IQS626_FILT_STR_NP_LTA_MASK GENMASK(3, 2) +#define IQS626_FILT_STR_NP_LTA_SHIFT 2 +#define IQS626_FILT_STR_LP_LTA_MASK GENMASK(1, 0) +#define IQS626_FILT_STR_MAX 3 + +#define IQS626_ULP_PROJ_ENABLE BIT(4) +#define IQS626_GEN_WEIGHT_MAX 255 + +#define IQS626_MAX_REG 0xFF + +#define IQS626_NUM_CH_TP_3 9 +#define IQS626_NUM_CH_TP_2 6 +#define IQS626_NUM_CH_GEN 3 +#define IQS626_NUM_CRx_TX 8 + +#define IQS626_PWR_MODE_POLL_SLEEP_US 50000 +#define IQS626_PWR_MODE_POLL_TIMEOUT_US 500000 + +#define iqs626_irq_wait() usleep_range(350, 400) + +enum iqs626_ch_id { + IQS626_CH_ULP_0, + IQS626_CH_TP_2, + IQS626_CH_TP_3, + IQS626_CH_GEN_0, + IQS626_CH_GEN_1, + IQS626_CH_GEN_2, + IQS626_CH_HALL, +}; + +enum iqs626_rx_inactive { + IQS626_RX_INACTIVE_VSS, + IQS626_RX_INACTIVE_FLOAT, + IQS626_RX_INACTIVE_VREG, +}; + +enum iqs626_st_offs { + IQS626_ST_OFFS_PROX, + IQS626_ST_OFFS_DIR, + IQS626_ST_OFFS_TOUCH, + IQS626_ST_OFFS_DEEP, +}; + +enum iqs626_th_offs { + IQS626_TH_OFFS_PROX, + IQS626_TH_OFFS_TOUCH, + IQS626_TH_OFFS_DEEP, +}; + +enum iqs626_event_id { + IQS626_EVENT_PROX_DN, + IQS626_EVENT_PROX_UP, + IQS626_EVENT_TOUCH_DN, + IQS626_EVENT_TOUCH_UP, + IQS626_EVENT_DEEP_DN, + IQS626_EVENT_DEEP_UP, +}; + +enum iqs626_gesture_id { + IQS626_GESTURE_FLICK_X_POS, + IQS626_GESTURE_FLICK_X_NEG, + IQS626_GESTURE_FLICK_Y_POS, + IQS626_GESTURE_FLICK_Y_NEG, + IQS626_GESTURE_TAP, + IQS626_GESTURE_HOLD, + IQS626_NUM_GESTURES, +}; + +struct iqs626_event_desc { + const char *name; + enum iqs626_st_offs st_offs; + enum iqs626_th_offs th_offs; + bool dir_up; + u8 mask; +}; + +static const struct iqs626_event_desc iqs626_events[] = { + [IQS626_EVENT_PROX_DN] = { + .name = "event-prox", + .st_offs = IQS626_ST_OFFS_PROX, + .th_offs = IQS626_TH_OFFS_PROX, + .mask = IQS626_EVENT_MASK_PROX, + }, + [IQS626_EVENT_PROX_UP] = { + .name = "event-prox-alt", + .st_offs = IQS626_ST_OFFS_PROX, + .th_offs = IQS626_TH_OFFS_PROX, + .dir_up = true, + .mask = IQS626_EVENT_MASK_PROX, + }, + [IQS626_EVENT_TOUCH_DN] = { + .name = "event-touch", + .st_offs = IQS626_ST_OFFS_TOUCH, + .th_offs = IQS626_TH_OFFS_TOUCH, + .mask = IQS626_EVENT_MASK_TOUCH, + }, + [IQS626_EVENT_TOUCH_UP] = { + .name = "event-touch-alt", + .st_offs = IQS626_ST_OFFS_TOUCH, + .th_offs = IQS626_TH_OFFS_TOUCH, + .dir_up = true, + .mask = IQS626_EVENT_MASK_TOUCH, + }, + [IQS626_EVENT_DEEP_DN] = { + .name = "event-deep", + .st_offs = IQS626_ST_OFFS_DEEP, + .th_offs = IQS626_TH_OFFS_DEEP, + .mask = IQS626_EVENT_MASK_DEEP, + }, + [IQS626_EVENT_DEEP_UP] = { + .name = "event-deep-alt", + .st_offs = IQS626_ST_OFFS_DEEP, + .th_offs = IQS626_TH_OFFS_DEEP, + .dir_up = true, + .mask = IQS626_EVENT_MASK_DEEP, + }, +}; + +struct iqs626_ver_info { + u8 prod_num; + u8 sw_num; + u8 hw_num; + u8 padding; +} __packed; + +struct iqs626_flags { + __be16 system; + u8 gesture; + u8 padding_a; + u8 states[4]; + u8 ref_active; + u8 padding_b; + u8 comp_min; + u8 comp_max; + u8 trackpad_x; + u8 trackpad_y; +} __packed; + +struct iqs626_ch_reg_ulp { + u8 thresh[2]; + u8 hyst; + u8 filter; + u8 engine[2]; + u8 ati_target; + u8 padding; + __be16 ati_comp; + u8 rx_enable; + u8 tx_enable; +} __packed; + +struct iqs626_ch_reg_tp { + u8 thresh; + u8 ati_base; + __be16 ati_comp; +} __packed; + +struct iqs626_tp_grp_reg { + u8 hyst; + u8 ati_target; + u8 engine[2]; + struct iqs626_ch_reg_tp ch_reg_tp[IQS626_NUM_CH_TP_3]; +} __packed; + +struct iqs626_ch_reg_gen { + u8 thresh[3]; + u8 padding; + u8 hyst; + u8 ati_target; + __be16 ati_comp; + u8 engine[5]; + u8 filter; + u8 rx_enable; + u8 tx_enable; + u8 assoc_select; + u8 assoc_weight; +} __packed; + +struct iqs626_ch_reg_hall { + u8 engine; + u8 thresh; + u8 hyst; + u8 ati_target; + __be16 ati_comp; +} __packed; + +struct iqs626_sys_reg { + __be16 general; + u8 misc_a; + u8 event_mask; + u8 active; + u8 reseed; + u8 rate_np; + u8 rate_lp; + u8 rate_ulp; + u8 timeout_pwr; + u8 timeout_rdy; + u8 timeout_lta; + u8 misc_b; + u8 thresh_swipe; + u8 timeout_tap; + u8 timeout_swipe; + u8 redo_ati; + u8 padding; + struct iqs626_ch_reg_ulp ch_reg_ulp; + struct iqs626_tp_grp_reg tp_grp_reg; + struct iqs626_ch_reg_gen ch_reg_gen[IQS626_NUM_CH_GEN]; + struct iqs626_ch_reg_hall ch_reg_hall; +} __packed; + +struct iqs626_channel_desc { + const char *name; + int num_ch; + u8 active; + bool events[ARRAY_SIZE(iqs626_events)]; +}; + +static const struct iqs626_channel_desc iqs626_channels[] = { + [IQS626_CH_ULP_0] = { + .name = "ulp-0", + .num_ch = 1, + .active = BIT(0), + .events = { + [IQS626_EVENT_PROX_DN] = true, + [IQS626_EVENT_PROX_UP] = true, + [IQS626_EVENT_TOUCH_DN] = true, + [IQS626_EVENT_TOUCH_UP] = true, + }, + }, + [IQS626_CH_TP_2] = { + .name = "trackpad-3x2", + .num_ch = IQS626_NUM_CH_TP_2, + .active = BIT(1), + .events = { + [IQS626_EVENT_TOUCH_DN] = true, + }, + }, + [IQS626_CH_TP_3] = { + .name = "trackpad-3x3", + .num_ch = IQS626_NUM_CH_TP_3, + .active = BIT(2) | BIT(1), + .events = { + [IQS626_EVENT_TOUCH_DN] = true, + }, + }, + [IQS626_CH_GEN_0] = { + .name = "generic-0", + .num_ch = 1, + .active = BIT(4), + .events = { + [IQS626_EVENT_PROX_DN] = true, + [IQS626_EVENT_PROX_UP] = true, + [IQS626_EVENT_TOUCH_DN] = true, + [IQS626_EVENT_TOUCH_UP] = true, + [IQS626_EVENT_DEEP_DN] = true, + [IQS626_EVENT_DEEP_UP] = true, + }, + }, + [IQS626_CH_GEN_1] = { + .name = "generic-1", + .num_ch = 1, + .active = BIT(5), + .events = { + [IQS626_EVENT_PROX_DN] = true, + [IQS626_EVENT_PROX_UP] = true, + [IQS626_EVENT_TOUCH_DN] = true, + [IQS626_EVENT_TOUCH_UP] = true, + [IQS626_EVENT_DEEP_DN] = true, + [IQS626_EVENT_DEEP_UP] = true, + }, + }, + [IQS626_CH_GEN_2] = { + .name = "generic-2", + .num_ch = 1, + .active = BIT(6), + .events = { + [IQS626_EVENT_PROX_DN] = true, + [IQS626_EVENT_PROX_UP] = true, + [IQS626_EVENT_TOUCH_DN] = true, + [IQS626_EVENT_TOUCH_UP] = true, + [IQS626_EVENT_DEEP_DN] = true, + [IQS626_EVENT_DEEP_UP] = true, + }, + }, + [IQS626_CH_HALL] = { + .name = "hall", + .num_ch = 1, + .active = BIT(7), + .events = { + [IQS626_EVENT_TOUCH_DN] = true, + [IQS626_EVENT_TOUCH_UP] = true, + }, + }, +}; + +struct iqs626_private { + struct i2c_client *client; + struct regmap *regmap; + struct iqs626_sys_reg sys_reg; + struct completion ati_done; + struct input_dev *keypad; + struct input_dev *trackpad; + struct touchscreen_properties prop; + unsigned int kp_type[ARRAY_SIZE(iqs626_channels)] + [ARRAY_SIZE(iqs626_events)]; + unsigned int kp_code[ARRAY_SIZE(iqs626_channels)] + [ARRAY_SIZE(iqs626_events)]; + unsigned int tp_code[IQS626_NUM_GESTURES]; + unsigned int suspend_mode; +}; + +static int iqs626_parse_events(struct iqs626_private *iqs626, + const struct fwnode_handle *ch_node, + enum iqs626_ch_id ch_id) +{ + struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg; + struct i2c_client *client = iqs626->client; + const struct fwnode_handle *ev_node; + const char *ev_name; + u8 *thresh, *hyst; + unsigned int thresh_tp[IQS626_NUM_CH_TP_3]; + unsigned int val; + int num_ch = iqs626_channels[ch_id].num_ch; + int error, i, j; + + switch (ch_id) { + case IQS626_CH_ULP_0: + thresh = sys_reg->ch_reg_ulp.thresh; + hyst = &sys_reg->ch_reg_ulp.hyst; + break; + + case IQS626_CH_TP_2: + case IQS626_CH_TP_3: + thresh = &sys_reg->tp_grp_reg.ch_reg_tp[0].thresh; + hyst = &sys_reg->tp_grp_reg.hyst; + break; + + case IQS626_CH_GEN_0: + case IQS626_CH_GEN_1: + case IQS626_CH_GEN_2: + i = ch_id - IQS626_CH_GEN_0; + thresh = sys_reg->ch_reg_gen[i].thresh; + hyst = &sys_reg->ch_reg_gen[i].hyst; + break; + + case IQS626_CH_HALL: + thresh = &sys_reg->ch_reg_hall.thresh; + hyst = &sys_reg->ch_reg_hall.hyst; + break; + + default: + return -EINVAL; + } + + for (i = 0; i < ARRAY_SIZE(iqs626_events); i++) { + if (!iqs626_channels[ch_id].events[i]) + continue; + + if (ch_id == IQS626_CH_TP_2 || ch_id == IQS626_CH_TP_3) { + /* + * Trackpad touch events are simply described under the + * trackpad child node. + */ + ev_node = ch_node; + } else { + ev_name = iqs626_events[i].name; + ev_node = fwnode_get_named_child_node(ch_node, ev_name); + if (!ev_node) + continue; + + if (!fwnode_property_read_u32(ev_node, "linux,code", + &val)) { + iqs626->kp_code[ch_id][i] = val; + + if (fwnode_property_read_u32(ev_node, + "linux,input-type", + &val)) { + if (ch_id == IQS626_CH_HALL) + val = EV_SW; + else + val = EV_KEY; + } + + if (val != EV_KEY && val != EV_SW) { + dev_err(&client->dev, + "Invalid input type: %u\n", + val); + return -EINVAL; + } + + iqs626->kp_type[ch_id][i] = val; + + sys_reg->event_mask &= ~iqs626_events[i].mask; + } + } + + if (!fwnode_property_read_u32(ev_node, "azoteq,hyst", &val)) { + if (val > IQS626_CHx_HYST_MAX) { + dev_err(&client->dev, + "Invalid %s channel hysteresis: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + if (i == IQS626_EVENT_DEEP_DN || + i == IQS626_EVENT_DEEP_UP) { + *hyst &= ~IQS626_CHx_HYST_DEEP_MASK; + *hyst |= (val << IQS626_CHx_HYST_DEEP_SHIFT); + } else if (i == IQS626_EVENT_TOUCH_DN || + i == IQS626_EVENT_TOUCH_UP) { + *hyst &= ~IQS626_CHx_HYST_TOUCH_MASK; + *hyst |= val; + } + } + + if (ch_id != IQS626_CH_TP_2 && ch_id != IQS626_CH_TP_3 && + !fwnode_property_read_u32(ev_node, "azoteq,thresh", &val)) { + if (val > IQS626_CHx_THRESH_MAX) { + dev_err(&client->dev, + "Invalid %s channel threshold: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + if (ch_id == IQS626_CH_HALL) + *thresh = val; + else + *(thresh + iqs626_events[i].th_offs) = val; + + continue; + } + + if (!fwnode_property_present(ev_node, "azoteq,thresh")) + continue; + + error = fwnode_property_read_u32_array(ev_node, "azoteq,thresh", + thresh_tp, num_ch); + if (error) { + dev_err(&client->dev, + "Failed to read %s channel thresholds: %d\n", + fwnode_get_name(ch_node), error); + return error; + } + + for (j = 0; j < num_ch; j++) { + if (thresh_tp[j] > IQS626_CHx_THRESH_MAX) { + dev_err(&client->dev, + "Invalid %s channel threshold: %u\n", + fwnode_get_name(ch_node), thresh_tp[j]); + return -EINVAL; + } + + sys_reg->tp_grp_reg.ch_reg_tp[j].thresh = thresh_tp[j]; + } + } + + return 0; +} + +static int iqs626_parse_ati_target(struct iqs626_private *iqs626, + const struct fwnode_handle *ch_node, + enum iqs626_ch_id ch_id) +{ + struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg; + struct i2c_client *client = iqs626->client; + unsigned int ati_base[IQS626_NUM_CH_TP_3]; + unsigned int val; + u8 *ati_target; + int num_ch = iqs626_channels[ch_id].num_ch; + int error, i; + + switch (ch_id) { + case IQS626_CH_ULP_0: + ati_target = &sys_reg->ch_reg_ulp.ati_target; + break; + + case IQS626_CH_TP_2: + case IQS626_CH_TP_3: + ati_target = &sys_reg->tp_grp_reg.ati_target; + break; + + case IQS626_CH_GEN_0: + case IQS626_CH_GEN_1: + case IQS626_CH_GEN_2: + i = ch_id - IQS626_CH_GEN_0; + ati_target = &sys_reg->ch_reg_gen[i].ati_target; + break; + + case IQS626_CH_HALL: + ati_target = &sys_reg->ch_reg_hall.ati_target; + break; + + default: + return -EINVAL; + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,ati-target", &val)) { + if (val > IQS626_CHx_ATI_TARGET_MAX) { + dev_err(&client->dev, + "Invalid %s channel ATI target: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *ati_target &= ~IQS626_CHx_ATI_TARGET_MASK; + *ati_target |= (val / 32); + } + + if (ch_id != IQS626_CH_TP_2 && ch_id != IQS626_CH_TP_3 && + !fwnode_property_read_u32(ch_node, "azoteq,ati-base", &val)) { + switch (val) { + case 75: + val = IQS626_CHx_ATI_BASE_75; + break; + + case 100: + val = IQS626_CHx_ATI_BASE_100; + break; + + case 150: + val = IQS626_CHx_ATI_BASE_150; + break; + + case 200: + val = IQS626_CHx_ATI_BASE_200; + break; + + default: + dev_err(&client->dev, + "Invalid %s channel ATI base: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *ati_target &= ~IQS626_CHx_ATI_BASE_MASK; + *ati_target |= val; + + return 0; + } + + if (!fwnode_property_present(ch_node, "azoteq,ati-base")) + return 0; + + error = fwnode_property_read_u32_array(ch_node, "azoteq,ati-base", + ati_base, num_ch); + if (error) { + dev_err(&client->dev, + "Failed to read %s channel ATI bases: %d\n", + fwnode_get_name(ch_node), error); + return error; + } + + for (i = 0; i < num_ch; i++) { + if (ati_base[i] < IQS626_TPx_ATI_BASE_MIN || + ati_base[i] > IQS626_TPx_ATI_BASE_MAX) { + dev_err(&client->dev, + "Invalid %s channel ATI base: %u\n", + fwnode_get_name(ch_node), ati_base[i]); + return -EINVAL; + } + + ati_base[i] -= IQS626_TPx_ATI_BASE_MIN; + sys_reg->tp_grp_reg.ch_reg_tp[i].ati_base = ati_base[i]; + } + + return 0; +} + +static int iqs626_parse_pins(struct iqs626_private *iqs626, + const struct fwnode_handle *ch_node, + const char *propname, u8 *enable) +{ + struct i2c_client *client = iqs626->client; + unsigned int val[IQS626_NUM_CRx_TX]; + int error, count, i; + + if (!fwnode_property_present(ch_node, propname)) + return 0; + + count = fwnode_property_count_u32(ch_node, propname); + if (count > IQS626_NUM_CRx_TX) { + dev_err(&client->dev, + "Too many %s channel CRX/TX pins present\n", + fwnode_get_name(ch_node)); + return -EINVAL; + } else if (count < 0) { + dev_err(&client->dev, + "Failed to count %s channel CRX/TX pins: %d\n", + fwnode_get_name(ch_node), count); + return count; + } + + error = fwnode_property_read_u32_array(ch_node, propname, val, count); + if (error) { + dev_err(&client->dev, + "Failed to read %s channel CRX/TX pins: %d\n", + fwnode_get_name(ch_node), error); + return error; + } + + *enable = 0; + + for (i = 0; i < count; i++) { + if (val[i] >= IQS626_NUM_CRx_TX) { + dev_err(&client->dev, + "Invalid %s channel CRX/TX pin: %u\n", + fwnode_get_name(ch_node), val[i]); + return -EINVAL; + } + + *enable |= BIT(val[i]); + } + + return 0; +} + +static int iqs626_parse_trackpad(struct iqs626_private *iqs626, + const struct fwnode_handle *ch_node) +{ + struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg; + struct i2c_client *client = iqs626->client; + u8 *hyst = &sys_reg->tp_grp_reg.hyst; + unsigned int val; + int error, count; + + if (!fwnode_property_read_u32(ch_node, "azoteq,lta-update", &val)) { + if (val > IQS626_MISC_A_TPx_LTA_UPDATE_MAX) { + dev_err(&client->dev, + "Invalid %s channel update rate: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + sys_reg->misc_a &= ~IQS626_MISC_A_TPx_LTA_UPDATE_MASK; + sys_reg->misc_a |= (val << IQS626_MISC_A_TPx_LTA_UPDATE_SHIFT); + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-trackpad", + &val)) { + if (val > IQS626_FILT_STR_MAX) { + dev_err(&client->dev, + "Invalid %s channel filter strength: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + sys_reg->misc_b &= ~IQS626_MISC_B_FILT_STR_TPx; + sys_reg->misc_b |= val; + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-np-cnt", + &val)) { + if (val > IQS626_FILT_STR_MAX) { + dev_err(&client->dev, + "Invalid %s channel filter strength: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *hyst &= ~IQS626_FILT_STR_NP_TPx_MASK; + *hyst |= (val << IQS626_FILT_STR_NP_TPx_SHIFT); + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-lp-cnt", + &val)) { + if (val > IQS626_FILT_STR_MAX) { + dev_err(&client->dev, + "Invalid %s channel filter strength: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *hyst &= ~IQS626_FILT_STR_LP_TPx_MASK; + *hyst |= (val << IQS626_FILT_STR_LP_TPx_SHIFT); + } + + if (!fwnode_property_present(ch_node, "linux,keycodes")) + return 0; + + count = fwnode_property_count_u32(ch_node, "linux,keycodes"); + if (count > IQS626_NUM_GESTURES) { + dev_err(&client->dev, "Too many keycodes present\n"); + return -EINVAL; + } else if (count < 0) { + dev_err(&client->dev, "Failed to count keycodes: %d\n", count); + return count; + } + + error = fwnode_property_read_u32_array(ch_node, "linux,keycodes", + iqs626->tp_code, count); + if (error) { + dev_err(&client->dev, "Failed to read keycodes: %d\n", error); + return error; + } + + sys_reg->misc_b &= ~IQS626_MISC_B_TPx_SWIPE; + if (fwnode_property_present(ch_node, "azoteq,gesture-swipe")) + sys_reg->misc_b |= IQS626_MISC_B_TPx_SWIPE; + + if (!fwnode_property_read_u32(ch_node, "azoteq,timeout-tap-ms", + &val)) { + if (val > IQS626_TIMEOUT_TAP_MS_MAX) { + dev_err(&client->dev, + "Invalid %s channel timeout: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + sys_reg->timeout_tap = val / 16; + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,timeout-swipe-ms", + &val)) { + if (val > IQS626_TIMEOUT_SWIPE_MS_MAX) { + dev_err(&client->dev, + "Invalid %s channel timeout: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + sys_reg->timeout_swipe = val / 16; + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,thresh-swipe", + &val)) { + if (val > IQS626_THRESH_SWIPE_MAX) { + dev_err(&client->dev, + "Invalid %s channel threshold: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + sys_reg->thresh_swipe = val; + } + + sys_reg->event_mask &= ~IQS626_EVENT_MASK_GESTURE; + + return 0; +} + +static int iqs626_parse_channel(struct iqs626_private *iqs626, + const struct fwnode_handle *ch_node, + enum iqs626_ch_id ch_id) +{ + struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg; + struct i2c_client *client = iqs626->client; + u8 *engine, *filter, *rx_enable, *tx_enable; + u8 *assoc_select, *assoc_weight; + unsigned int val; + int error, i; + + switch (ch_id) { + case IQS626_CH_ULP_0: + engine = sys_reg->ch_reg_ulp.engine; + break; + + case IQS626_CH_TP_2: + case IQS626_CH_TP_3: + engine = sys_reg->tp_grp_reg.engine; + break; + + case IQS626_CH_GEN_0: + case IQS626_CH_GEN_1: + case IQS626_CH_GEN_2: + i = ch_id - IQS626_CH_GEN_0; + engine = sys_reg->ch_reg_gen[i].engine; + break; + + case IQS626_CH_HALL: + engine = &sys_reg->ch_reg_hall.engine; + break; + + default: + return -EINVAL; + } + + *engine |= IQS626_CHx_ENG_0_MEAS_CAP_SIZE; + if (fwnode_property_present(ch_node, "azoteq,meas-cap-decrease")) + *engine &= ~IQS626_CHx_ENG_0_MEAS_CAP_SIZE; + + *engine |= IQS626_CHx_ENG_0_RX_TERM_VSS; + if (!fwnode_property_read_u32(ch_node, "azoteq,rx-inactive", &val)) { + switch (val) { + case IQS626_RX_INACTIVE_VSS: + break; + + case IQS626_RX_INACTIVE_FLOAT: + *engine &= ~IQS626_CHx_ENG_0_RX_TERM_VSS; + if (ch_id == IQS626_CH_GEN_0 || + ch_id == IQS626_CH_GEN_1 || + ch_id == IQS626_CH_GEN_2) + *(engine + 4) &= ~IQS626_CHx_ENG_4_RX_TERM_VREG; + break; + + case IQS626_RX_INACTIVE_VREG: + if (ch_id == IQS626_CH_GEN_0 || + ch_id == IQS626_CH_GEN_1 || + ch_id == IQS626_CH_GEN_2) { + *engine &= ~IQS626_CHx_ENG_0_RX_TERM_VSS; + *(engine + 4) |= IQS626_CHx_ENG_4_RX_TERM_VREG; + break; + } + fallthrough; + + default: + dev_err(&client->dev, + "Invalid %s channel CRX pin termination: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + } + + *engine &= ~IQS626_CHx_ENG_0_LINEARIZE; + if (fwnode_property_present(ch_node, "azoteq,linearize")) + *engine |= IQS626_CHx_ENG_0_LINEARIZE; + + *engine &= ~IQS626_CHx_ENG_0_DUAL_DIR; + if (fwnode_property_present(ch_node, "azoteq,dual-direction")) + *engine |= IQS626_CHx_ENG_0_DUAL_DIR; + + *engine &= ~IQS626_CHx_ENG_0_FILT_DISABLE; + if (fwnode_property_present(ch_node, "azoteq,filt-disable")) + *engine |= IQS626_CHx_ENG_0_FILT_DISABLE; + + if (!fwnode_property_read_u32(ch_node, "azoteq,ati-mode", &val)) { + if (val > IQS626_CHx_ENG_0_ATI_MODE_MAX) { + dev_err(&client->dev, + "Invalid %s channel ATI mode: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *engine &= ~IQS626_CHx_ENG_0_ATI_MODE_MASK; + *engine |= val; + } + + if (ch_id == IQS626_CH_HALL) + return 0; + + *(engine + 1) &= ~IQS626_CHx_ENG_1_CCT_ENABLE; + if (!fwnode_property_read_u32(ch_node, "azoteq,cct-increase", + &val) && val) { + unsigned int orig_val = val--; + + /* + * In the case of the generic channels, the charge cycle time + * field doubles in size and straddles two separate registers. + */ + if (ch_id == IQS626_CH_GEN_0 || + ch_id == IQS626_CH_GEN_1 || + ch_id == IQS626_CH_GEN_2) { + *(engine + 4) &= ~IQS626_CHx_ENG_4_CCT_LOW_1; + if (val & BIT(1)) + *(engine + 4) |= IQS626_CHx_ENG_4_CCT_LOW_1; + + *(engine + 4) &= ~IQS626_CHx_ENG_4_CCT_LOW_0; + if (val & BIT(0)) + *(engine + 4) |= IQS626_CHx_ENG_4_CCT_LOW_0; + + val >>= 2; + } + + if (val & ~GENMASK(1, 0)) { + dev_err(&client->dev, + "Invalid %s channel charge cycle time: %u\n", + fwnode_get_name(ch_node), orig_val); + return -EINVAL; + } + + *(engine + 1) &= ~IQS626_CHx_ENG_1_CCT_HIGH_1; + if (val & BIT(1)) + *(engine + 1) |= IQS626_CHx_ENG_1_CCT_HIGH_1; + + *(engine + 1) &= ~IQS626_CHx_ENG_1_CCT_HIGH_0; + if (val & BIT(0)) + *(engine + 1) |= IQS626_CHx_ENG_1_CCT_HIGH_0; + + *(engine + 1) |= IQS626_CHx_ENG_1_CCT_ENABLE; + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,proj-bias", &val)) { + if (val > IQS626_CHx_ENG_1_PROJ_BIAS_MAX) { + dev_err(&client->dev, + "Invalid %s channel bias current: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *(engine + 1) &= ~IQS626_CHx_ENG_1_PROJ_BIAS_MASK; + *(engine + 1) |= (val << IQS626_CHx_ENG_1_PROJ_BIAS_SHIFT); + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,sense-freq", &val)) { + if (val > IQS626_CHx_ENG_1_SENSE_FREQ_MAX) { + dev_err(&client->dev, + "Invalid %s channel sensing frequency: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *(engine + 1) &= ~IQS626_CHx_ENG_1_SENSE_FREQ_MASK; + *(engine + 1) |= (val << IQS626_CHx_ENG_1_SENSE_FREQ_SHIFT); + } + + *(engine + 1) &= ~IQS626_CHx_ENG_1_ATI_BAND_TIGHTEN; + if (fwnode_property_present(ch_node, "azoteq,ati-band-tighten")) + *(engine + 1) |= IQS626_CHx_ENG_1_ATI_BAND_TIGHTEN; + + if (ch_id == IQS626_CH_TP_2 || ch_id == IQS626_CH_TP_3) + return iqs626_parse_trackpad(iqs626, ch_node); + + if (ch_id == IQS626_CH_ULP_0) { + sys_reg->ch_reg_ulp.hyst &= ~IQS626_ULP_PROJ_ENABLE; + if (fwnode_property_present(ch_node, "azoteq,proj-enable")) + sys_reg->ch_reg_ulp.hyst |= IQS626_ULP_PROJ_ENABLE; + + filter = &sys_reg->ch_reg_ulp.filter; + + rx_enable = &sys_reg->ch_reg_ulp.rx_enable; + tx_enable = &sys_reg->ch_reg_ulp.tx_enable; + } else { + i = ch_id - IQS626_CH_GEN_0; + filter = &sys_reg->ch_reg_gen[i].filter; + + rx_enable = &sys_reg->ch_reg_gen[i].rx_enable; + tx_enable = &sys_reg->ch_reg_gen[i].tx_enable; + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-np-cnt", + &val)) { + if (val > IQS626_FILT_STR_MAX) { + dev_err(&client->dev, + "Invalid %s channel filter strength: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *filter &= ~IQS626_FILT_STR_NP_CNT_MASK; + *filter |= (val << IQS626_FILT_STR_NP_CNT_SHIFT); + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-lp-cnt", + &val)) { + if (val > IQS626_FILT_STR_MAX) { + dev_err(&client->dev, + "Invalid %s channel filter strength: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *filter &= ~IQS626_FILT_STR_LP_CNT_MASK; + *filter |= (val << IQS626_FILT_STR_LP_CNT_SHIFT); + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-np-lta", + &val)) { + if (val > IQS626_FILT_STR_MAX) { + dev_err(&client->dev, + "Invalid %s channel filter strength: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *filter &= ~IQS626_FILT_STR_NP_LTA_MASK; + *filter |= (val << IQS626_FILT_STR_NP_LTA_SHIFT); + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,filt-str-lp-lta", + &val)) { + if (val > IQS626_FILT_STR_MAX) { + dev_err(&client->dev, + "Invalid %s channel filter strength: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *filter &= ~IQS626_FILT_STR_LP_LTA_MASK; + *filter |= val; + } + + error = iqs626_parse_pins(iqs626, ch_node, "azoteq,rx-enable", + rx_enable); + if (error) + return error; + + error = iqs626_parse_pins(iqs626, ch_node, "azoteq,tx-enable", + tx_enable); + if (error) + return error; + + if (ch_id == IQS626_CH_ULP_0) + return 0; + + *(engine + 2) &= ~IQS626_CHx_ENG_2_LOCAL_CAP_ENABLE; + if (!fwnode_property_read_u32(ch_node, "azoteq,local-cap-size", + &val) && val) { + unsigned int orig_val = val--; + + if (val > IQS626_CHx_ENG_2_LOCAL_CAP_MAX) { + dev_err(&client->dev, + "Invalid %s channel local cap. size: %u\n", + fwnode_get_name(ch_node), orig_val); + return -EINVAL; + } + + *(engine + 2) &= ~IQS626_CHx_ENG_2_LOCAL_CAP_MASK; + *(engine + 2) |= (val << IQS626_CHx_ENG_2_LOCAL_CAP_SHIFT); + + *(engine + 2) |= IQS626_CHx_ENG_2_LOCAL_CAP_ENABLE; + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,sense-mode", &val)) { + if (val > IQS626_CHx_ENG_2_SENSE_MODE_MAX) { + dev_err(&client->dev, + "Invalid %s channel sensing mode: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *(engine + 2) &= ~IQS626_CHx_ENG_2_SENSE_MODE_MASK; + *(engine + 2) |= val; + } + + if (!fwnode_property_read_u32(ch_node, "azoteq,tx-freq", &val)) { + if (val > IQS626_CHx_ENG_3_TX_FREQ_MAX) { + dev_err(&client->dev, + "Invalid %s channel excitation frequency: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *(engine + 3) &= ~IQS626_CHx_ENG_3_TX_FREQ_MASK; + *(engine + 3) |= (val << IQS626_CHx_ENG_3_TX_FREQ_SHIFT); + } + + *(engine + 3) &= ~IQS626_CHx_ENG_3_INV_LOGIC; + if (fwnode_property_present(ch_node, "azoteq,invert-enable")) + *(engine + 3) |= IQS626_CHx_ENG_3_INV_LOGIC; + + *(engine + 4) &= ~IQS626_CHx_ENG_4_COMP_DISABLE; + if (fwnode_property_present(ch_node, "azoteq,comp-disable")) + *(engine + 4) |= IQS626_CHx_ENG_4_COMP_DISABLE; + + *(engine + 4) &= ~IQS626_CHx_ENG_4_STATIC_ENABLE; + if (fwnode_property_present(ch_node, "azoteq,static-enable")) + *(engine + 4) |= IQS626_CHx_ENG_4_STATIC_ENABLE; + + i = ch_id - IQS626_CH_GEN_0; + assoc_select = &sys_reg->ch_reg_gen[i].assoc_select; + assoc_weight = &sys_reg->ch_reg_gen[i].assoc_weight; + + *assoc_select = 0; + if (!fwnode_property_present(ch_node, "azoteq,assoc-select")) + return 0; + + for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) { + if (fwnode_property_match_string(ch_node, "azoteq,assoc-select", + iqs626_channels[i].name) < 0) + continue; + + *assoc_select |= iqs626_channels[i].active; + } + + if (fwnode_property_read_u32(ch_node, "azoteq,assoc-weight", &val)) + return 0; + + if (val > IQS626_GEN_WEIGHT_MAX) { + dev_err(&client->dev, + "Invalid %s channel associated weight: %u\n", + fwnode_get_name(ch_node), val); + return -EINVAL; + } + + *assoc_weight = val; + + return 0; +} + +static int iqs626_parse_prop(struct iqs626_private *iqs626) +{ + struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg; + struct i2c_client *client = iqs626->client; + struct fwnode_handle *ch_node; + unsigned int val; + int error, i; + u16 general; + + if (!device_property_read_u32(&client->dev, "azoteq,suspend-mode", + &val)) { + if (val > IQS626_SYS_SETTINGS_PWR_MODE_MAX) { + dev_err(&client->dev, "Invalid suspend mode: %u\n", + val); + return -EINVAL; + } + + iqs626->suspend_mode = val; + } + + error = regmap_raw_read(iqs626->regmap, IQS626_SYS_SETTINGS, sys_reg, + sizeof(*sys_reg)); + if (error) + return error; + + general = be16_to_cpu(sys_reg->general); + general &= IQS626_SYS_SETTINGS_ULP_UPDATE_MASK; + + if (device_property_present(&client->dev, "azoteq,clk-div")) + general |= IQS626_SYS_SETTINGS_CLK_DIV; + + if (device_property_present(&client->dev, "azoteq,ulp-enable")) + general |= IQS626_SYS_SETTINGS_ULP_AUTO; + + if (!device_property_read_u32(&client->dev, "azoteq,ulp-update", + &val)) { + if (val > IQS626_SYS_SETTINGS_ULP_UPDATE_MAX) { + dev_err(&client->dev, "Invalid update rate: %u\n", val); + return -EINVAL; + } + + general &= ~IQS626_SYS_SETTINGS_ULP_UPDATE_MASK; + general |= (val << IQS626_SYS_SETTINGS_ULP_UPDATE_SHIFT); + } + + sys_reg->misc_a &= ~IQS626_MISC_A_ATI_BAND_DISABLE; + if (device_property_present(&client->dev, "azoteq,ati-band-disable")) + sys_reg->misc_a |= IQS626_MISC_A_ATI_BAND_DISABLE; + + sys_reg->misc_a &= ~IQS626_MISC_A_ATI_LP_ONLY; + if (device_property_present(&client->dev, "azoteq,ati-lp-only")) + sys_reg->misc_a |= IQS626_MISC_A_ATI_LP_ONLY; + + if (!device_property_read_u32(&client->dev, "azoteq,gpio3-select", + &val)) { + if (val > IQS626_MISC_A_GPIO3_SELECT_MAX) { + dev_err(&client->dev, "Invalid GPIO3 selection: %u\n", + val); + return -EINVAL; + } + + sys_reg->misc_a &= ~IQS626_MISC_A_GPIO3_SELECT_MASK; + sys_reg->misc_a |= val; + } + + if (!device_property_read_u32(&client->dev, "azoteq,reseed-select", + &val)) { + if (val > IQS626_MISC_B_RESEED_UI_SEL_MAX) { + dev_err(&client->dev, "Invalid reseed selection: %u\n", + val); + return -EINVAL; + } + + sys_reg->misc_b &= ~IQS626_MISC_B_RESEED_UI_SEL_MASK; + sys_reg->misc_b |= (val << IQS626_MISC_B_RESEED_UI_SEL_SHIFT); + } + + sys_reg->misc_b &= ~IQS626_MISC_B_THRESH_EXTEND; + if (device_property_present(&client->dev, "azoteq,thresh-extend")) + sys_reg->misc_b |= IQS626_MISC_B_THRESH_EXTEND; + + sys_reg->misc_b &= ~IQS626_MISC_B_TRACKING_UI_ENABLE; + if (device_property_present(&client->dev, "azoteq,tracking-enable")) + sys_reg->misc_b |= IQS626_MISC_B_TRACKING_UI_ENABLE; + + sys_reg->misc_b &= ~IQS626_MISC_B_RESEED_OFFSET; + if (device_property_present(&client->dev, "azoteq,reseed-offset")) + sys_reg->misc_b |= IQS626_MISC_B_RESEED_OFFSET; + + if (!device_property_read_u32(&client->dev, "azoteq,rate-np-ms", + &val)) { + if (val > IQS626_RATE_NP_MS_MAX) { + dev_err(&client->dev, "Invalid report rate: %u\n", val); + return -EINVAL; + } + + sys_reg->rate_np = val; + } + + if (!device_property_read_u32(&client->dev, "azoteq,rate-lp-ms", + &val)) { + if (val > IQS626_RATE_LP_MS_MAX) { + dev_err(&client->dev, "Invalid report rate: %u\n", val); + return -EINVAL; + } + + sys_reg->rate_lp = val; + } + + if (!device_property_read_u32(&client->dev, "azoteq,rate-ulp-ms", + &val)) { + if (val > IQS626_RATE_ULP_MS_MAX) { + dev_err(&client->dev, "Invalid report rate: %u\n", val); + return -EINVAL; + } + + sys_reg->rate_ulp = val / 16; + } + + if (!device_property_read_u32(&client->dev, "azoteq,timeout-pwr-ms", + &val)) { + if (val > IQS626_TIMEOUT_PWR_MS_MAX) { + dev_err(&client->dev, "Invalid timeout: %u\n", val); + return -EINVAL; + } + + sys_reg->timeout_pwr = val / 512; + } + + if (!device_property_read_u32(&client->dev, "azoteq,timeout-lta-ms", + &val)) { + if (val > IQS626_TIMEOUT_LTA_MS_MAX) { + dev_err(&client->dev, "Invalid timeout: %u\n", val); + return -EINVAL; + } + + sys_reg->timeout_lta = val / 512; + } + + sys_reg->event_mask = ~((u8)IQS626_EVENT_MASK_SYS); + sys_reg->redo_ati = 0; + + sys_reg->reseed = 0; + sys_reg->active = 0; + + for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) { + ch_node = device_get_named_child_node(&client->dev, + iqs626_channels[i].name); + if (!ch_node) + continue; + + error = iqs626_parse_channel(iqs626, ch_node, i); + if (error) + return error; + + error = iqs626_parse_ati_target(iqs626, ch_node, i); + if (error) + return error; + + error = iqs626_parse_events(iqs626, ch_node, i); + if (error) + return error; + + if (!fwnode_property_present(ch_node, "azoteq,ati-exclude")) + sys_reg->redo_ati |= iqs626_channels[i].active; + + if (!fwnode_property_present(ch_node, "azoteq,reseed-disable")) + sys_reg->reseed |= iqs626_channels[i].active; + + sys_reg->active |= iqs626_channels[i].active; + } + + general |= IQS626_SYS_SETTINGS_EVENT_MODE; + + /* + * Enable streaming during normal-power mode if the trackpad is used to + * report raw coordinates instead of gestures. In that case, the device + * returns to event mode during low-power mode. + */ + if (sys_reg->active & iqs626_channels[IQS626_CH_TP_2].active && + sys_reg->event_mask & IQS626_EVENT_MASK_GESTURE) + general |= IQS626_SYS_SETTINGS_EVENT_MODE_LP; + + general |= IQS626_SYS_SETTINGS_REDO_ATI; + general |= IQS626_SYS_SETTINGS_ACK_RESET; + + sys_reg->general = cpu_to_be16(general); + + error = regmap_raw_write(iqs626->regmap, IQS626_SYS_SETTINGS, + &iqs626->sys_reg, sizeof(iqs626->sys_reg)); + if (error) + return error; + + iqs626_irq_wait(); + + return 0; +} + +static int iqs626_input_init(struct iqs626_private *iqs626) +{ + struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg; + struct i2c_client *client = iqs626->client; + int error, i, j; + + iqs626->keypad = devm_input_allocate_device(&client->dev); + if (!iqs626->keypad) + return -ENOMEM; + + iqs626->keypad->keycodemax = ARRAY_SIZE(iqs626->kp_code); + iqs626->keypad->keycode = iqs626->kp_code; + iqs626->keypad->keycodesize = sizeof(**iqs626->kp_code); + + iqs626->keypad->name = "iqs626a_keypad"; + iqs626->keypad->id.bustype = BUS_I2C; + + for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) { + if (!(sys_reg->active & iqs626_channels[i].active)) + continue; + + for (j = 0; j < ARRAY_SIZE(iqs626_events); j++) { + if (!iqs626->kp_type[i][j]) + continue; + + input_set_capability(iqs626->keypad, + iqs626->kp_type[i][j], + iqs626->kp_code[i][j]); + } + } + + if (!(sys_reg->active & iqs626_channels[IQS626_CH_TP_2].active)) + return 0; + + iqs626->trackpad = devm_input_allocate_device(&client->dev); + if (!iqs626->trackpad) + return -ENOMEM; + + iqs626->trackpad->keycodemax = ARRAY_SIZE(iqs626->tp_code); + iqs626->trackpad->keycode = iqs626->tp_code; + iqs626->trackpad->keycodesize = sizeof(*iqs626->tp_code); + + iqs626->trackpad->name = "iqs626a_trackpad"; + iqs626->trackpad->id.bustype = BUS_I2C; + + /* + * Present the trackpad as a traditional pointing device if no gestures + * have been mapped to a keycode. + */ + if (sys_reg->event_mask & IQS626_EVENT_MASK_GESTURE) { + u8 tp_mask = iqs626_channels[IQS626_CH_TP_3].active; + + input_set_capability(iqs626->trackpad, EV_KEY, BTN_TOUCH); + input_set_abs_params(iqs626->trackpad, ABS_Y, 0, 255, 0, 0); + + if ((sys_reg->active & tp_mask) == tp_mask) + input_set_abs_params(iqs626->trackpad, + ABS_X, 0, 255, 0, 0); + else + input_set_abs_params(iqs626->trackpad, + ABS_X, 0, 128, 0, 0); + + touchscreen_parse_properties(iqs626->trackpad, false, + &iqs626->prop); + } else { + for (i = 0; i < IQS626_NUM_GESTURES; i++) + if (iqs626->tp_code[i] != KEY_RESERVED) + input_set_capability(iqs626->trackpad, EV_KEY, + iqs626->tp_code[i]); + } + + error = input_register_device(iqs626->trackpad); + if (error) + dev_err(&client->dev, "Failed to register trackpad: %d\n", + error); + + return error; +} + +static int iqs626_report(struct iqs626_private *iqs626) +{ + struct iqs626_sys_reg *sys_reg = &iqs626->sys_reg; + struct i2c_client *client = iqs626->client; + struct iqs626_flags flags; + __le16 hall_output; + int error, i, j; + u8 state; + u8 *dir_mask = &flags.states[IQS626_ST_OFFS_DIR]; + + error = regmap_raw_read(iqs626->regmap, IQS626_SYS_FLAGS, &flags, + sizeof(flags)); + if (error) { + dev_err(&client->dev, "Failed to read device status: %d\n", + error); + return error; + } + + /* + * The device resets itself if its own watchdog bites, which can happen + * in the event of an I2C communication error. In this case, the device + * asserts a SHOW_RESET interrupt and all registers must be restored. + */ + if (be16_to_cpu(flags.system) & IQS626_SYS_FLAGS_SHOW_RESET) { + dev_err(&client->dev, "Unexpected device reset\n"); + + error = regmap_raw_write(iqs626->regmap, IQS626_SYS_SETTINGS, + sys_reg, sizeof(*sys_reg)); + if (error) + dev_err(&client->dev, + "Failed to re-initialize device: %d\n", error); + + return error; + } + + if (be16_to_cpu(flags.system) & IQS626_SYS_FLAGS_IN_ATI) + return 0; + + /* + * Unlike the ULP or generic channels, the Hall channel does not have a + * direction flag. Instead, the direction (i.e. magnet polarity) can be + * derived based on the sign of the 2's complement differential output. + */ + if (sys_reg->active & iqs626_channels[IQS626_CH_HALL].active) { + error = regmap_raw_read(iqs626->regmap, IQS626_HALL_OUTPUT, + &hall_output, sizeof(hall_output)); + if (error) { + dev_err(&client->dev, + "Failed to read Hall output: %d\n", error); + return error; + } + + *dir_mask &= ~iqs626_channels[IQS626_CH_HALL].active; + if (le16_to_cpu(hall_output) < 0x8000) + *dir_mask |= iqs626_channels[IQS626_CH_HALL].active; + } + + for (i = 0; i < ARRAY_SIZE(iqs626_channels); i++) { + if (!(sys_reg->active & iqs626_channels[i].active)) + continue; + + for (j = 0; j < ARRAY_SIZE(iqs626_events); j++) { + if (!iqs626->kp_type[i][j]) + continue; + + state = flags.states[iqs626_events[j].st_offs]; + state &= iqs626_events[j].dir_up ? *dir_mask + : ~(*dir_mask); + state &= iqs626_channels[i].active; + + input_event(iqs626->keypad, iqs626->kp_type[i][j], + iqs626->kp_code[i][j], !!state); + } + } + + input_sync(iqs626->keypad); + + /* + * The following completion signals that ATI has finished, any initial + * switch states have been reported and the keypad can be registered. + */ + complete_all(&iqs626->ati_done); + + if (!(sys_reg->active & iqs626_channels[IQS626_CH_TP_2].active)) + return 0; + + if (sys_reg->event_mask & IQS626_EVENT_MASK_GESTURE) { + state = flags.states[IQS626_ST_OFFS_TOUCH]; + state &= iqs626_channels[IQS626_CH_TP_2].active; + + input_report_key(iqs626->trackpad, BTN_TOUCH, state); + + if (state) + touchscreen_report_pos(iqs626->trackpad, &iqs626->prop, + flags.trackpad_x, + flags.trackpad_y, false); + } else { + for (i = 0; i < IQS626_NUM_GESTURES; i++) + input_report_key(iqs626->trackpad, iqs626->tp_code[i], + flags.gesture & BIT(i)); + + if (flags.gesture & GENMASK(IQS626_GESTURE_TAP, 0)) { + input_sync(iqs626->trackpad); + + /* + * Momentary gestures are followed by a complementary + * release cycle so as to emulate a full keystroke. + */ + for (i = 0; i < IQS626_GESTURE_HOLD; i++) + input_report_key(iqs626->trackpad, + iqs626->tp_code[i], 0); + } + } + + input_sync(iqs626->trackpad); + + return 0; +} + +static irqreturn_t iqs626_irq(int irq, void *context) +{ + struct iqs626_private *iqs626 = context; + + if (iqs626_report(iqs626)) + return IRQ_NONE; + + /* + * The device does not deassert its interrupt (RDY) pin until shortly + * after receiving an I2C stop condition; the following delay ensures + * the interrupt handler does not return before this time. + */ + iqs626_irq_wait(); + + return IRQ_HANDLED; +} + +static const struct regmap_config iqs626_regmap_config = { + .reg_bits = 8, + .val_bits = 16, + .max_register = IQS626_MAX_REG, +}; + +static int iqs626_probe(struct i2c_client *client) +{ + struct iqs626_ver_info ver_info; + struct iqs626_private *iqs626; + int error; + + iqs626 = devm_kzalloc(&client->dev, sizeof(*iqs626), GFP_KERNEL); + if (!iqs626) + return -ENOMEM; + + i2c_set_clientdata(client, iqs626); + iqs626->client = client; + + iqs626->regmap = devm_regmap_init_i2c(client, &iqs626_regmap_config); + if (IS_ERR(iqs626->regmap)) { + error = PTR_ERR(iqs626->regmap); + dev_err(&client->dev, "Failed to initialize register map: %d\n", + error); + return error; + } + + init_completion(&iqs626->ati_done); + + error = regmap_raw_read(iqs626->regmap, IQS626_VER_INFO, &ver_info, + sizeof(ver_info)); + if (error) + return error; + + if (ver_info.prod_num != IQS626_VER_INFO_PROD_NUM) { + dev_err(&client->dev, "Unrecognized product number: 0x%02X\n", + ver_info.prod_num); + return -EINVAL; + } + + error = iqs626_parse_prop(iqs626); + if (error) + return error; + + error = iqs626_input_init(iqs626); + if (error) + return error; + + error = devm_request_threaded_irq(&client->dev, client->irq, + NULL, iqs626_irq, IRQF_ONESHOT, + client->name, iqs626); + if (error) { + dev_err(&client->dev, "Failed to request IRQ: %d\n", error); + return error; + } + + if (!wait_for_completion_timeout(&iqs626->ati_done, + msecs_to_jiffies(2000))) { + dev_err(&client->dev, "Failed to complete ATI\n"); + return -ETIMEDOUT; + } + + /* + * The keypad may include one or more switches and is not registered + * until ATI is complete and the initial switch states are read. + */ + error = input_register_device(iqs626->keypad); + if (error) + dev_err(&client->dev, "Failed to register keypad: %d\n", error); + + return error; +} + +static int __maybe_unused iqs626_suspend(struct device *dev) +{ + struct iqs626_private *iqs626 = dev_get_drvdata(dev); + struct i2c_client *client = iqs626->client; + unsigned int val; + int error; + + if (!iqs626->suspend_mode) + return 0; + + disable_irq(client->irq); + + /* + * Automatic power mode switching must be disabled before the device is + * forced into any particular power mode. In this case, the device will + * transition into normal-power mode. + */ + error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS, + IQS626_SYS_SETTINGS_DIS_AUTO, ~0); + if (error) + goto err_irq; + + /* + * The following check ensures the device has completed its transition + * into normal-power mode before a manual mode switch is performed. + */ + error = regmap_read_poll_timeout(iqs626->regmap, IQS626_SYS_FLAGS, val, + !(val & IQS626_SYS_FLAGS_PWR_MODE_MASK), + IQS626_PWR_MODE_POLL_SLEEP_US, + IQS626_PWR_MODE_POLL_TIMEOUT_US); + if (error) + goto err_irq; + + error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS, + IQS626_SYS_SETTINGS_PWR_MODE_MASK, + iqs626->suspend_mode << + IQS626_SYS_SETTINGS_PWR_MODE_SHIFT); + if (error) + goto err_irq; + + /* + * This last check ensures the device has completed its transition into + * the desired power mode to prevent any spurious interrupts from being + * triggered after iqs626_suspend has already returned. + */ + error = regmap_read_poll_timeout(iqs626->regmap, IQS626_SYS_FLAGS, val, + (val & IQS626_SYS_FLAGS_PWR_MODE_MASK) + == (iqs626->suspend_mode << + IQS626_SYS_FLAGS_PWR_MODE_SHIFT), + IQS626_PWR_MODE_POLL_SLEEP_US, + IQS626_PWR_MODE_POLL_TIMEOUT_US); + +err_irq: + iqs626_irq_wait(); + enable_irq(client->irq); + + return error; +} + +static int __maybe_unused iqs626_resume(struct device *dev) +{ + struct iqs626_private *iqs626 = dev_get_drvdata(dev); + struct i2c_client *client = iqs626->client; + unsigned int val; + int error; + + if (!iqs626->suspend_mode) + return 0; + + disable_irq(client->irq); + + error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS, + IQS626_SYS_SETTINGS_PWR_MODE_MASK, 0); + if (error) + goto err_irq; + + /* + * This check ensures the device has returned to normal-power mode + * before automatic power mode switching is re-enabled. + */ + error = regmap_read_poll_timeout(iqs626->regmap, IQS626_SYS_FLAGS, val, + !(val & IQS626_SYS_FLAGS_PWR_MODE_MASK), + IQS626_PWR_MODE_POLL_SLEEP_US, + IQS626_PWR_MODE_POLL_TIMEOUT_US); + if (error) + goto err_irq; + + error = regmap_update_bits(iqs626->regmap, IQS626_SYS_SETTINGS, + IQS626_SYS_SETTINGS_DIS_AUTO, 0); + if (error) + goto err_irq; + + /* + * This step reports any events that may have been "swallowed" as a + * result of polling PWR_MODE (which automatically acknowledges any + * pending interrupts). + */ + error = iqs626_report(iqs626); + +err_irq: + iqs626_irq_wait(); + enable_irq(client->irq); + + return error; +} + +static SIMPLE_DEV_PM_OPS(iqs626_pm, iqs626_suspend, iqs626_resume); + +static const struct of_device_id iqs626_of_match[] = { + { .compatible = "azoteq,iqs626a" }, + { } +}; +MODULE_DEVICE_TABLE(of, iqs626_of_match); + +static struct i2c_driver iqs626_i2c_driver = { + .driver = { + .name = "iqs626a", + .of_match_table = iqs626_of_match, + .pm = &iqs626_pm, + }, + .probe_new = iqs626_probe, +}; +module_i2c_driver(iqs626_i2c_driver); + +MODULE_AUTHOR("Jeff LaBundy "); +MODULE_DESCRIPTION("Azoteq IQS626A Capacitive Touch Controller"); +MODULE_LICENSE("GPL"); From 9d41359caca7cdc6d3011ba4e485e89d40505e81 Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 22 Mar 2021 20:15:32 -0700 Subject: [PATCH 0232/1379] Input: iqs5xx - make reset GPIO optional The device's hardware reset pin is only required if the platform must be able to update the device's firmware. As such, demote the reset GPIO to optional in support of devices that ship with pre-programmed firmware and don't route the reset pin back to the SoC. In that case, the 'fw_file' attribute is hidden because there is no way to open the bootloader. The logic is extended to the case in which the device does not advertise bootloader support in the first place. Last but not least, remove the hardware reset performed at probe because there is no reason to reset the device manually. A power on reset function already ensures a clean reset at start-up. Signed-off-by: Jeff LaBundy Link: https://lore.kernel.org/r/20210323021006.367-1-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/iqs5xx.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/input/touchscreen/iqs5xx.c b/drivers/input/touchscreen/iqs5xx.c index a990c176abf7..b3fa71213d60 100644 --- a/drivers/input/touchscreen/iqs5xx.c +++ b/drivers/input/touchscreen/iqs5xx.c @@ -835,9 +835,6 @@ static int iqs5xx_fw_file_write(struct i2c_client *client, const char *fw_file) int error, error_init = 0; u8 *pmap; - if (iqs5xx->dev_id_info.bl_status == IQS5XX_BL_STATUS_NONE) - return -EPERM; - pmap = kzalloc(IQS5XX_PMAP_LEN, GFP_KERNEL); if (!pmap) return -ENOMEM; @@ -963,7 +960,22 @@ static struct attribute *iqs5xx_attrs[] = { NULL, }; +static umode_t iqs5xx_attr_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct iqs5xx_private *iqs5xx = dev_get_drvdata(dev); + + if (attr == &dev_attr_fw_file.attr && + (iqs5xx->dev_id_info.bl_status == IQS5XX_BL_STATUS_NONE || + !iqs5xx->reset_gpio)) + return 0; + + return attr->mode; +} + static const struct attribute_group iqs5xx_attr_group = { + .is_visible = iqs5xx_attr_is_visible, .attrs = iqs5xx_attrs, }; @@ -1020,8 +1032,8 @@ static int iqs5xx_probe(struct i2c_client *client, i2c_set_clientdata(client, iqs5xx); iqs5xx->client = client; - iqs5xx->reset_gpio = devm_gpiod_get(&client->dev, - "reset", GPIOD_OUT_LOW); + iqs5xx->reset_gpio = devm_gpiod_get_optional(&client->dev, + "reset", GPIOD_OUT_LOW); if (IS_ERR(iqs5xx->reset_gpio)) { error = PTR_ERR(iqs5xx->reset_gpio); dev_err(&client->dev, "Failed to request GPIO: %d\n", error); @@ -1030,9 +1042,6 @@ static int iqs5xx_probe(struct i2c_client *client, mutex_init(&iqs5xx->lock); - iqs5xx_reset(client); - usleep_range(10000, 10100); - error = iqs5xx_dev_init(client); if (error) return error; From 55f2645c92bda7281adb81a806cd0a014ca9702e Mon Sep 17 00:00:00 2001 From: Jeff LaBundy Date: Mon, 22 Mar 2021 20:16:40 -0700 Subject: [PATCH 0233/1379] dt-bindings: input: iqs5xx: Convert to YAML This patch converts the legacy text-based binding document to YAML format. Extraneous details and touchscreen properties that weren't actually supported have been dropped. The reset GPIO has since been made optional in the driver; this is now reflected here as well. Signed-off-by: Jeff LaBundy Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20210323021006.367-2-jeff@labundy.com Signed-off-by: Dmitry Torokhov --- .../input/touchscreen/azoteq,iqs5xx.yaml | 75 +++++++++++++++++ .../bindings/input/touchscreen/iqs5xx.txt | 80 ------------------- 2 files changed, 75 insertions(+), 80 deletions(-) create mode 100644 Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml delete mode 100644 Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt diff --git a/Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml b/Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml new file mode 100644 index 000000000000..b5f377215c09 --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/azoteq,iqs5xx.yaml @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/touchscreen/azoteq,iqs5xx.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Azoteq IQS550/572/525 Trackpad/Touchscreen Controller + +maintainers: + - Jeff LaBundy + +description: | + The Azoteq IQS550, IQS572 and IQS525 trackpad and touchscreen controllers + employ projected-capacitance sensing and can track up to five independent + contacts. + + Link to datasheet: https://www.azoteq.com/ + +allOf: + - $ref: touchscreen.yaml# + +properties: + compatible: + enum: + - azoteq,iqs550 + - azoteq,iqs572 + - azoteq,iqs525 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + reset-gpios: + maxItems: 1 + + wakeup-source: true + + touchscreen-size-x: true + touchscreen-size-y: true + touchscreen-inverted-x: true + touchscreen-inverted-y: true + touchscreen-swapped-x-y: true + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + #include + #include + + i2c { + #address-cells = <1>; + #size-cells = <0>; + + touchscreen@74 { + compatible = "azoteq,iqs550"; + reg = <0x74>; + interrupt-parent = <&gpio>; + interrupts = <27 IRQ_TYPE_LEVEL_HIGH>; + reset-gpios = <&gpio 22 (GPIO_ACTIVE_LOW | + GPIO_PUSH_PULL)>; + + touchscreen-size-x = <800>; + touchscreen-size-y = <480>; + }; + }; + +... diff --git a/Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt b/Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt deleted file mode 100644 index efa0820e2469..000000000000 --- a/Documentation/devicetree/bindings/input/touchscreen/iqs5xx.txt +++ /dev/null @@ -1,80 +0,0 @@ -Azoteq IQS550/572/525 Trackpad/Touchscreen Controller - -Required properties: - -- compatible : Must be equal to one of the following: - "azoteq,iqs550" - "azoteq,iqs572" - "azoteq,iqs525" - -- reg : I2C slave address for the device. - -- interrupts : GPIO to which the device's active-high RDY - output is connected (see [0]). - -- reset-gpios : GPIO to which the device's active-low NRST - input is connected (see [1]). - -Optional properties: - -- touchscreen-min-x : See [2]. - -- touchscreen-min-y : See [2]. - -- touchscreen-size-x : See [2]. If this property is omitted, the - maximum x-coordinate is specified by the - device's "X Resolution" register. - -- touchscreen-size-y : See [2]. If this property is omitted, the - maximum y-coordinate is specified by the - device's "Y Resolution" register. - -- touchscreen-max-pressure : See [2]. Pressure is expressed as the sum of - the deltas across all channels impacted by a - touch event. A channel's delta is calculated - as its count value minus a reference, where - the count value is inversely proportional to - the channel's capacitance. - -- touchscreen-fuzz-x : See [2]. - -- touchscreen-fuzz-y : See [2]. - -- touchscreen-fuzz-pressure : See [2]. - -- touchscreen-inverted-x : See [2]. Inversion is applied relative to that - which may already be specified by the device's - FLIP_X and FLIP_Y register fields. - -- touchscreen-inverted-y : See [2]. Inversion is applied relative to that - which may already be specified by the device's - FLIP_X and FLIP_Y register fields. - -- touchscreen-swapped-x-y : See [2]. Swapping is applied relative to that - which may already be specified by the device's - SWITCH_XY_AXIS register field. - -[0]: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt -[1]: Documentation/devicetree/bindings/gpio/gpio.txt -[2]: Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt - -Example: - - &i2c1 { - /* ... */ - - touchscreen@74 { - compatible = "azoteq,iqs550"; - reg = <0x74>; - interrupt-parent = <&gpio>; - interrupts = <17 4>; - reset-gpios = <&gpio 27 1>; - - touchscreen-size-x = <640>; - touchscreen-size-y = <480>; - - touchscreen-max-pressure = <16000>; - }; - - /* ... */ - }; From 84c36ab7a6ddeab213c979d22b6372f71d738862 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 22 Mar 2021 20:25:26 -0700 Subject: [PATCH 0234/1379] Input: cyttsp - verbose error on soft reset The first thing the Cypress driver does when starting up is to try a soft reset. This is the first point where the driver SPI/I2C communication can fail, so put out some nice debug text: cyttsp-spi spi2.0: failed to send soft reset Instead of just: cyttsp-spi: probe of spi2.0 failed with error -5 This is more helpful. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20210322221349.1116666-1-linus.walleij@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/cyttsp_core.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c index 73c854f35f33..e7b6b6c87515 100644 --- a/drivers/input/touchscreen/cyttsp_core.c +++ b/drivers/input/touchscreen/cyttsp_core.c @@ -238,7 +238,6 @@ static void cyttsp_hard_reset(struct cyttsp *ts) static int cyttsp_soft_reset(struct cyttsp *ts) { - unsigned long timeout; int retval; /* wait for interrupt to set ready completion */ @@ -248,12 +247,16 @@ static int cyttsp_soft_reset(struct cyttsp *ts) enable_irq(ts->irq); retval = ttsp_send_command(ts, CY_SOFT_RESET_MODE); - if (retval) + if (retval) { + dev_err(ts->dev, "failed to send soft reset\n"); goto out; + } - timeout = wait_for_completion_timeout(&ts->bl_ready, - msecs_to_jiffies(CY_DELAY_DFLT * CY_DELAY_MAX)); - retval = timeout ? 0 : -EIO; + if (!wait_for_completion_timeout(&ts->bl_ready, + msecs_to_jiffies(CY_DELAY_DFLT * CY_DELAY_MAX))) { + dev_err(ts->dev, "timeout waiting for soft reset\n"); + retval = -EIO; + } out: ts->state = CY_IDLE_STATE; From 30c3d39f7f78f3b232f6a6f6357a545cbe23cc16 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 25 Mar 2021 16:37:52 +0000 Subject: [PATCH 0235/1379] tracing: A minor cleanup for create_system_filter() The first two parameters should be reduced to one, as @tr is simply @dir->tr. Link: https://lkml.kernel.org/r/20210324205642.65e03248@oasis.local.home Link: https://lkml.kernel.org/r/20210325163752.128407-1-hqjagain@gmail.com Suggested-by: Steven Rostedt (VMware) Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 49de3e21e9bc..c2dd697cc9c0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1750,7 +1750,6 @@ int create_event_filter(struct trace_array *tr, * and always remembers @filter_str. */ static int create_system_filter(struct trace_subsystem_dir *dir, - struct trace_array *tr, char *filter_str, struct event_filter **filterp) { struct filter_parse_error *pe = NULL; @@ -1758,13 +1757,13 @@ static int create_system_filter(struct trace_subsystem_dir *dir, err = create_filter_start(filter_str, true, &pe, filterp); if (!err) { - err = process_system_preds(dir, tr, pe, filter_str); + err = process_system_preds(dir, dir->tr, pe, filter_str); if (!err) { /* System filters just show a default message */ kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { - append_filter_err(tr, pe, *filterp); + append_filter_err(dir->tr, pe, *filterp); } } create_filter_finish(pe); @@ -1852,7 +1851,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, goto out_unlock; } - err = create_system_filter(dir, tr, filter_string, &filter); + err = create_system_filter(dir, filter_string, &filter); if (filter) { /* * No event actually uses the system filter From 70193038a6ec9bbf10990a126432b0cbf56aa339 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 25 Mar 2021 16:19:10 +0000 Subject: [PATCH 0236/1379] tracing: Update create_system_filter() kernel-doc comment commit f306cc82a93d ("tracing: Update event filters for multibuffer") added the parameter @tr for create_system_filter(). commit bb9ef1cb7d86 ("tracing: Change apply_subsystem_event_filter() paths to check file->system == dir") changed the parameter from @system to @dir. Link: https://lkml.kernel.org/r/20210325161911.123452-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c2dd697cc9c0..c9124038b140 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1693,6 +1693,7 @@ static void create_filter_finish(struct filter_parse_error *pe) /** * create_filter - create a filter for a trace_event_call + * @tr: the trace array associated with these events * @call: trace_event_call to create a filter for * @filter_str: filter string * @set_str: remember @filter_str and enable detailed error in filter @@ -1741,8 +1742,8 @@ int create_event_filter(struct trace_array *tr, } /** - * create_system_filter - create a filter for an event_subsystem - * @system: event_subsystem to create a filter for + * create_system_filter - create a filter for an event subsystem + * @dir: the descriptor for the subsystem directory * @filter_str: filter string * @filterp: out param for created filter (always updated on return) * From 34a6ae672645a89f760960a11ce80125cc4d361f Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 25 Mar 2021 11:39:29 +0100 Subject: [PATCH 0237/1379] leds: Kconfig: LEDS_CLASS is usually selected. People should really say Y to the LEDS_CLASS prompt. Signed-off-by: Pavel Machek --- drivers/leds/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 4ca8cd594518..49d99cb084db 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -18,7 +18,7 @@ config LEDS_CLASS tristate "LED Class Support" help This option enables the LED sysfs class in /sys/class/leds. You'll - need this to do anything useful with LEDs. If unsure, say N. + need this to do anything useful with LEDs. If unsure, say Y. config LEDS_CLASS_FLASH tristate "LED Flash Class Support" From bcd9730a04a1f18d873adb3907f2b4830b88ee9a Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 25 Mar 2021 14:43:18 -0700 Subject: [PATCH 0238/1379] Input: move to use request_irq by IRQF_NO_AUTOEN flag disable_irq() after request_irq() still has a time gap in which interrupts can come. request_irq() with IRQF_NO_AUTOEN flag will disable IRQ auto-enable because of requesting. On the other hand, request_irq() after setting IRQ_NOAUTOEN as below irq_set_status_flags(irq, IRQ_NOAUTOEN); request_irq(dev, irq...); can also be replaced by request_irq() with IRQF_NO_AUTOEN flag. Signed-off-by: Barry Song Link: https://lore.kernel.org/r/20210302224916.13980-3-song.bao.hua@hisilicon.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/tca6416-keypad.c | 3 +-- drivers/input/keyboard/tegra-kbc.c | 5 ++--- drivers/input/touchscreen/ar1021_i2c.c | 5 +---- drivers/input/touchscreen/atmel_mxt_ts.c | 5 ++--- drivers/input/touchscreen/bu21029_ts.c | 4 ++-- drivers/input/touchscreen/cyttsp_core.c | 5 ++--- drivers/input/touchscreen/melfas_mip4.c | 5 ++--- drivers/input/touchscreen/mms114.c | 4 ++-- drivers/input/touchscreen/stmfts.c | 3 +-- drivers/input/touchscreen/wm831x-ts.c | 3 +-- drivers/input/touchscreen/zinitix.c | 4 ++-- 11 files changed, 18 insertions(+), 28 deletions(-) diff --git a/drivers/input/keyboard/tca6416-keypad.c b/drivers/input/keyboard/tca6416-keypad.c index 9b0f9665dcb0..2a9755910065 100644 --- a/drivers/input/keyboard/tca6416-keypad.c +++ b/drivers/input/keyboard/tca6416-keypad.c @@ -274,7 +274,7 @@ static int tca6416_keypad_probe(struct i2c_client *client, error = request_threaded_irq(chip->irqnum, NULL, tca6416_keys_isr, IRQF_TRIGGER_FALLING | - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "tca6416-keypad", chip); if (error) { dev_dbg(&client->dev, @@ -282,7 +282,6 @@ static int tca6416_keypad_probe(struct i2c_client *client, chip->irqnum, error); goto fail1; } - disable_irq(chip->irqnum); } error = input_register_device(input); diff --git a/drivers/input/keyboard/tegra-kbc.c b/drivers/input/keyboard/tegra-kbc.c index 9671842a082a..570fe18c0ce9 100644 --- a/drivers/input/keyboard/tegra-kbc.c +++ b/drivers/input/keyboard/tegra-kbc.c @@ -694,14 +694,13 @@ static int tegra_kbc_probe(struct platform_device *pdev) input_set_drvdata(kbc->idev, kbc); err = devm_request_irq(&pdev->dev, kbc->irq, tegra_kbc_isr, - IRQF_TRIGGER_HIGH, pdev->name, kbc); + IRQF_TRIGGER_HIGH | IRQF_NO_AUTOEN, + pdev->name, kbc); if (err) { dev_err(&pdev->dev, "failed to request keyboard IRQ\n"); return err; } - disable_irq(kbc->irq); - err = input_register_device(kbc->idev); if (err) { dev_err(&pdev->dev, "failed to register input device\n"); diff --git a/drivers/input/touchscreen/ar1021_i2c.c b/drivers/input/touchscreen/ar1021_i2c.c index c0d5c2413356..dc6a85362a40 100644 --- a/drivers/input/touchscreen/ar1021_i2c.c +++ b/drivers/input/touchscreen/ar1021_i2c.c @@ -125,7 +125,7 @@ static int ar1021_i2c_probe(struct i2c_client *client, error = devm_request_threaded_irq(&client->dev, client->irq, NULL, ar1021_i2c_irq, - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "ar1021_i2c", ar1021); if (error) { dev_err(&client->dev, @@ -133,9 +133,6 @@ static int ar1021_i2c_probe(struct i2c_client *client, return error; } - /* Disable the IRQ, we'll enable it in ar1021_i2c_open() */ - disable_irq(client->irq); - error = input_register_device(ar1021->input); if (error) { dev_err(&client->dev, diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 5ed23689047b..05de92c0293b 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -3215,15 +3215,14 @@ static int mxt_probe(struct i2c_client *client, const struct i2c_device_id *id) } error = devm_request_threaded_irq(&client->dev, client->irq, - NULL, mxt_interrupt, IRQF_ONESHOT, + NULL, mxt_interrupt, + IRQF_ONESHOT | IRQF_NO_AUTOEN, client->name, data); if (error) { dev_err(&client->dev, "Failed to register interrupt\n"); return error; } - disable_irq(client->irq); - error = regulator_bulk_enable(ARRAY_SIZE(data->regulators), data->regulators); if (error) { diff --git a/drivers/input/touchscreen/bu21029_ts.c b/drivers/input/touchscreen/bu21029_ts.c index 341925edb8e6..392950aa7856 100644 --- a/drivers/input/touchscreen/bu21029_ts.c +++ b/drivers/input/touchscreen/bu21029_ts.c @@ -401,10 +401,10 @@ static int bu21029_probe(struct i2c_client *client, input_set_drvdata(in_dev, bu21029); - irq_set_status_flags(client->irq, IRQ_NOAUTOEN); error = devm_request_threaded_irq(&client->dev, client->irq, NULL, bu21029_touch_soft_irq, - IRQF_ONESHOT, DRIVER_NAME, bu21029); + IRQF_ONESHOT | IRQF_NO_AUTOEN, + DRIVER_NAME, bu21029); if (error) { dev_err(&client->dev, "unable to request touch irq: %d\n", error); diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c index e7b6b6c87515..c59aa6b8d257 100644 --- a/drivers/input/touchscreen/cyttsp_core.c +++ b/drivers/input/touchscreen/cyttsp_core.c @@ -655,7 +655,8 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops, } error = devm_request_threaded_irq(dev, ts->irq, NULL, cyttsp_irq, - IRQF_TRIGGER_FALLING | IRQF_ONESHOT, + IRQF_TRIGGER_FALLING | IRQF_ONESHOT | + IRQF_NO_AUTOEN, "cyttsp", ts); if (error) { dev_err(ts->dev, "failed to request IRQ %d, err: %d\n", @@ -663,8 +664,6 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops, return ERR_PTR(error); } - disable_irq(ts->irq); - cyttsp_hard_reset(ts); error = cyttsp_power_on(ts); diff --git a/drivers/input/touchscreen/melfas_mip4.c b/drivers/input/touchscreen/melfas_mip4.c index 225796a3f546..2745bf1aee38 100644 --- a/drivers/input/touchscreen/melfas_mip4.c +++ b/drivers/input/touchscreen/melfas_mip4.c @@ -1502,7 +1502,8 @@ static int mip4_probe(struct i2c_client *client, const struct i2c_device_id *id) error = devm_request_threaded_irq(&client->dev, client->irq, NULL, mip4_interrupt, - IRQF_ONESHOT, MIP4_DEVICE_NAME, ts); + IRQF_ONESHOT | IRQF_NO_AUTOEN, + MIP4_DEVICE_NAME, ts); if (error) { dev_err(&client->dev, "Failed to request interrupt %d: %d\n", @@ -1510,8 +1511,6 @@ static int mip4_probe(struct i2c_client *client, const struct i2c_device_id *id) return error; } - disable_irq(client->irq); - error = input_register_device(input); if (error) { dev_err(&client->dev, diff --git a/drivers/input/touchscreen/mms114.c b/drivers/input/touchscreen/mms114.c index 16557f51b09d..7043f57ea2dd 100644 --- a/drivers/input/touchscreen/mms114.c +++ b/drivers/input/touchscreen/mms114.c @@ -530,13 +530,13 @@ static int mms114_probe(struct i2c_client *client, } error = devm_request_threaded_irq(&client->dev, client->irq, - NULL, mms114_interrupt, IRQF_ONESHOT, + NULL, mms114_interrupt, + IRQF_ONESHOT | IRQF_NO_AUTOEN, dev_name(&client->dev), data); if (error) { dev_err(&client->dev, "Failed to register interrupt\n"); return error; } - disable_irq(client->irq); error = input_register_device(data->input_dev); if (error) { diff --git a/drivers/input/touchscreen/stmfts.c b/drivers/input/touchscreen/stmfts.c index 9a64e1dbc04a..bc11203c9cf7 100644 --- a/drivers/input/touchscreen/stmfts.c +++ b/drivers/input/touchscreen/stmfts.c @@ -691,10 +691,9 @@ static int stmfts_probe(struct i2c_client *client, * interrupts. To be on the safe side it's better to not enable * the interrupts during their request. */ - irq_set_status_flags(client->irq, IRQ_NOAUTOEN); err = devm_request_threaded_irq(&client->dev, client->irq, NULL, stmfts_irq_handler, - IRQF_ONESHOT, + IRQF_ONESHOT | IRQF_NO_AUTOEN, "stmfts_irq", sdata); if (err) return err; diff --git a/drivers/input/touchscreen/wm831x-ts.c b/drivers/input/touchscreen/wm831x-ts.c index bb1699e0d3c7..319f57fb9af5 100644 --- a/drivers/input/touchscreen/wm831x-ts.c +++ b/drivers/input/touchscreen/wm831x-ts.c @@ -317,14 +317,13 @@ static int wm831x_ts_probe(struct platform_device *pdev) error = request_threaded_irq(wm831x_ts->data_irq, NULL, wm831x_ts_data_irq, - irqf | IRQF_ONESHOT, + irqf | IRQF_ONESHOT | IRQF_NO_AUTOEN, "Touchscreen data", wm831x_ts); if (error) { dev_err(&pdev->dev, "Failed to request data IRQ %d: %d\n", wm831x_ts->data_irq, error); goto err_alloc; } - disable_irq(wm831x_ts->data_irq); if (pdata && pdata->pd_irqf) irqf = pdata->pd_irqf; diff --git a/drivers/input/touchscreen/zinitix.c b/drivers/input/touchscreen/zinitix.c index 3b636beb583c..b8d901099378 100644 --- a/drivers/input/touchscreen/zinitix.c +++ b/drivers/input/touchscreen/zinitix.c @@ -513,10 +513,10 @@ static int zinitix_ts_probe(struct i2c_client *client) return -EINVAL; } - irq_set_status_flags(client->irq, IRQ_NOAUTOEN); error = devm_request_threaded_irq(&client->dev, client->irq, NULL, zinitix_ts_irq_handler, - IRQF_ONESHOT, client->name, bt541); + IRQF_ONESHOT | IRQF_NO_AUTOEN, + client->name, bt541); if (error) { dev_err(&client->dev, "Failed to request IRQ: %d\n", error); return error; From bfcf3d48dd02e95808a4693f2a49163f40fa5e74 Mon Sep 17 00:00:00 2001 From: Nikolai Kostrigin Date: Tue, 23 Mar 2021 12:18:55 -0700 Subject: [PATCH 0239/1379] Input: elan_i2c - fix a typo in parameter name s/max_baseliune/max_baseline/ Signed-off-by: Nikolai Kostrigin Link: https://lore.kernel.org/r/20210311114146.1977616-1-nickel@altlinux.org Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/elan_i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/mouse/elan_i2c.h b/drivers/input/mouse/elan_i2c.h index 838b3b346316..dc4a240f4489 100644 --- a/drivers/input/mouse/elan_i2c.h +++ b/drivers/input/mouse/elan_i2c.h @@ -78,7 +78,7 @@ struct elan_transport_ops { int (*calibrate_result)(struct i2c_client *client, u8 *val); int (*get_baseline_data)(struct i2c_client *client, - bool max_baseliune, u8 *value); + bool max_baseline, u8 *value); int (*get_version)(struct i2c_client *client, u8 pattern, bool iap, u8 *version); From 0bb2045ce5ce67b0428301c117ec960b3f705a44 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Tue, 9 Mar 2021 13:21:18 +0800 Subject: [PATCH 0240/1379] f2fs: fix to use per-inode maxbytes in f2fs_fiemap F2FS inode may have different max size, so change to use per-inode maxbytes. Signed-off-by: Chengguang Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0e749cf60e11..4bf7e79c8342 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1837,6 +1837,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int ret = 0; bool compr_cluster = false; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { ret = f2fs_precache_extents(inode); @@ -1850,6 +1851,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + if (start > maxbytes) { + ret = -EFBIG; + goto out; + } + + if (len > maxbytes || (maxbytes - len) < start) + len = maxbytes - start; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { ret = f2fs_xattr_fiemap(inode, fieinfo); goto out; From 5ac443e26a096429065349c640538101012ce40d Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 15 Mar 2021 17:12:33 +0900 Subject: [PATCH 0241/1379] f2fs: add sysfs nodes to get runtime compression stat I've added new sysfs nodes to show runtime compression stat since mount. compr_written_block - show the block count written after compression compr_saved_block - show the saved block count with compression compr_new_inode - show the count of inode newly enabled for compression Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 24 ++++++++++++++++ fs/f2fs/compress.c | 1 + fs/f2fs/f2fs.h | 19 +++++++++++++ fs/f2fs/sysfs.c | 38 +++++++++++++++++++++++++ 4 files changed, 82 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 4aa8f38b52d7..4849b8e84e42 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -414,3 +414,27 @@ What: /sys/fs/f2fs//ovp_segments Date: March 2021 Contact: "Jaegeuk Kim" Description: Shows the number of overprovision segments. + +What: /sys/fs/f2fs//compr_written_block +Date: March 2021 +Contact: "Daeho Jeong" +Description: Show the block count written after compression since mount. Note + that when the compressed blocks are deleted, this count doesn't + decrease. If you write "0" here, you can initialize + compr_written_block and compr_saved_block to "0". + +What: /sys/fs/f2fs//compr_saved_block +Date: March 2021 +Contact: "Daeho Jeong" +Description: Show the saved block count with compression since mount. Note + that when the compressed blocks are deleted, this count doesn't + decrease. If you write "0" here, you can initialize + compr_written_block and compr_saved_block to "0". + +What: /sys/fs/f2fs//compr_new_inode +Date: March 2021 +Contact: "Daeho Jeong" +Description: Show the count of inode newly enabled for compression since mount. + Note that when the compression is disabled for the files, this count + doesn't decrease. If you write "0" here, you can initialize + compr_new_inode to "0". diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 77fa342de38f..3c9d797dbdd6 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1353,6 +1353,7 @@ unlock_continue: if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); + add_compr_block_stat(inode, cc->nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ac57072d73cf..165bfe2a5a0e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1628,6 +1628,11 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ + + /* For runtime compression statistics */ + u64 compr_written_block; + u64 compr_saved_block; + u32 compr_new_inode; #endif }; @@ -3961,6 +3966,18 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); +#define inc_compr_inode_stat(inode) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + sbi->compr_new_inode++; \ + } while (0) +#define add_compr_block_stat(inode, blocks) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + int diff = F2FS_I(inode)->i_cluster_size - blocks; \ + sbi->compr_written_block += blocks; \ + sbi->compr_saved_block += diff; \ + } while (0) #else static inline bool f2fs_is_compressed_page(struct page *page) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -3989,6 +4006,7 @@ static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } +#define inc_compr_inode_stat(inode) do { } while (0) #endif static inline void set_compress_context(struct inode *inode) @@ -4012,6 +4030,7 @@ static inline void set_compress_context(struct inode *inode) F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); + inc_compr_inode_stat(inode); f2fs_mark_inode_dirty_sync(inode, true); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 0c391ab2d8b7..39b522ec73e7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -289,6 +290,17 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_written_block); + + if (!strcmp(a->attr.name, "compr_saved_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block); + + if (!strcmp(a->attr.name, "compr_new_inode")) + return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); +#endif + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -465,6 +477,24 @@ out: return count; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block") || + !strcmp(a->attr.name, "compr_saved_block")) { + if (t != 0) + return -EINVAL; + sbi->compr_written_block = 0; + sbi->compr_saved_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "compr_new_inode")) { + if (t != 0) + return -EINVAL; + sbi->compr_new_inode = 0; + return count; + } +#endif + *ui = (unsigned int)t; return count; @@ -676,6 +706,9 @@ F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); #endif #define ATTR_LIST(name) (&f2fs_attr_##name.attr) @@ -739,6 +772,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(moved_blocks_foreground), ATTR_LIST(moved_blocks_background), ATTR_LIST(avg_vblocks), +#endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + ATTR_LIST(compr_written_block), + ATTR_LIST(compr_saved_block), + ATTR_LIST(compr_new_inode), #endif NULL, }; From ac2d750b2043cbe10d42ac974e07b9876cddfff8 Mon Sep 17 00:00:00 2001 From: Weichao Guo Date: Wed, 17 Mar 2021 17:27:23 +0800 Subject: [PATCH 0242/1379] f2fs: do not use AT_SSR mode in FG_GC & high urgent BG_GC AT_SSR mode is introduced by age threshold based GC for better hot/cold data seperation and avoiding free segment cost. However, LFS write mode is preferred in the scenario of foreground or high urgent GC, which should be finished ASAP. Let's only use AT_SSR in background GC and not high urgent GC modes. Signed-off-by: Weichao Guo Signed-off-by: Huang Jianan Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 3 ++- fs/f2fs/segment.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 86ba8ed0b8a7..d96acc6531f2 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1120,7 +1120,8 @@ static int move_data_block(struct inode *inode, block_t bidx, block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); - int type = fio.sbi->am.atgc_enabled ? + int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) && + (fio.sbi->gc_mode != GC_URGENT_HIGH) ? CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f2b22da8c134..b6f19518afd1 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3263,7 +3263,9 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) struct inode *inode = fio->page->mapping->host; if (is_cold_data(fio->page)) { - if (fio->sbi->am.atgc_enabled) + if (fio->sbi->am.atgc_enabled && + (fio->io_type == FS_DATA_IO) && + (fio->sbi->gc_mode != GC_URGENT_HIGH)) return CURSEG_ALL_DATA_ATGC; else return CURSEG_COLD_DATA; From 3f7070b05052f997d571a51e750583b9dea726f8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Mar 2021 17:56:03 +0800 Subject: [PATCH 0243/1379] f2fs: don't start checkpoint thread in readonly mountpoint In readonly mountpoint, there should be no write IOs include checkpoint IO, so that it's not needed to create kernel checkpoint thread. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e03b2f0f69d0..721dc64609b4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2069,8 +2069,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } - if (!test_opt(sbi, DISABLE_CHECKPOINT) && - test_opt(sbi, MERGE_CHECKPOINT)) { + if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + !test_opt(sbi, MERGE_CHECKPOINT)) { + f2fs_stop_ckpt_thread(sbi); + } else { err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, @@ -2078,8 +2080,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err); goto restore_gc; } - } else { - f2fs_stop_ckpt_thread(sbi); } /* @@ -3835,7 +3835,7 @@ try_onemore: /* setup checkpoint request control and start checkpoint issue thread */ f2fs_init_ckpt_req_control(sbi); - if (!test_opt(sbi, DISABLE_CHECKPOINT) && + if (!f2fs_readonly(sb) && !test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, MERGE_CHECKPOINT)) { err = f2fs_start_ckpt_thread(sbi); if (err) { From b862676e371715456c9dade7990c8004996d0d9e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 22 Mar 2021 19:47:30 +0800 Subject: [PATCH 0244/1379] f2fs: fix to avoid out-of-bounds memory access butt3rflyh4ck reported a bug found by syzkaller fuzzer with custom modifications in 5.12.0-rc3+ [1]: dump_stack+0xfa/0x151 lib/dump_stack.c:120 print_address_description.constprop.0.cold+0x82/0x32c mm/kasan/report.c:232 __kasan_report mm/kasan/report.c:399 [inline] kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:416 f2fs_test_bit fs/f2fs/f2fs.h:2572 [inline] current_nat_addr fs/f2fs/node.h:213 [inline] get_next_nat_page fs/f2fs/node.c:123 [inline] __flush_nat_entry_set fs/f2fs/node.c:2888 [inline] f2fs_flush_nat_entries+0x258e/0x2960 fs/f2fs/node.c:2991 f2fs_write_checkpoint+0x1372/0x6a70 fs/f2fs/checkpoint.c:1640 f2fs_issue_checkpoint+0x149/0x410 fs/f2fs/checkpoint.c:1807 f2fs_sync_fs+0x20f/0x420 fs/f2fs/super.c:1454 __sync_filesystem fs/sync.c:39 [inline] sync_filesystem fs/sync.c:67 [inline] sync_filesystem+0x1b5/0x260 fs/sync.c:48 generic_shutdown_super+0x70/0x370 fs/super.c:448 kill_block_super+0x97/0xf0 fs/super.c:1394 The root cause is, if nat entry in checkpoint journal area is corrupted, e.g. nid of journalled nat entry exceeds max nid value, during checkpoint, once it tries to flush nat journal to NAT area, get_next_nat_page() may access out-of-bounds memory on nat_bitmap due to it uses wrong nid value as bitmap offset. [1] https://lore.kernel.org/lkml/CAFcO6XOMWdr8pObek6eN6-fs58KG9doRFadgJj-FnF-1x43s2g@mail.gmail.com/T/#u Reported-and-tested-by: butt3rflyh4ck Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4b0e2e3c2c88..45c8cf1afe66 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2785,6 +2785,9 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) struct f2fs_nat_entry raw_ne; nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); + if (f2fs_check_nid_range(sbi, nid)) + continue; + raw_ne = nat_in_journal(journal, i); ne = __lookup_nat_cache(nm_i, nid); From f3e367d4fe2bcccb51d64cb974f73153d23adf15 Mon Sep 17 00:00:00 2001 From: qiulaibin Date: Tue, 23 Mar 2021 19:41:30 +0800 Subject: [PATCH 0245/1379] f2fs: fix wrong comment of nat_tree_lock Do trivial comment fix of nat_tree_lock. Signed-off-by: qiulaibin Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 165bfe2a5a0e..eb154d9cb063 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -865,7 +865,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct rw_semaphore nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ From 3fd9735908287cdcd7dd04912e8ba7d749313f13 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 17 Mar 2021 17:56:04 +0800 Subject: [PATCH 0246/1379] f2fs: fix error path of f2fs_remount() In error path of f2fs_remount(), it missed to restart/stop kernel thread or enable/disable checkpoint, then mount option status may not be consistent with real condition of filesystem, so let's reorder remount flow a bit as below and do recovery correctly in error path: 1) handle gc thread 2) handle ckpt thread 3) handle flush thread 4) handle checkpoint disabling Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 721dc64609b4..b48281642e98 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1927,8 +1927,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; int err; - bool need_restart_gc = false; - bool need_stop_gc = false; + bool need_restart_gc = false, need_stop_gc = false; + bool need_restart_ckpt = false, need_stop_ckpt = false; + bool need_restart_flush = false, need_stop_flush = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); @@ -2059,19 +2060,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_IS_CLOSE); } - if (checkpoint_changed) { - if (test_opt(sbi, DISABLE_CHECKPOINT)) { - err = f2fs_disable_checkpoint(sbi); - if (err) - goto restore_gc; - } else { - f2fs_enable_checkpoint(sbi); - } - } - if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || !test_opt(sbi, MERGE_CHECKPOINT)) { f2fs_stop_ckpt_thread(sbi); + need_restart_ckpt = true; } else { err = f2fs_start_ckpt_thread(sbi); if (err) { @@ -2080,6 +2072,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err); goto restore_gc; } + need_stop_ckpt = true; } /* @@ -2089,11 +2082,24 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); + need_restart_flush = true; } else { err = f2fs_create_flush_cmd_control(sbi); if (err) - goto restore_gc; + goto restore_ckpt; + need_stop_flush = true; } + + if (checkpoint_changed) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto restore_flush; + } else { + f2fs_enable_checkpoint(sbi); + } + } + skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ @@ -2108,6 +2114,21 @@ skip: adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_flush: + if (need_restart_flush) { + if (f2fs_create_flush_cmd_control(sbi)) + f2fs_warn(sbi, "background flush thread has stopped"); + } else if (need_stop_flush) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + } +restore_ckpt: + if (need_restart_ckpt) { + if (f2fs_start_ckpt_thread(sbi)) + f2fs_warn(sbi, "background ckpt thread has stopped"); + } else if (need_stop_ckpt) { + f2fs_stop_ckpt_thread(sbi); + } restore_gc: if (need_restart_gc) { if (f2fs_start_gc_thread(sbi)) From 88f2cfc5fa90326edb569b4a81bb38ed4dcd3108 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 24 Mar 2021 11:24:33 +0800 Subject: [PATCH 0247/1379] f2fs: fix to update last i_size if fallocate partially succeeds In the case of expanding pinned file, map.m_lblk and map.m_len will update in each round of section allocation, so in error path, last i_size will be calculated with wrong m_lblk and m_len, fix it. Fixes: f5a53edcf01e ("f2fs: support aligned pinned file") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bd5a77091d23..dc79694e512c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1619,9 +1619,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; - pgoff_t pg_end; + pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; + block_t expanded = 0; int err; err = inode_newsize_ok(inode, (len + offset)); @@ -1634,11 +1635,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_balance_fs(sbi, true); + pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; off_end = (offset + len) & (PAGE_SIZE - 1); - map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; - map.m_len = pg_end - map.m_lblk; + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; if (off_end) map.m_len++; @@ -1648,7 +1650,6 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (f2fs_is_pinned_file(inode)) { block_t sec_blks = BLKS_PER_SEC(sbi); block_t sec_len = roundup(map.m_len, sec_blks); - block_t done = 0; map.m_len = sec_blks; next_alloc: @@ -1656,10 +1657,8 @@ next_alloc: GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); - if (err && err != -ENODATA && err != -EAGAIN) { - map.m_len = done; + if (err && err != -ENODATA && err != -EAGAIN) goto out_err; - } } down_write(&sbi->pin_sem); @@ -1673,24 +1672,25 @@ next_alloc: up_write(&sbi->pin_sem); - done += map.m_len; + expanded += map.m_len; sec_len -= map.m_len; map.m_lblk += map.m_len; if (!err && sec_len) goto next_alloc; - map.m_len = done; + map.m_len = expanded; } else { err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + expanded = map.m_len; } out_err: if (err) { pgoff_t last_off; - if (!map.m_len) + if (!expanded) return err; - last_off = map.m_lblk + map.m_len - 1; + last_off = pg_start + expanded - 1; /* update new size to the failed position */ new_size = (last_off == pg_end) ? offset + len : From c889136004eb3dc9c7e29f599d068273e5950669 Mon Sep 17 00:00:00 2001 From: ChiYuan Huang Date: Fri, 26 Mar 2021 14:13:07 +0800 Subject: [PATCH 0248/1379] leds: rt4505: Add DT binding document for Richtek RT4505 Add DT binding document for Richtek RT4505 flash LED controller. Signed-off-by: ChiYuan Huang Reviewed-by: Rob Herring Signed-off-by: Pavel Machek --- .../devicetree/bindings/leds/leds-rt4505.yaml | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 Documentation/devicetree/bindings/leds/leds-rt4505.yaml diff --git a/Documentation/devicetree/bindings/leds/leds-rt4505.yaml b/Documentation/devicetree/bindings/leds/leds-rt4505.yaml new file mode 100644 index 000000000000..5b0c74aa6723 --- /dev/null +++ b/Documentation/devicetree/bindings/leds/leds-rt4505.yaml @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/leds/leds-rt4505.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Richtek RT4505 Single Channel LED Driver + +maintainers: + - ChiYuan Huang + +description: | + The RT4505 is a flash LED driver that can support up to 375mA and 1.5A for + torch and flash mode, respectively. + + The data sheet can be found at: + https://www.richtek.com/assets/product_file/RT4505/DS4505-02.pdf + +properties: + compatible: + const: richtek,rt4505 + + reg: + description: I2C slave address of the controller. + maxItems: 1 + + led: + type: object + $ref: common.yaml# + +required: + - compatible + - reg + +additionalProperties: false + +examples: + - | + #include + + i2c0 { + #address-cells = <1>; + #size-cells = <0>; + + led-controller@63 { + compatible = "richtek,rt4505"; + reg = <0x63>; + + rt4505_flash: led { + function = LED_FUNCTION_FLASH; + color = ; + led-max-microamp = <375000>; + flash-max-microamp = <1500000>; + flash-max-timeout-us = <800000>; + }; + }; + }; From d9dfac5419d08e5f0048b53effd5b64de5801882 Mon Sep 17 00:00:00 2001 From: ChiYuan Huang Date: Fri, 26 Mar 2021 14:13:08 +0800 Subject: [PATCH 0249/1379] leds: rt4505: Add support for Richtek RT4505 flash LED controller Add support for RT4505 flash LED controller. It can support up to 1.5A flash current with hardware timeout and low input voltage protection. Signed-off-by: ChiYuan Huang Acked-by: Jacek Anaszewski Signed-off-by: Pavel Machek --- drivers/leds/flash/Kconfig | 11 + drivers/leds/flash/Makefile | 1 + drivers/leds/flash/leds-rt4505.c | 430 +++++++++++++++++++++++++++++++ 3 files changed, 442 insertions(+) create mode 100644 drivers/leds/flash/leds-rt4505.c diff --git a/drivers/leds/flash/Kconfig b/drivers/leds/flash/Kconfig index b580b416b9a4..3f49f3edbffb 100644 --- a/drivers/leds/flash/Kconfig +++ b/drivers/leds/flash/Kconfig @@ -2,6 +2,17 @@ if LEDS_CLASS_FLASH +config LEDS_RT4505 + tristate "LED support for RT4505 flashlight controller" + depends on I2C && OF + depends on V4L2_FLASH_LED_CLASS || !V4L2_FLASH_LED_CLASS + select REGMAP_I2C + help + This option enables support for the RT4505 flash LED controller. + RT4505 includes torch and flash functions with programmable current. + And it's commonly used to compensate the illuminance for the camera + inside the mobile product like as phones or tablets. + config LEDS_RT8515 tristate "LED support for Richtek RT8515 flash/torch LED" depends on GPIOLIB diff --git a/drivers/leds/flash/Makefile b/drivers/leds/flash/Makefile index e990e257f4d7..09aee561f769 100644 --- a/drivers/leds/flash/Makefile +++ b/drivers/leds/flash/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_LEDS_RT4505) += leds-rt4505.o obj-$(CONFIG_LEDS_RT8515) += leds-rt8515.o diff --git a/drivers/leds/flash/leds-rt4505.c b/drivers/leds/flash/leds-rt4505.c new file mode 100644 index 000000000000..ee129ab7255d --- /dev/null +++ b/drivers/leds/flash/leds-rt4505.c @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RT4505_REG_RESET 0x0 +#define RT4505_REG_CONFIG 0x8 +#define RT4505_REG_ILED 0x9 +#define RT4505_REG_ENABLE 0xA +#define RT4505_REG_FLAGS 0xB + +#define RT4505_RESET_MASK BIT(7) +#define RT4505_FLASHTO_MASK GENMASK(2, 0) +#define RT4505_ITORCH_MASK GENMASK(7, 5) +#define RT4505_ITORCH_SHIFT 5 +#define RT4505_IFLASH_MASK GENMASK(4, 0) +#define RT4505_ENABLE_MASK GENMASK(5, 0) +#define RT4505_TORCH_SET (BIT(0) | BIT(4)) +#define RT4505_FLASH_SET (BIT(0) | BIT(1) | BIT(2) | BIT(4)) +#define RT4505_EXT_FLASH_SET (BIT(0) | BIT(1) | BIT(4) | BIT(5)) +#define RT4505_FLASH_GET (BIT(0) | BIT(1) | BIT(4)) +#define RT4505_OVP_MASK BIT(3) +#define RT4505_SHORT_MASK BIT(2) +#define RT4505_OTP_MASK BIT(1) +#define RT4505_TIMEOUT_MASK BIT(0) + +#define RT4505_ITORCH_MINUA 46000 +#define RT4505_ITORCH_MAXUA 375000 +#define RT4505_ITORCH_STPUA 47000 +#define RT4505_IFLASH_MINUA 93750 +#define RT4505_IFLASH_MAXUA 1500000 +#define RT4505_IFLASH_STPUA 93750 +#define RT4505_FLASHTO_MINUS 100000 +#define RT4505_FLASHTO_MAXUS 800000 +#define RT4505_FLASHTO_STPUS 100000 + +struct rt4505_priv { + struct device *dev; + struct regmap *regmap; + struct mutex lock; + struct led_classdev_flash flash; + struct v4l2_flash *v4l2_flash; +}; + +static int rt4505_torch_brightness_set(struct led_classdev *lcdev, + enum led_brightness level) +{ + struct rt4505_priv *priv = + container_of(lcdev, struct rt4505_priv, flash.led_cdev); + u32 val = 0; + int ret; + + mutex_lock(&priv->lock); + + if (level != LED_OFF) { + ret = regmap_update_bits(priv->regmap, + RT4505_REG_ILED, RT4505_ITORCH_MASK, + (level - 1) << RT4505_ITORCH_SHIFT); + if (ret) + goto unlock; + + val = RT4505_TORCH_SET; + } + + ret = regmap_update_bits(priv->regmap, RT4505_REG_ENABLE, + RT4505_ENABLE_MASK, val); + +unlock: + mutex_unlock(&priv->lock); + return ret; +} + +static enum led_brightness rt4505_torch_brightness_get( + struct led_classdev *lcdev) +{ + struct rt4505_priv *priv = + container_of(lcdev, struct rt4505_priv, flash.led_cdev); + u32 val; + int ret; + + mutex_lock(&priv->lock); + + ret = regmap_read(priv->regmap, RT4505_REG_ENABLE, &val); + if (ret) { + dev_err(lcdev->dev, "Failed to get LED enable\n"); + ret = LED_OFF; + goto unlock; + } + + if ((val & RT4505_ENABLE_MASK) != RT4505_TORCH_SET) { + ret = LED_OFF; + goto unlock; + } + + ret = regmap_read(priv->regmap, RT4505_REG_ILED, &val); + if (ret) { + dev_err(lcdev->dev, "Failed to get LED brightness\n"); + ret = LED_OFF; + goto unlock; + } + + ret = ((val & RT4505_ITORCH_MASK) >> RT4505_ITORCH_SHIFT) + 1; + +unlock: + mutex_unlock(&priv->lock); + return ret; +} + +static int rt4505_flash_brightness_set(struct led_classdev_flash *fled_cdev, + u32 brightness) +{ + struct rt4505_priv *priv = + container_of(fled_cdev, struct rt4505_priv, flash); + struct led_flash_setting *s = &fled_cdev->brightness; + u32 val = (brightness - s->min) / s->step; + int ret; + + mutex_lock(&priv->lock); + ret = regmap_update_bits(priv->regmap, RT4505_REG_ILED, + RT4505_IFLASH_MASK, val); + mutex_unlock(&priv->lock); + + return ret; +} + +static int rt4505_flash_strobe_set(struct led_classdev_flash *fled_cdev, + bool state) +{ + struct rt4505_priv *priv = + container_of(fled_cdev, struct rt4505_priv, flash); + u32 val = state ? RT4505_FLASH_SET : 0; + int ret; + + mutex_lock(&priv->lock); + ret = regmap_update_bits(priv->regmap, RT4505_REG_ENABLE, + RT4505_ENABLE_MASK, val); + mutex_unlock(&priv->lock); + + return ret; +} + +static int rt4505_flash_strobe_get(struct led_classdev_flash *fled_cdev, + bool *state) +{ + struct rt4505_priv *priv = + container_of(fled_cdev, struct rt4505_priv, flash); + u32 val; + int ret; + + mutex_lock(&priv->lock); + + ret = regmap_read(priv->regmap, RT4505_REG_ENABLE, &val); + if (ret) + goto unlock; + + *state = (val & RT4505_FLASH_GET) == RT4505_FLASH_GET; + +unlock: + mutex_unlock(&priv->lock); + return ret; +} + +static int rt4505_flash_timeout_set(struct led_classdev_flash *fled_cdev, + u32 timeout) +{ + struct rt4505_priv *priv = + container_of(fled_cdev, struct rt4505_priv, flash); + struct led_flash_setting *s = &fled_cdev->timeout; + u32 val = (timeout - s->min) / s->step; + int ret; + + mutex_lock(&priv->lock); + ret = regmap_update_bits(priv->regmap, RT4505_REG_CONFIG, + RT4505_FLASHTO_MASK, val); + mutex_unlock(&priv->lock); + + return ret; +} + +static int rt4505_fault_get(struct led_classdev_flash *fled_cdev, u32 *fault) +{ + struct rt4505_priv *priv = + container_of(fled_cdev, struct rt4505_priv, flash); + u32 val, led_faults = 0; + int ret; + + ret = regmap_read(priv->regmap, RT4505_REG_FLAGS, &val); + if (ret) + return ret; + + if (val & RT4505_OVP_MASK) + led_faults |= LED_FAULT_OVER_VOLTAGE; + + if (val & RT4505_SHORT_MASK) + led_faults |= LED_FAULT_SHORT_CIRCUIT; + + if (val & RT4505_OTP_MASK) + led_faults |= LED_FAULT_OVER_TEMPERATURE; + + if (val & RT4505_TIMEOUT_MASK) + led_faults |= LED_FAULT_TIMEOUT; + + *fault = led_faults; + return 0; +} + +static const struct led_flash_ops rt4505_flash_ops = { + .flash_brightness_set = rt4505_flash_brightness_set, + .strobe_set = rt4505_flash_strobe_set, + .strobe_get = rt4505_flash_strobe_get, + .timeout_set = rt4505_flash_timeout_set, + .fault_get = rt4505_fault_get, +}; + +static bool rt4505_is_accessible_reg(struct device *dev, unsigned int reg) +{ + if (reg == RT4505_REG_RESET || + (reg >= RT4505_REG_CONFIG && reg <= RT4505_REG_FLAGS)) + return true; + return false; +} + +static const struct regmap_config rt4505_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = RT4505_REG_FLAGS, + + .readable_reg = rt4505_is_accessible_reg, + .writeable_reg = rt4505_is_accessible_reg, +}; + +#if IS_ENABLED(CONFIG_V4L2_FLASH_LED_CLASS) +static int rt4505_flash_external_strobe_set(struct v4l2_flash *v4l2_flash, + bool enable) +{ + struct led_classdev_flash *flash = v4l2_flash->fled_cdev; + struct rt4505_priv *priv = + container_of(flash, struct rt4505_priv, flash); + u32 val = enable ? RT4505_EXT_FLASH_SET : 0; + int ret; + + mutex_lock(&priv->lock); + ret = regmap_update_bits(priv->regmap, RT4505_REG_ENABLE, + RT4505_ENABLE_MASK, val); + mutex_unlock(&priv->lock); + + return ret; +} + +static const struct v4l2_flash_ops v4l2_flash_ops = { + .external_strobe_set = rt4505_flash_external_strobe_set, +}; + +static void rt4505_init_v4l2_config(struct rt4505_priv *priv, + struct v4l2_flash_config *config) +{ + struct led_classdev_flash *flash = &priv->flash; + struct led_classdev *lcdev = &flash->led_cdev; + struct led_flash_setting *s; + + strscpy(config->dev_name, lcdev->dev->kobj.name, + sizeof(config->dev_name)); + + s = &config->intensity; + s->min = RT4505_ITORCH_MINUA; + s->step = RT4505_ITORCH_STPUA; + s->max = s->val = s->min + (lcdev->max_brightness - 1) * s->step; + + config->flash_faults = LED_FAULT_OVER_VOLTAGE | + LED_FAULT_SHORT_CIRCUIT | + LED_FAULT_LED_OVER_TEMPERATURE | + LED_FAULT_TIMEOUT; + config->has_external_strobe = 1; +} +#else +static const struct v4l2_flash_ops v4l2_flash_ops; +static void rt4505_init_v4l2_config(struct rt4505_priv *priv, + struct v4l2_flash_config *config) +{ +} +#endif + +static void rt4505_init_flash_properties(struct rt4505_priv *priv, + struct fwnode_handle *child) +{ + struct led_classdev_flash *flash = &priv->flash; + struct led_classdev *lcdev = &flash->led_cdev; + struct led_flash_setting *s; + u32 val; + int ret; + + ret = fwnode_property_read_u32(child, "led-max-microamp", &val); + if (ret) { + dev_warn(priv->dev, "led-max-microamp DT property missing\n"); + val = RT4505_ITORCH_MINUA; + } else + val = clamp_val(val, RT4505_ITORCH_MINUA, RT4505_ITORCH_MAXUA); + + lcdev->max_brightness = + (val - RT4505_ITORCH_MINUA) / RT4505_ITORCH_STPUA + 1; + lcdev->brightness_set_blocking = rt4505_torch_brightness_set; + lcdev->brightness_get = rt4505_torch_brightness_get; + lcdev->flags |= LED_DEV_CAP_FLASH; + + ret = fwnode_property_read_u32(child, "flash-max-microamp", &val); + if (ret) { + dev_warn(priv->dev, "flash-max-microamp DT property missing\n"); + val = RT4505_IFLASH_MINUA; + } else + val = clamp_val(val, RT4505_IFLASH_MINUA, RT4505_IFLASH_MAXUA); + + s = &flash->brightness; + s->min = RT4505_IFLASH_MINUA; + s->step = RT4505_IFLASH_STPUA; + s->max = s->val = val; + + ret = fwnode_property_read_u32(child, "flash-max-timeout-us", &val); + if (ret) { + dev_warn(priv->dev, + "flash-max-timeout-us DT property missing\n"); + val = RT4505_FLASHTO_MINUS; + } else + val = clamp_val(val, RT4505_FLASHTO_MINUS, + RT4505_FLASHTO_MAXUS); + + s = &flash->timeout; + s->min = RT4505_FLASHTO_MINUS; + s->step = RT4505_FLASHTO_STPUS; + s->max = s->val = val; + + flash->ops = &rt4505_flash_ops; +} + +static int rt4505_probe(struct i2c_client *client) +{ + struct rt4505_priv *priv; + struct fwnode_handle *child; + struct led_init_data init_data = {}; + struct v4l2_flash_config v4l2_config = {}; + int ret; + + priv = devm_kzalloc(&client->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = &client->dev; + mutex_init(&priv->lock); + + priv->regmap = devm_regmap_init_i2c(client, &rt4505_regmap_config); + if (IS_ERR(priv->regmap)) { + dev_err(priv->dev, "Failed to allocate register map\n"); + return PTR_ERR(priv->regmap); + } + + ret = regmap_write(priv->regmap, RT4505_REG_RESET, RT4505_RESET_MASK); + if (ret) { + dev_err(priv->dev, "Failed to reset registers\n"); + return ret; + } + + child = fwnode_get_next_available_child_node(client->dev.fwnode, NULL); + if (!child) { + dev_err(priv->dev, "Failed to get child node\n"); + return -EINVAL; + } + init_data.fwnode = child; + + rt4505_init_flash_properties(priv, child); + ret = devm_led_classdev_flash_register_ext(priv->dev, &priv->flash, + &init_data); + if (ret) { + dev_err(priv->dev, "Failed to register flash\n"); + return ret; + } + + rt4505_init_v4l2_config(priv, &v4l2_config); + priv->v4l2_flash = v4l2_flash_init(priv->dev, init_data.fwnode, + &priv->flash, &v4l2_flash_ops, + &v4l2_config); + if (IS_ERR(priv->v4l2_flash)) { + dev_err(priv->dev, "Failed to register v4l2 flash\n"); + return PTR_ERR(priv->v4l2_flash); + } + + i2c_set_clientdata(client, priv); + return 0; +} + +static int rt4505_remove(struct i2c_client *client) +{ + struct rt4505_priv *priv = i2c_get_clientdata(client); + + v4l2_flash_release(priv->v4l2_flash); + return 0; +} + +static void rt4505_shutdown(struct i2c_client *client) +{ + struct rt4505_priv *priv = i2c_get_clientdata(client); + + /* Reset registers to make sure all off before shutdown */ + regmap_write(priv->regmap, RT4505_REG_RESET, RT4505_RESET_MASK); +} + +static const struct of_device_id __maybe_unused rt4505_leds_match[] = { + { .compatible = "richtek,rt4505", }, + {} +}; +MODULE_DEVICE_TABLE(of, rt4505_leds_match); + +static struct i2c_driver rt4505_driver = { + .driver = { + .name = "rt4505", + .of_match_table = of_match_ptr(rt4505_leds_match), + }, + .probe_new = rt4505_probe, + .remove = rt4505_remove, + .shutdown = rt4505_shutdown, +}; +module_i2c_driver(rt4505_driver); + +MODULE_AUTHOR("ChiYuan Huang "); +MODULE_LICENSE("GPL v2"); From 3fd19d4b565dafd690a262fa95d25927bc797e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Wed, 3 Mar 2021 21:34:43 +0100 Subject: [PATCH 0250/1379] docs: driver-api: gpio: consumer: Mark another line of code as such MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make it so that this #include line is rendered in monospace, like other code blocks. Signed-off-by: Jonathan Neuschäfer Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- Documentation/driver-api/gpio/consumer.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst index 22271c342d92..3366a991b4aa 100644 --- a/Documentation/driver-api/gpio/consumer.rst +++ b/Documentation/driver-api/gpio/consumer.rst @@ -12,7 +12,7 @@ Guidelines for GPIOs consumers Drivers that can't work without standard GPIO calls should have Kconfig entries that depend on GPIOLIB or select GPIOLIB. The functions that allow a driver to -obtain and use GPIOs are available by including the following file: +obtain and use GPIOs are available by including the following file:: #include From 67196fea0fcef92b25608882f62f3985bc59f1fe Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 9 Mar 2021 11:37:31 +0200 Subject: [PATCH 0251/1379] irqdomain: Introduce irq_domain_create_simple() API Linus Walleij pointed out that ird_domain_add_simple() gained additional functionality and can't be anymore replaced with a simple conditional. In preparation to upgrade GPIO library to use fwnode, introduce irq_domain_create_simple() API which is functional equivalent to the existing irq_domain_add_simple(), but takes a pointer to the struct fwnode_handle as a parameter. While at it, amend documentation to mention irq_domain_create_*() functions where it makes sense. Signed-off-by: Andy Shevchenko Acked-by: Marc Zyngier Signed-off-by: Bartosz Golaszewski --- Documentation/core-api/irq/irq-domain.rst | 22 ++++++++++++---------- include/linux/irqdomain.h | 19 ++++++++++++++----- kernel/irq/irqdomain.c | 20 ++++++++++---------- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst index a77c24c27f7b..8214e215a8bf 100644 --- a/Documentation/core-api/irq/irq-domain.rst +++ b/Documentation/core-api/irq/irq-domain.rst @@ -42,10 +42,10 @@ irq_domain usage ================ An interrupt controller driver creates and registers an irq_domain by -calling one of the irq_domain_add_*() functions (each mapping method -has a different allocator function, more on that later). The function -will return a pointer to the irq_domain on success. The caller must -provide the allocator function with an irq_domain_ops structure. +calling one of the irq_domain_add_*() or irq_domain_create_*() functions +(each mapping method has a different allocator function, more on that later). +The function will return a pointer to the irq_domain on success. The caller +must provide the allocator function with an irq_domain_ops structure. In most cases, the irq_domain will begin empty without any mappings between hwirq and IRQ numbers. Mappings are added to the irq_domain @@ -147,6 +147,7 @@ Legacy irq_domain_add_simple() irq_domain_add_legacy() irq_domain_add_legacy_isa() + irq_domain_create_simple() irq_domain_create_legacy() The Legacy mapping is a special case for drivers that already have a @@ -169,13 +170,13 @@ supported. For example, ISA controllers would use the legacy map for mapping Linux IRQs 0-15 so that existing ISA drivers get the correct IRQ numbers. -Most users of legacy mappings should use irq_domain_add_simple() which -will use a legacy domain only if an IRQ range is supplied by the -system and will otherwise use a linear domain mapping. The semantics -of this call are such that if an IRQ range is specified then +Most users of legacy mappings should use irq_domain_add_simple() or +irq_domain_create_simple() which will use a legacy domain only if an IRQ range +is supplied by the system and will otherwise use a linear domain mapping. +The semantics of this call are such that if an IRQ range is specified then descriptors will be allocated on-the-fly for it, and if no range is -specified it will fall through to irq_domain_add_linear() which means -*no* irq descriptors will be allocated. +specified it will fall through to irq_domain_add_linear() or +irq_domain_create_linear() which means *no* irq descriptors will be allocated. A typical use case for simple domains is where an irqchip provider is supporting both dynamic and static IRQ assignments. @@ -186,6 +187,7 @@ that the driver using the simple domain call irq_create_mapping() before any irq_find_mapping() since the latter will actually work for the static IRQ assignment case. +irq_domain_add_simple() and irq_domain_create_simple() as well as irq_domain_add_legacy() and irq_domain_create_legacy() are functionally equivalent, except for the first argument is different - the former accepts an Open Firmware specific 'struct device_node', while the latter diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 33cacc8af26d..1ad8d5328715 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -256,11 +256,11 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data); -struct irq_domain *irq_domain_add_simple(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - const struct irq_domain_ops *ops, - void *host_data); +struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, + unsigned int size, + unsigned int first_irq, + const struct irq_domain_ops *ops, + void *host_data); struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, unsigned int size, unsigned int first_irq, @@ -325,6 +325,15 @@ static inline struct irq_domain *irq_find_host(struct device_node *node) return d; } +static inline struct irq_domain *irq_domain_add_simple(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + const struct irq_domain_ops *ops, + void *host_data) +{ + return irq_domain_create_simple(of_node_to_fwnode(of_node), size, first_irq, ops, host_data); +} + /** * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain. * @of_node: pointer to interrupt controller's device tree node. diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d10ab1d689d5..2681dc43813c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -295,8 +295,8 @@ void irq_domain_update_bus_token(struct irq_domain *domain, EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); /** - * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs - * @of_node: pointer to interrupt controller's device tree node. + * irq_domain_create_simple() - Register an irq_domain and optionally map a range of irqs + * @fwnode: firmware node for the interrupt controller * @size: total number of irqs in mapping * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then @@ -312,15 +312,15 @@ EXPORT_SYMBOL_GPL(irq_domain_update_bus_token); * irqs get mapped dynamically on the fly. However, if the controller requires * static virq assignments (non-DT boot) then it will set that up correctly. */ -struct irq_domain *irq_domain_add_simple(struct device_node *of_node, - unsigned int size, - unsigned int first_irq, - const struct irq_domain_ops *ops, - void *host_data) +struct irq_domain *irq_domain_create_simple(struct fwnode_handle *fwnode, + unsigned int size, + unsigned int first_irq, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain *domain; - domain = __irq_domain_add(of_node_to_fwnode(of_node), size, size, 0, ops, host_data); + domain = __irq_domain_add(fwnode, size, size, 0, ops, host_data); if (!domain) return NULL; @@ -328,7 +328,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { /* attempt to allocated irq_descs */ int rc = irq_alloc_descs(first_irq, first_irq, size, - of_node_to_nid(of_node)); + of_node_to_nid(to_of_node(fwnode))); if (rc < 0) pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", first_irq); @@ -338,7 +338,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node, return domain; } -EXPORT_SYMBOL_GPL(irq_domain_add_simple); +EXPORT_SYMBOL_GPL(irq_domain_create_simple); /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. From 944f4b0af9ca0d203ebc0d1426218af372d2d995 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 9 Mar 2021 11:37:32 +0200 Subject: [PATCH 0252/1379] gpiolib: Unify the checks on fwnode type We have (historically) different approaches how we identify the type of a given fwnode. Let's standardize them across the library code. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 6367646dce83..14d79c166124 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -3684,11 +3684,12 @@ EXPORT_SYMBOL_GPL(fwnode_gpiod_get_index); */ int gpiod_count(struct device *dev, const char *con_id) { + const struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL; int count = -ENOENT; - if (IS_ENABLED(CONFIG_OF) && dev && dev->of_node) + if (is_of_node(fwnode)) count = of_gpio_get_count(dev, con_id); - else if (IS_ENABLED(CONFIG_ACPI) && dev && ACPI_HANDLE(dev)) + else if (is_acpi_node(fwnode)) count = acpi_gpio_count(dev, con_id); if (count < 0) @@ -3826,18 +3827,17 @@ struct gpio_desc *__must_check gpiod_get_index(struct device *dev, int ret; /* Maybe we have a device name, maybe not */ const char *devname = dev ? dev_name(dev) : "?"; + const struct fwnode_handle *fwnode = dev ? dev_fwnode(dev) : NULL; dev_dbg(dev, "GPIO lookup for consumer %s\n", con_id); - if (dev) { - /* Using device tree? */ - if (IS_ENABLED(CONFIG_OF) && dev->of_node) { - dev_dbg(dev, "using device tree for GPIO lookup\n"); - desc = of_find_gpio(dev, con_id, idx, &lookupflags); - } else if (ACPI_COMPANION(dev)) { - dev_dbg(dev, "using ACPI for GPIO lookup\n"); - desc = acpi_find_gpio(dev, con_id, idx, &flags, &lookupflags); - } + /* Using device tree? */ + if (is_of_node(fwnode)) { + dev_dbg(dev, "using device tree for GPIO lookup\n"); + desc = of_find_gpio(dev, con_id, idx, &lookupflags); + } else if (is_acpi_node(fwnode)) { + dev_dbg(dev, "using ACPI for GPIO lookup\n"); + desc = acpi_find_gpio(dev, con_id, idx, &flags, &lookupflags); } /* @@ -3921,9 +3921,6 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, struct gpio_desc *desc = ERR_PTR(-ENODEV); int ret; - if (!fwnode) - return ERR_PTR(-EINVAL); - if (is_of_node(fwnode)) { desc = gpiod_get_from_of_node(to_of_node(fwnode), propname, index, @@ -3939,7 +3936,8 @@ struct gpio_desc *fwnode_get_named_gpiod(struct fwnode_handle *fwnode, acpi_gpio_update_gpiod_flags(&dflags, &info); acpi_gpio_update_gpiod_lookup_flags(&lflags, &info); - } + } else + return ERR_PTR(-EINVAL); /* Currently only ACPI takes this path */ ret = gpiod_request(desc, label); From 1df62542e0161e828615d7ec233e68c18902b0dc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 9 Mar 2021 11:37:33 +0200 Subject: [PATCH 0253/1379] gpiolib: Move of_node operations to gpiolib-of and correct fwnode use The initial value of the OF node based on presence of parent, but at the same time this operation somehow appeared separately from others that handle the OF case. On the other hand there is no need to assign dev->fwnode in the OF case if code properly retrieves fwnode, i.e. via dev_fwnode() helper. Amend gpiolib.c and gpiolib-of.c code in order to group OF operations. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-of.c | 6 ++++-- drivers/gpio/gpiolib.c | 9 ++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index baf0153b7bca..bbcc7c073f63 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -1042,11 +1042,13 @@ void of_gpiochip_remove(struct gpio_chip *chip) void of_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev) { + /* Set default OF node to parent's one if present */ + if (gc->parent) + gdev->dev.of_node = gc->parent->of_node; + /* If the gpiochip has an assigned OF node this takes precedence */ if (gc->of_node) gdev->dev.of_node = gc->of_node; else gc->of_node = gdev->dev.of_node; - if (gdev->dev.of_node) - gdev->dev.fwnode = of_fwnode_handle(gdev->dev.of_node); } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 14d79c166124..90ead10bc086 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -586,12 +586,9 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, if (!gdev) return -ENOMEM; gdev->dev.bus = &gpio_bus_type; + gdev->dev.parent = gc->parent; gdev->chip = gc; gc->gpiodev = gdev; - if (gc->parent) { - gdev->dev.parent = gc->parent; - gdev->dev.of_node = gc->parent->of_node; - } of_gpio_dev_init(gc, gdev); @@ -4218,11 +4215,13 @@ EXPORT_SYMBOL_GPL(gpiod_put_array); static int gpio_bus_match(struct device *dev, struct device_driver *drv) { + struct fwnode_handle *fwnode = dev_fwnode(dev); + /* * Only match if the fwnode doesn't already have a proper struct device * created for it. */ - if (dev->fwnode && dev->fwnode->dev != dev) + if (fwnode && fwnode->dev != dev) return 0; return 1; } From 515321acb56e1360bce4c9d60c498ec126a669dc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 9 Mar 2021 11:37:34 +0200 Subject: [PATCH 0254/1379] gpiolib: Introduce acpi_gpio_dev_init() and call it from core In the ACPI case we may use the firmware node in the similar way as it's done for OF case. We may use that fwnode for other purposes in the future. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib-acpi.c | 7 +++++++ drivers/gpio/gpiolib-acpi.h | 4 ++++ drivers/gpio/gpiolib.c | 1 + 3 files changed, 12 insertions(+) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 1aacd2a5a1fd..21750be9c489 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -1291,6 +1291,13 @@ void acpi_gpiochip_remove(struct gpio_chip *chip) kfree(acpi_gpio); } +void acpi_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev) +{ + /* Set default fwnode to parent's one if present */ + if (gc->parent) + ACPI_COMPANION_SET(&gdev->dev, ACPI_COMPANION(gc->parent)); +} + static int acpi_gpio_package_count(const union acpi_object *obj) { const union acpi_object *element = obj->package.elements; diff --git a/drivers/gpio/gpiolib-acpi.h b/drivers/gpio/gpiolib-acpi.h index e2edb632b2cc..e476558d9471 100644 --- a/drivers/gpio/gpiolib-acpi.h +++ b/drivers/gpio/gpiolib-acpi.h @@ -36,6 +36,8 @@ struct acpi_gpio_info { void acpi_gpiochip_add(struct gpio_chip *chip); void acpi_gpiochip_remove(struct gpio_chip *chip); +void acpi_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev); + void acpi_gpiochip_request_interrupts(struct gpio_chip *chip); void acpi_gpiochip_free_interrupts(struct gpio_chip *chip); @@ -58,6 +60,8 @@ int acpi_gpio_count(struct device *dev, const char *con_id); static inline void acpi_gpiochip_add(struct gpio_chip *chip) { } static inline void acpi_gpiochip_remove(struct gpio_chip *chip) { } +static inline void acpi_gpio_dev_init(struct gpio_chip *gc, struct gpio_device *gdev) { } + static inline void acpi_gpiochip_request_interrupts(struct gpio_chip *chip) { } diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 90ead10bc086..84b0a83f87ed 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -591,6 +591,7 @@ int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, gc->gpiodev = gdev; of_gpio_dev_init(gc, gdev); + acpi_gpio_dev_init(gc, gdev); /* * Assign fwnode depending on the result of the previous calls, From 5c63a9dbab55be3c0df512c9d8efe80a44cd2ce8 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 9 Mar 2021 11:37:35 +0200 Subject: [PATCH 0255/1379] gpiolib: Reuse device's fwnode to create IRQ domain When IRQ domain is created for an ACPI case, the name of it becomes unknown-%d since for now it utilizes of_node member only and doesn't consider fwnode case. Convert IRQ domain creation code to utilize fwnode instead. Before/After the change on Intel Galileo Gen 2 with two GPIO (IRQ) controllers: unknown-1 ==> \_SB.PCI0.GIP0.GPO unknown-2 ==> \_SB.NIO3 Due to the nature of this change we may also deduplicate the WARN():s because in either case (DT or ACPI) the fwnode will be set correctly and %pfw is an equivalent to what the current code prints as a prefix. Signed-off-by: Andy Shevchenko Reviewed-by: Rafael J. Wysocki Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 84b0a83f87ed..00b097e138b6 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1463,9 +1463,9 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, struct lock_class_key *lock_key, struct lock_class_key *request_key) { + struct fwnode_handle *fwnode = dev_fwnode(&gc->gpiodev->dev); struct irq_chip *irqchip = gc->irq.chip; const struct irq_domain_ops *ops = NULL; - struct device_node *np; unsigned int type; unsigned int i; @@ -1477,7 +1477,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, return -EINVAL; } - np = gc->gpiodev->dev.of_node; type = gc->irq.default_type; /* @@ -1485,16 +1484,10 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, * used to configure the interrupts, as you may end up with * conflicting triggers. Tell the user, and reset to NONE. */ - if (WARN(np && type != IRQ_TYPE_NONE, - "%s: Ignoring %u default trigger\n", np->full_name, type)) + if (WARN(fwnode && type != IRQ_TYPE_NONE, + "%pfw: Ignoring %u default trigger\n", fwnode, type)) type = IRQ_TYPE_NONE; - if (has_acpi_companion(gc->parent) && type != IRQ_TYPE_NONE) { - acpi_handle_warn(ACPI_HANDLE(gc->parent), - "Ignoring %u default trigger\n", type); - type = IRQ_TYPE_NONE; - } - if (gc->to_irq) chip_warn(gc, "to_irq is redefined in %s and you shouldn't rely on it\n", __func__); @@ -1515,7 +1508,7 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, if (!ops) ops = &gpiochip_domain_ops; - gc->irq.domain = irq_domain_add_simple(np, + gc->irq.domain = irq_domain_create_simple(fwnode, gc->ngpio, gc->irq.first, ops, gc); From 266315fb7cbed86f628f3fb4bb89a90943b66718 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 9 Mar 2021 11:37:36 +0200 Subject: [PATCH 0256/1379] gpiolib: Fold conditionals into a simple ternary operator It's quite spread code to initialize IRQ domain options. Let's fold it into a simple oneliner. Signed-off-by: Andy Shevchenko Reviewed-by: Rafael J. Wysocki Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 00b097e138b6..1427c1be749b 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1465,7 +1465,6 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, { struct fwnode_handle *fwnode = dev_fwnode(&gc->gpiodev->dev); struct irq_chip *irqchip = gc->irq.chip; - const struct irq_domain_ops *ops = NULL; unsigned int type; unsigned int i; @@ -1503,15 +1502,11 @@ static int gpiochip_add_irqchip(struct gpio_chip *gc, return ret; } else { /* Some drivers provide custom irqdomain ops */ - if (gc->irq.domain_ops) - ops = gc->irq.domain_ops; - - if (!ops) - ops = &gpiochip_domain_ops; gc->irq.domain = irq_domain_create_simple(fwnode, gc->ngpio, gc->irq.first, - ops, gc); + gc->irq.domain_ops ?: &gpiochip_domain_ops, + gc); if (!gc->irq.domain) return -EINVAL; } From e5391a02bce2422fa57a520dfaaf2693ec7f9546 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 15 Mar 2021 20:51:41 +0200 Subject: [PATCH 0257/1379] gpio: mockup: Drop duplicate NULL check in gpio_mockup_unregister_pdevs() Since platform_device_unregister() is NULL-aware, we don't need to duplicate this check. Remove it and fold the rest of the code. Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mockup.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/gpio/gpio-mockup.c b/drivers/gpio/gpio-mockup.c index 28b757d34046..d7e73876a3b9 100644 --- a/drivers/gpio/gpio-mockup.c +++ b/drivers/gpio/gpio-mockup.c @@ -479,15 +479,10 @@ static struct platform_device *gpio_mockup_pdevs[GPIO_MOCKUP_MAX_GC]; static void gpio_mockup_unregister_pdevs(void) { - struct platform_device *pdev; int i; - for (i = 0; i < GPIO_MOCKUP_MAX_GC; i++) { - pdev = gpio_mockup_pdevs[i]; - - if (pdev) - platform_device_unregister(pdev); - } + for (i = 0; i < GPIO_MOCKUP_MAX_GC; i++) + platform_device_unregister(gpio_mockup_pdevs[i]); } static __init char **gpio_mockup_make_line_names(const char *label, From 81dd500b1c8612a42979ee3ea788d2d9f19aa9f9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 15 Mar 2021 18:59:40 +0200 Subject: [PATCH 0258/1379] gpio: mockup: Adjust documentation to the code First of all one of the parameter missed 'mockup' in its name, Second, the semantics of the integer pairs depends on the sign of the base (the first value in the pair). Update documentation to reflect the real code behaviour. Fixes: 2fd1abe99e5f ("Documentation: gpio: add documentation for gpio-mockup") Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- Documentation/admin-guide/gpio/gpio-mockup.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/gpio/gpio-mockup.rst b/Documentation/admin-guide/gpio/gpio-mockup.rst index 9fa1618b3adc..493071da1738 100644 --- a/Documentation/admin-guide/gpio/gpio-mockup.rst +++ b/Documentation/admin-guide/gpio/gpio-mockup.rst @@ -17,17 +17,18 @@ module. gpio_mockup_ranges This parameter takes an argument in the form of an array of integer - pairs. Each pair defines the base GPIO number (if any) and the number - of lines exposed by the chip. If the base GPIO is -1, the gpiolib - will assign it automatically. + pairs. Each pair defines the base GPIO number (non-negative integer) + and the first number after the last of this chip. If the base GPIO + is -1, the gpiolib will assign it automatically. while the following + parameter is the number of lines exposed by the chip. - Example: gpio_mockup_ranges=-1,8,-1,16,405,4 + Example: gpio_mockup_ranges=-1,8,-1,16,405,409 The line above creates three chips. The first one will expose 8 lines, the second 16 and the third 4. The base GPIO for the third chip is set to 405 while for two first chips it will be assigned automatically. - gpio_named_lines + gpio_mockup_named_lines This parameter doesn't take any arguments. It lets the driver know that GPIO lines exposed by it should be named. From 4a5c9da4ec29bc2a4fff00c2c36ce38826123d68 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 26 Mar 2021 09:15:35 +0100 Subject: [PATCH 0259/1379] gpio: Mention GPIO MUX in docs There is now a GPIO multiplexer, so mention this in the document about drivers using GPIO as backend. Signed-off-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- Documentation/driver-api/gpio/drivers-on-gpio.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/driver-api/gpio/drivers-on-gpio.rst b/Documentation/driver-api/gpio/drivers-on-gpio.rst index 41ec3cc72d32..af632d764ac6 100644 --- a/Documentation/driver-api/gpio/drivers-on-gpio.rst +++ b/Documentation/driver-api/gpio/drivers-on-gpio.rst @@ -96,6 +96,12 @@ hardware descriptions such as device tree or ACPI: way to pass the charging parameters from hardware descriptions such as the device tree. +- gpio-mux: drivers/mux/gpio.c is used for controlling a multiplexer using + n GPIO lines such that you can mux in 2^n different devices by activating + different GPIO lines. Often the GPIOs are on a SoC and the devices are + some SoC-external entities, such as different components on a PCB that + can be selectively enabled. + Apart from this there are special GPIO drivers in subsystems like MMC/SD to read card detect and write protect GPIO lines, and in the TTY serial subsystem to emulate MCTRL (modem control) signals CTS/RTS by using two GPIO lines. The From b0922c0732c10eabab7ef15c420b0ae6cf540564 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Mar 2021 10:55:36 -0700 Subject: [PATCH 0260/1379] tools: gpio-utils: fix various kernel-doc warnings Fix several problems in kernel-doc notation in gpio-utils.c. gpio-utils.c:37: warning: Incorrect use of kernel-doc format: * gpiotools_request_line() - request gpio lines in a gpiochip gpio-utils.c:61: warning: expecting prototype for doc(). Prototype was for gpiotools_request_line() instead gpio-utils.c:265: warning: Excess function parameter 'value' description in 'gpiotools_sets' gpio-utils.c:1: warning: 'gpiotools_request_lines' not found Signed-off-by: Randy Dunlap Cc: Bartosz Golaszewski Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- tools/gpio/gpio-utils.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/gpio/gpio-utils.c b/tools/gpio/gpio-utils.c index 1639b4d832cd..4096bcd511d1 100644 --- a/tools/gpio/gpio-utils.c +++ b/tools/gpio/gpio-utils.c @@ -20,7 +20,7 @@ #define CONSUMER "gpio-utils" /** - * doc: Operation of gpio + * DOC: Operation of gpio * * Provide the api of gpiochip for chardev interface. There are two * types of api. The first one provide as same function as each @@ -100,7 +100,7 @@ exit_free_name: } /** - * gpiotools_set_values(): Set the value of gpio(s) + * gpiotools_set_values() - Set the value of gpio(s) * @fd: The fd returned by * gpiotools_request_line(). * @values: The array of values want to set. @@ -124,7 +124,7 @@ int gpiotools_set_values(const int fd, struct gpio_v2_line_values *values) } /** - * gpiotools_get_values(): Get the value of gpio(s) + * gpiotools_get_values() - Get the value of gpio(s) * @fd: The fd returned by * gpiotools_request_line(). * @values: The array of values get from hardware. @@ -148,7 +148,7 @@ int gpiotools_get_values(const int fd, struct gpio_v2_line_values *values) } /** - * gpiotools_release_line(): Release the line(s) of gpiochip + * gpiotools_release_line() - Release the line(s) of gpiochip * @fd: The fd returned by * gpiotools_request_line(). * @@ -169,7 +169,7 @@ int gpiotools_release_line(const int fd) } /** - * gpiotools_get(): Get value from specific line + * gpiotools_get() - Get value from specific line * @device_name: The name of gpiochip without prefix "/dev/", * such as "gpiochip0" * @line: number of line, such as 2. @@ -191,7 +191,7 @@ int gpiotools_get(const char *device_name, unsigned int line) /** - * gpiotools_gets(): Get values from specific lines. + * gpiotools_gets() - Get values from specific lines. * @device_name: The name of gpiochip without prefix "/dev/", * such as "gpiochip0". * @lines: An array desired lines, specified by offset @@ -230,7 +230,7 @@ int gpiotools_gets(const char *device_name, unsigned int *lines, } /** - * gpiotools_set(): Set value to specific line + * gpiotools_set() - Set value to specific line * @device_name: The name of gpiochip without prefix "/dev/", * such as "gpiochip0" * @line: number of line, such as 2. @@ -248,13 +248,13 @@ int gpiotools_set(const char *device_name, unsigned int line, } /** - * gpiotools_sets(): Set values to specific lines. + * gpiotools_sets() - Set values to specific lines. * @device_name: The name of gpiochip without prefix "/dev/", * such as "gpiochip0". * @lines: An array desired lines, specified by offset * index for the associated GPIO device. * @num_lines: The number of lines to request. - * @value: The array of values set to gpiochip, must be + * @values: The array of values set to gpiochip, must be * 0(low) or 1(high). * * Return: On success return 0; From 2d93018fe67d42c44d65a898da2a6a5a0209b9ee Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 23 Mar 2021 15:19:05 -0700 Subject: [PATCH 0261/1379] gpiolib: some edits of kernel docs for clarity Fix a few typos and some punctuation. Also, change CONFIG_OF to CONFIG_OF_GPIO in one comment. Signed-off-by: Randy Dunlap Cc: Linus Walleij Cc: Bartosz Golaszewski Cc: linux-gpio@vger.kernel.org Reviewed-by: Linus Walleij [Bartosz: tweaked the commit message] Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/driver.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 286de0520574..63283de9daeb 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -227,7 +227,7 @@ struct gpio_irq_chip { /** * @valid_mask: * - * If not %NULL holds bitmask of GPIOs which are valid to be included + * If not %NULL, holds bitmask of GPIOs which are valid to be included * in IRQ domain of the chip. */ unsigned long *valid_mask; @@ -346,7 +346,7 @@ struct gpio_irq_chip { * output. * * A gpio_chip can help platforms abstract various sources of GPIOs so - * they can all be accessed through a common programing interface. + * they can all be accessed through a common programming interface. * Example sources would be SOC controllers, FPGAs, multifunction * chips, dedicated GPIO expanders, and so on. * @@ -435,15 +435,15 @@ struct gpio_chip { /** * @valid_mask: * - * If not %NULL holds bitmask of GPIOs which are valid to be used + * If not %NULL, holds bitmask of GPIOs which are valid to be used * from the chip. */ unsigned long *valid_mask; #if defined(CONFIG_OF_GPIO) /* - * If CONFIG_OF is enabled, then all GPIO controllers described in the - * device tree automatically may have an OF translation + * If CONFIG_OF_GPIO is enabled, then all GPIO controllers described in + * the device tree automatically may have an OF translation */ /** @@ -508,7 +508,7 @@ extern int gpiochip_add_data_with_key(struct gpio_chip *gc, void *data, * for GPIOs will fail rudely. * * gpiochip_add_data() must only be called after gpiolib initialization, - * ie after core_initcall(). + * i.e. after core_initcall(). * * If gc->base is negative, this requests dynamic assignment of * a range of valid GPIOs. From 6613bc2301ba291a1c5a90e1dc24cf3edf223c03 Mon Sep 17 00:00:00 2001 From: Shradha Todi Date: Wed, 24 Mar 2021 15:46:09 +0530 Subject: [PATCH 0262/1379] PCI: endpoint: Fix NULL pointer dereference for ->get_features() get_features ops of pci_epc_ops may return NULL, causing NULL pointer dereference in pci_epf_test_alloc_space function. Let us add a check for pci_epc_feature pointer in pci_epf_test_bind before we access it to avoid any such NULL pointer dereference and return -ENOTSUPP in case pci_epc_feature is not found. When the patch is not applied and EPC features is not implemented in the platform driver, we see the following dump due to kernel NULL pointer dereference. Call trace: pci_epf_test_bind+0xf4/0x388 pci_epf_bind+0x3c/0x80 pci_epc_epf_link+0xa8/0xcc configfs_symlink+0x1a4/0x48c vfs_symlink+0x104/0x184 do_symlinkat+0x80/0xd4 __arm64_sys_symlinkat+0x1c/0x24 el0_svc_common.constprop.3+0xb8/0x170 el0_svc_handler+0x70/0x88 el0_svc+0x8/0x640 Code: d2800581 b9403ab9 f9404ebb 8b394f60 (f9400400) ---[ end trace a438e3c5a24f9df0 ]--- Link: https://lore.kernel.org/r/20210324101609.79278-1-shradha.t@samsung.com Fixes: 2c04c5b8eef79 ("PCI: pci-epf-test: Use pci_epc_get_features() to get EPC features") Signed-off-by: Sriram Dash Signed-off-by: Shradha Todi Signed-off-by: Lorenzo Pieralisi Reviewed-by: Pankaj Dubey Reviewed-by: Kishon Vijay Abraham I --- drivers/pci/endpoint/functions/pci-epf-test.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index c0ac4e9cbe72..bc35b3566be6 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -833,15 +833,18 @@ static int pci_epf_test_bind(struct pci_epf *epf) return -EINVAL; epc_features = pci_epc_get_features(epc, epf->func_no); - if (epc_features) { - linkup_notifier = epc_features->linkup_notifier; - core_init_notifier = epc_features->core_init_notifier; - test_reg_bar = pci_epc_get_first_free_bar(epc_features); - if (test_reg_bar < 0) - return -EINVAL; - pci_epf_configure_bar(epf, epc_features); + if (!epc_features) { + dev_err(&epf->dev, "epc_features not implemented\n"); + return -EOPNOTSUPP; } + linkup_notifier = epc_features->linkup_notifier; + core_init_notifier = epc_features->core_init_notifier; + test_reg_bar = pci_epc_get_first_free_bar(epc_features); + if (test_reg_bar < 0) + return -EINVAL; + pci_epf_configure_bar(epf, epc_features); + epf_test->test_reg_bar = test_reg_bar; epf_test->epc_features = epc_features; From 61461fc921b756ae16e64243f72af2bfc2e620db Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 24 Mar 2021 11:18:28 +0800 Subject: [PATCH 0263/1379] f2fs: fix to avoid touching checkpointed data in get_victim() In CP disabling mode, there are two issues when using LFS or SSR | AT_SSR mode to select victim: 1. LFS is set to find source section during GC, the victim should have no checkpointed data, since after GC, section could not be set free for reuse. Previously, we only check valid chpt blocks in current segment rather than section, fix it. 2. SSR | AT_SSR are set to find target segment for writes which can be fully filled by checkpointed and newly written blocks, we should never select such segment, otherwise it can cause panic or data corruption during allocation, potential case is described as below: a) target segment has 'n' (n < 512) ckpt valid blocks b) GC migrates 'n' valid blocks to other segment (segment is still in dirty list) c) GC migrates '512 - n' blocks to target segment (segment has 'n' cp_vblocks and '512 - n' vblocks) d) If GC selects target segment via {AT,}SSR allocator, however there is no free space in targe segment. Fixes: 4354994f097d ("f2fs: checkpoint disabling") Fixes: 093749e296e2 ("f2fs: support age threshold based garbage collection") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/gc.c | 28 ++++++++++++++++++++-------- fs/f2fs/segment.c | 36 +++++++++++++++++++++--------------- fs/f2fs/segment.h | 14 +++++++++++++- 4 files changed, 55 insertions(+), 24 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index eb154d9cb063..fe380bcf8d4d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3387,6 +3387,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d96acc6531f2..a2ca483f9855 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -392,10 +392,6 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, if (p->gc_mode == GC_AT && get_valid_blocks(sbi, segno, true) == 0) return; - - if (p->alloc_mode == AT_SSR && - get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0) - return; } for (i = 0; i < sbi->segs_per_sec; i++) @@ -728,11 +724,27 @@ retry: if (sec_usage_check(sbi, secno)) goto next; + /* Don't touch checkpointed data */ - if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && - get_ckpt_valid_blocks(sbi, segno) && - p.alloc_mode == LFS)) - goto next; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p.alloc_mode == LFS) { + /* + * LFS is set to find source section during GC. + * The victim should have no checkpointed data. + */ + if (get_ckpt_valid_blocks(sbi, segno, true)) + goto next; + } else { + /* + * SSR | AT_SSR are set to find target segment + * for writes which can be full by checkpointed + * and newly written blocks. + */ + if (!f2fs_segment_has_free_slot(sbi, segno)) + goto next; + } + } + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b6f19518afd1..33cb8aa5ec8f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -865,7 +865,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); - ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); + ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || ckpt_valid_blocks == usable_blocks)) { @@ -950,7 +950,7 @@ static unsigned int get_free_segment(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; - if (get_ckpt_valid_blocks(sbi, segno)) + if (get_ckpt_valid_blocks(sbi, segno, false)) continue; mutex_unlock(&dirty_i->seglist_lock); return segno; @@ -2642,6 +2642,23 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, seg->next_blkoff++; } +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) +{ + struct seg_entry *se = get_seg_entry(sbi, segno); + int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); + unsigned long *target_map = SIT_I(sbi)->tmp_map; + unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *cur_map = (unsigned long *)se->cur_valid_map; + int i, pos; + + for (i = 0; i < entries; i++) + target_map[i] = ckpt_map[i] | cur_map[i]; + + pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, 0); + + return pos < sbi->blocks_per_seg; +} + /* * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks @@ -2912,19 +2929,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, get_valid_blocks(sbi, curseg->segno, new_sec)) goto alloc; - if (new_sec) { - unsigned int segno = START_SEGNO(curseg->segno); - int i; - - for (i = 0; i < sbi->segs_per_sec; i++, segno++) { - if (get_ckpt_valid_blocks(sbi, segno)) - goto alloc; - } - } else { - if (!get_ckpt_valid_blocks(sbi, curseg->segno)) - return; - } - + if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) + return; alloc: old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 144980b62f9e..dab87ecba2b5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -359,8 +359,20 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, } static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool use_section) { + if (use_section && __is_large_section(sbi)) { + unsigned int start_segno = START_SEGNO(segno); + unsigned int blocks = 0; + int i; + + for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); + + blocks += se->ckpt_valid_blocks; + } + return blocks; + } return get_seg_entry(sbi, segno)->ckpt_valid_blocks; } From d6d2b491a82e1e411a6766fbfb87c697d8701554 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 16 Mar 2021 14:59:18 +0530 Subject: [PATCH 0264/1379] f2fs: allow to change discard policy based on cached discard cmds With the default DPOLICY_BG discard thread is ioaware, which prevents the discard thread from issuing the discard commands. On low RAM setups, it is observed that these discard commands in the cache are consuming high memory. This patch aims to relax the memory pressure on the system due to f2fs pending discard cmds by changing the policy to DPOLICY_FORCE based on the nm_i->ram_thresh configured. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 8 ++++++++ fs/f2fs/node.h | 1 + fs/f2fs/segment.c | 3 ++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 45c8cf1afe66..3eb724bb6594 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -43,11 +43,15 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct sysinfo val; unsigned long avail_ram; unsigned long mem_size = 0; bool res = false; + if (!nm_i) + return true; + si_meminfo(&val); /* only uses low memory */ @@ -89,6 +93,10 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) /* it allows 20% / total_ram for inmemory pages */ mem_size = get_pages(sbi, F2FS_INMEM_PAGES); res = mem_size < (val.totalram / 5); + } else if (type == DISCARD_CACHE) { + mem_size = (atomic_read(&dcc->discard_cmd_cnt) * + sizeof(struct discard_cmd)) >> PAGE_SHIFT; + res = mem_size < (avail_ram * nm_i->ram_thresh / 100); } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index f84541b57acb..7a45c0f10629 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,6 +147,7 @@ enum mem_type { INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ INMEM_PAGES, /* indicates inmemory pages */ + DISCARD_CACHE, /* indicates memory of cached discard cmds */ BASE_CHECK, /* check kernel status */ }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 33cb8aa5ec8f..ad48f1f16387 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1762,7 +1762,8 @@ static int issue_discard_thread(void *data) if (!atomic_read(&dcc->discard_cmd_cnt)) continue; - if (sbi->gc_mode == GC_URGENT_HIGH) + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); sb_start_intwrite(sbi->sb); From 2c718feead3533647a061501122457a16a355736 Mon Sep 17 00:00:00 2001 From: Ruiqi Gong Date: Thu, 25 Mar 2021 02:38:11 -0400 Subject: [PATCH 0265/1379] f2fs: fix a typo in inode.c Do a trivial typo fix. s/runing/running Reported-by: Hulk Robot Signed-off-by: Ruiqi Gong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 349d9cb933ee..5d2253d53f17 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -698,7 +698,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) /* * We need to balance fs here to prevent from producing dirty node pages - * during the urgent cleaning time when runing out of free sections. + * during the urgent cleaning time when running out of free sections. */ f2fs_update_inode_page(inode); if (wbc && wbc->nr_to_write) From e8bf1f522aee3b3e1e7658e8f224dca1d88c3338 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 26 Mar 2021 22:41:43 +0800 Subject: [PATCH 0266/1379] f2fs: delete empty compress.h Commit 75e91c888989 ("f2fs: compress: fix compression chksum") wrongly introduced empty compress.h, delete it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 fs/f2fs/compress.h diff --git a/fs/f2fs/compress.h b/fs/f2fs/compress.h deleted file mode 100644 index e69de29bb2d1..000000000000 From 753a8ed0ae9c196a7d09a17aae1e354cabd1233d Mon Sep 17 00:00:00 2001 From: Wang Xiaojun Date: Thu, 25 Mar 2021 10:19:20 -0400 Subject: [PATCH 0267/1379] f2fs: fix wrong alloc_type in f2fs_do_replace_block If the alloc_type of the original curseg is LFS, when we change_curseg and then do recover curseg, the alloc_type becomes SSR. Signed-off-by: Wang Xiaojun Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ad48f1f16387..c19114be554c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3572,6 +3572,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct seg_entry *se; int type; unsigned short old_blkoff; + unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); @@ -3605,6 +3606,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; + old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { @@ -3639,6 +3641,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, change_curseg(sbi, type, true); } curseg->next_blkoff = old_blkoff; + curseg->alloc_type = old_alloc_type; } up_write(&sit_i->sentry_lock); From 823d13e12b6cbaef2f6e5d63c648643e7bc094dd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 26 Mar 2021 09:46:22 +0800 Subject: [PATCH 0268/1379] f2fs: fix to cover __allocate_new_section() with curseg_lock In order to avoid race with f2fs_do_replace_block(). Fixes: f5a53edcf01e ("f2fs: support aligned pinned file") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c19114be554c..24ad45f5e335 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2945,19 +2945,23 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi, int type) void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type) { + down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); __allocate_new_section(sbi, type); up_write(&SIT_I(sbi)->sentry_lock); + up_read(&SM_I(sbi)->curseg_lock); } void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segment(sbi, i, false); up_write(&SIT_I(sbi)->sentry_lock); + up_read(&SM_I(sbi)->curseg_lock); } static const struct segment_allocation default_salloc_ops = { From 73e7f1732e800a88cafab31d75548c6fcfdd8c47 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Sat, 27 Mar 2021 21:38:56 -0700 Subject: [PATCH 0269/1379] Input: imx_keypad - convert to a DT-only driver i.MX has been converted to a DT-only platform, so make the adjustments to the driver to convert it to a DT-only driver. Signed-off-by: Fabio Estevam Link: https://lore.kernel.org/r/20210327194307.541248-1-festevam@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/imx_keypad.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/input/keyboard/imx_keypad.c b/drivers/input/keyboard/imx_keypad.c index 1f5c9ea5e9e5..ae9303848571 100644 --- a/drivers/input/keyboard/imx_keypad.c +++ b/drivers/input/keyboard/imx_keypad.c @@ -408,27 +408,18 @@ open_err: return -EIO; } -#ifdef CONFIG_OF static const struct of_device_id imx_keypad_of_match[] = { { .compatible = "fsl,imx21-kpp", }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, imx_keypad_of_match); -#endif static int imx_keypad_probe(struct platform_device *pdev) { - const struct matrix_keymap_data *keymap_data = - dev_get_platdata(&pdev->dev); struct imx_keypad *keypad; struct input_dev *input_dev; int irq, error, i, row, col; - if (!keymap_data && !pdev->dev.of_node) { - dev_err(&pdev->dev, "no keymap defined\n"); - return -EINVAL; - } - irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; @@ -469,7 +460,7 @@ static int imx_keypad_probe(struct platform_device *pdev) input_dev->open = imx_keypad_open; input_dev->close = imx_keypad_close; - error = matrix_keypad_build_keymap(keymap_data, NULL, + error = matrix_keypad_build_keymap(NULL, NULL, MAX_MATRIX_KEY_ROWS, MAX_MATRIX_KEY_COLS, keypad->keycodes, input_dev); @@ -582,7 +573,7 @@ static struct platform_driver imx_keypad_driver = { .driver = { .name = "imx-keypad", .pm = &imx_kbd_pm_ops, - .of_match_table = of_match_ptr(imx_keypad_of_match), + .of_match_table = imx_keypad_of_match, }, .probe = imx_keypad_probe, }; From 31ec9c2746467a372b009940ce7b722055daaf6a Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Thu, 5 Nov 2020 04:58:33 +0800 Subject: [PATCH 0270/1379] PCI: mediatek: Configure FC and FTS for functions other than 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "PCI_FUNC(port->slot << 3)" is always 0, so previously mtk_pcie_startup_port() only configured FC credits and FTs for function 0. Compute "func" correctly so we also configure functions other than 0. This affects MT2701 and MT7623. Link: https://lore.kernel.org/r/c529dbfc066f4bda9b87edbdbf771f207e69b84e.1604510053.git.ryder.lee@mediatek.com Signed-off-by: Ryder Lee Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński --- drivers/pci/controller/pcie-mediatek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index 23548b517e4b..8af04848e0f9 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -760,7 +760,7 @@ static struct pci_ops mtk_pcie_ops = { static int mtk_pcie_startup_port(struct mtk_pcie_port *port) { struct mtk_pcie *pcie = port->pcie; - u32 func = PCI_FUNC(port->slot << 3); + u32 func = PCI_FUNC(port->slot); u32 slot = PCI_SLOT(port->slot << 3); u32 val; int err; From f35bb4b8d10a8ce356f0fff634fa885926f8d439 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 15 Mar 2021 16:34:59 +0530 Subject: [PATCH 0271/1379] RISC-V: Don't print SBI version for all detected extensions The sbi_init() already prints SBI version before detecting various SBI extensions so we don't need to print SBI version for all detected SBI extensions. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sbi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index f4a7db3d309e..c0dcebdd30ec 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -577,19 +577,19 @@ void __init sbi_init(void) sbi_get_firmware_id(), sbi_get_firmware_version()); if (sbi_probe_extension(SBI_EXT_TIME) > 0) { __sbi_set_timer = __sbi_set_timer_v02; - pr_info("SBI v0.2 TIME extension detected\n"); + pr_info("SBI TIME extension detected\n"); } else { __sbi_set_timer = __sbi_set_timer_v01; } if (sbi_probe_extension(SBI_EXT_IPI) > 0) { __sbi_send_ipi = __sbi_send_ipi_v02; - pr_info("SBI v0.2 IPI extension detected\n"); + pr_info("SBI IPI extension detected\n"); } else { __sbi_send_ipi = __sbi_send_ipi_v01; } if (sbi_probe_extension(SBI_EXT_RFENCE) > 0) { __sbi_rfence = __sbi_rfence_v02; - pr_info("SBI v0.2 RFENCE extension detected\n"); + pr_info("SBI RFENCE extension detected\n"); } else { __sbi_rfence = __sbi_rfence_v01; } From 2da073c19641bb6820c3591d3d865120263f14e8 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sat, 13 Mar 2021 03:45:05 -0500 Subject: [PATCH 0272/1379] riscv: Cleanup KASAN_VMALLOC support When KASAN vmalloc region is populated, there is no userspace process and the page table in use is swapper_pg_dir, so there is no need to read SATP. Then we can use the same scheme used by kasan_populate_p*d functions to go through the page table, which harmonizes the code. In addition, make use of set_pgd that goes through all unused page table levels, contrary to p*d_populate functions, which makes this function work whatever the number of page table levels. Signed-off-by: Alexandre Ghiti Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 57 ++++++++++++-------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 3fc18f469efb..2c39f0386673 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -11,18 +11,6 @@ #include #include -static __init void *early_alloc(size_t size, int node) -{ - void *ptr = memblock_alloc_try_nid(size, size, - __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, node); - - if (!ptr) - panic("%pS: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n", - __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS)); - - return ptr; -} - extern pgd_t early_pg_dir[PTRS_PER_PGD]; asmlinkage void __init kasan_early_init(void) { @@ -155,38 +143,27 @@ static void __init kasan_populate(void *start, void *end) memset(start, KASAN_SHADOW_INIT, end - start); } -void __init kasan_shallow_populate(void *start, void *end) +static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) +{ + unsigned long next; + void *p; + pgd_t *pgd_k = pgd_offset_k(vaddr); + + do { + next = pgd_addr_end(vaddr, end); + if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { + p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); + } + } while (pgd_k++, vaddr = next, vaddr != end); +} + +static void __init kasan_shallow_populate(void *start, void *end) { unsigned long vaddr = (unsigned long)start & PAGE_MASK; unsigned long vend = PAGE_ALIGN((unsigned long)end); - unsigned long pfn; - int index; - void *p; - pud_t *pud_dir, *pud_k; - pgd_t *pgd_dir, *pgd_k; - p4d_t *p4d_dir, *p4d_k; - while (vaddr < vend) { - index = pgd_index(vaddr); - pfn = csr_read(CSR_SATP) & SATP_PPN; - pgd_dir = (pgd_t *)pfn_to_virt(pfn) + index; - pgd_k = init_mm.pgd + index; - pgd_dir = pgd_offset_k(vaddr); - set_pgd(pgd_dir, *pgd_k); - - p4d_dir = p4d_offset(pgd_dir, vaddr); - p4d_k = p4d_offset(pgd_k, vaddr); - - vaddr = (vaddr + PUD_SIZE) & PUD_MASK; - pud_dir = pud_offset(p4d_dir, vaddr); - pud_k = pud_offset(p4d_k, vaddr); - - if (pud_present(*pud_dir)) { - p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); - pud_populate(&init_mm, pud_dir, p); - } - vaddr += PAGE_SIZE; - } + kasan_shallow_populate_pgd(vaddr, vend); } void __init kasan_init(void) From 7d0bc44bd0ea163a251d4aa778d1b6fcf6174d22 Mon Sep 17 00:00:00 2001 From: Carlos de Paula Date: Tue, 16 Mar 2021 13:02:43 -0300 Subject: [PATCH 0273/1379] kbuild: buildtar: add riscv support Make 'make tar-pkg' and 'tarbz2-pkg' work on riscv. Signed-off-by: Carlos de Paula Signed-off-by: Palmer Dabbelt --- scripts/package/buildtar | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/package/buildtar b/scripts/package/buildtar index 936198a90477..221aa7df008d 100755 --- a/scripts/package/buildtar +++ b/scripts/package/buildtar @@ -125,6 +125,14 @@ case "${ARCH}" in fi done ;; + riscv) + for i in Image.bz2 Image.gz Image; do + if [ -f "${objtree}/arch/riscv/boot/${i}" ] ; then + cp -v -- "${objtree}/arch/riscv/boot/${i}" "${tmpdir}/boot/vmlinux-${KERNELRELEASE}" + break + fi + done + ;; *) [ -f "${KBUILD_IMAGE}" ] && cp -v -- "${KBUILD_IMAGE}" "${tmpdir}/boot/vmlinux-kbuild-${KERNELRELEASE}" echo "" >&2 From 52ab55dfe32357d5889eef2969a03dd662aa2b7d Mon Sep 17 00:00:00 2001 From: Dongdong Liu Date: Tue, 30 Mar 2021 21:43:19 +0800 Subject: [PATCH 0274/1379] dt-bindings: PCI: hisi: Delete the obsolete HiSilicon PCIe file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hisilicon-pcie.txt file is no longer useful since commit c2fa6cf76d20 (PCI: dwc: hisi: Remove non-ECAM HiSilicon hip05/hip06 driver), so delete it and remove related code in MAINTAINERS file. Suggested-by: Zhou Wang Link: https://lore.kernel.org/r/1617111799-109749-1-git-send-email-liudongdong3@huawei.com Signed-off-by: Dongdong Liu Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński --- .../bindings/pci/hisilicon-pcie.txt | 43 ------------------- MAINTAINERS | 1 - 2 files changed, 44 deletions(-) delete mode 100644 Documentation/devicetree/bindings/pci/hisilicon-pcie.txt diff --git a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt deleted file mode 100644 index d6796ef54ea1..000000000000 --- a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt +++ /dev/null @@ -1,43 +0,0 @@ -HiSilicon Hip05 and Hip06 PCIe host bridge DT description - -HiSilicon PCIe host controller is based on the Synopsys DesignWare PCI core. -It shares common functions with the PCIe DesignWare core driver and inherits -common properties defined in -Documentation/devicetree/bindings/pci/designware-pcie.txt. - -Additional properties are described here: - -Required properties -- compatible: Should contain "hisilicon,hip05-pcie" or "hisilicon,hip06-pcie". -- reg: Should contain rc_dbi, config registers location and length. -- reg-names: Must include the following entries: - "rc_dbi": controller configuration registers; - "config": PCIe configuration space registers. -- msi-parent: Should be its_pcie which is an ITS receiving MSI interrupts. -- port-id: Should be 0, 1, 2 or 3. - -Optional properties: -- status: Either "ok" or "disabled". -- dma-coherent: Present if DMA operations are coherent. - -Hip05 Example (note that Hip06 is the same except compatible): - pcie@b0080000 { - compatible = "hisilicon,hip05-pcie", "snps,dw-pcie"; - reg = <0 0xb0080000 0 0x10000>, <0x220 0x00000000 0 0x2000>; - reg-names = "rc_dbi", "config"; - bus-range = <0 15>; - msi-parent = <&its_pcie>; - #address-cells = <3>; - #size-cells = <2>; - device_type = "pci"; - dma-coherent; - ranges = <0x82000000 0 0x00000000 0x220 0x00000000 0 0x10000000>; - num-lanes = <8>; - port-id = <1>; - #interrupt-cells = <1>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = <0x0 0 0 1 &mbigen_pcie 1 10 - 0x0 0 0 2 &mbigen_pcie 2 11 - 0x0 0 0 3 &mbigen_pcie 3 12 - 0x0 0 0 4 &mbigen_pcie 4 13>; - }; diff --git a/MAINTAINERS b/MAINTAINERS index d92f85ca831d..8a154939ae27 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13899,7 +13899,6 @@ PCIE DRIVER FOR HISILICON M: Zhou Wang L: linux-pci@vger.kernel.org S: Maintained -F: Documentation/devicetree/bindings/pci/hisilicon-pcie.txt F: drivers/pci/controller/dwc/pcie-hisi.c PCIE DRIVER FOR HISILICON KIRIN From dbc334fb411f2e87ca0e812dc7ba13464aa89504 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Sun, 7 Feb 2021 17:08:38 +0800 Subject: [PATCH 0275/1379] platform/chrome: wilco_ec: convert stream-like files from nonseekable_open -> stream_open Eliminate the following coccicheck warning: ./drivers/platform/chrome/wilco_ec/telemetry.c:259:1-17: WARNING: telem_fops: .read() and .write() have stream semantic; safe to change nonseekable_open -> stream_open. Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/1612688918-63132-1-git-send-email-yang.lee@linux.alibaba.com --- drivers/platform/chrome/wilco_ec/telemetry.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/chrome/wilco_ec/telemetry.c b/drivers/platform/chrome/wilco_ec/telemetry.c index e06d96fb9426..60da7a29f2ff 100644 --- a/drivers/platform/chrome/wilco_ec/telemetry.c +++ b/drivers/platform/chrome/wilco_ec/telemetry.c @@ -256,7 +256,7 @@ static int telem_open(struct inode *inode, struct file *filp) sess_data->dev_data = dev_data; sess_data->has_msg = false; - nonseekable_open(inode, filp); + stream_open(inode, filp); filp->private_data = sess_data; return 0; From 639ff208cb37c5a3f0198e62d04962b677d25c9c Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Thu, 18 Mar 2021 18:51:03 -0700 Subject: [PATCH 0276/1379] platform/chrome: cros_ec_typec: Check for device within remove function In a couple of call sites, we use the same pattern of checking for a partner or cable device before attempting to remove it. Simplify this by moving those checks into the remove functions. Cc: Benson Leung Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210319015103.3751672-1-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 0811562deecc..8e7fde3e7032 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -220,6 +220,9 @@ static void cros_typec_remove_partner(struct cros_typec_data *typec, { struct cros_typec_port *port = typec->ports[port_num]; + if (!port->partner) + return; + cros_typec_unregister_altmodes(typec, port_num, true); cros_typec_usb_disconnect_state(port); @@ -235,6 +238,9 @@ static void cros_typec_remove_cable(struct cros_typec_data *typec, { struct cros_typec_port *port = typec->ports[port_num]; + if (!port->cable) + return; + cros_typec_unregister_altmodes(typec, port_num, false); typec_unregister_plug(port->plug); @@ -253,11 +259,8 @@ static void cros_unregister_ports(struct cros_typec_data *typec) if (!typec->ports[i]) continue; - if (typec->ports[i]->partner) - cros_typec_remove_partner(typec, i); - - if (typec->ports[i]->cable) - cros_typec_remove_cable(typec, i); + cros_typec_remove_partner(typec, i); + cros_typec_remove_cable(typec, i); usb_role_switch_put(typec->ports[i]->role_sw); typec_switch_put(typec->ports[i]->ori_sw); @@ -647,11 +650,8 @@ static void cros_typec_set_port_params_v1(struct cros_typec_data *typec, "Failed to register partner on port: %d\n", port_num); } else { - if (typec->ports[port_num]->partner) - cros_typec_remove_partner(typec, port_num); - - if (typec->ports[port_num]->cable) - cros_typec_remove_cable(typec, port_num); + cros_typec_remove_partner(typec, port_num); + cros_typec_remove_cable(typec, port_num); } } From c6e939c63c80c26460b25cf1150ebe8396e8adcf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Mar 2021 12:55:55 +0100 Subject: [PATCH 0277/1379] platform/chrome: cros_ec_typec: fix clang -Wformat warning Clang warns about using the %h format modifier to truncate an integer: drivers/platform/chrome/cros_ec_typec.c:1031:3: error: format specifies type 'unsigned char' but the argument has type 'unsigned int' [-Werror,-Wformat] typec->pd_ctrl_ver); ^~~~~~~~~~~~~~~~~~ include/linux/dev_printk.h:131:47: note: expanded from macro 'dev_dbg' dev_printk(KERN_DEBUG, dev, dev_fmt(fmt), ##__VA_ARGS__); \ ~~~ ^~~~~~~~~~~ Use an explicit bit mask to limit the number to its lower eight bits instead. Fixes: ad7c0510c99e ("platform/chrome: cros_ec_typec: Update port info from EC") Signed-off-by: Arnd Bergmann Reviewed-by: Guenter Roeck Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210322115602.4003221-1-arnd@kernel.org --- drivers/platform/chrome/cros_ec_typec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 8e7fde3e7032..d3df1717a5fd 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -1027,8 +1027,8 @@ static int cros_typec_get_cmd_version(struct cros_typec_data *typec) else typec->pd_ctrl_ver = 0; - dev_dbg(typec->dev, "PD Control has version mask 0x%hhx\n", - typec->pd_ctrl_ver); + dev_dbg(typec->dev, "PD Control has version mask 0x%02x\n", + typec->pd_ctrl_ver & 0xff); return 0; } From a7bf66ff3d945db9dd2079265ce9cc5c374cecc3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 17 Dec 2020 21:59:03 +0100 Subject: [PATCH 0278/1379] PCI/VPD: Remove obsolete Broadcom NIC quirk quirk_brcm_570x_limit_vpd() was added in 2008 [0] when we didn't have the logic to determine VPD size based on checking for the VPD end tag. Now that we do have this logic [1] and don't read beyond the end tag, this quirk can be removed. [0] 99cb233d60cb ("PCI: Limit VPD read/write lengths for Broadcom 5706, 5708, 5709 rev.") [1] 104daa71b396 ("PCI: Determine actual VPD size on first access") Link: https://lore.kernel.org/r/daa6acdf-5027-62c8-e3fb-125411b018f5@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 7915d10f9aa1..ef5165eb3b62 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -578,52 +578,6 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, PCI_CLASS_BRIDGE_PCI, 8, quirk_blacklist_vpd); -/* - * For Broadcom 5706, 5708, 5709 rev. A nics, any read beyond the - * VPD end tag will hang the device. This problem was initially - * observed when a vpd entry was created in sysfs - * ('/sys/bus/pci/devices//vpd'). A read to this sysfs entry - * will dump 32k of data. Reading a full 32k will cause an access - * beyond the VPD end tag causing the device to hang. Once the device - * is hung, the bnx2 driver will not be able to reset the device. - * We believe that it is legal to read beyond the end tag and - * therefore the solution is to limit the read/write length. - */ -static void quirk_brcm_570x_limit_vpd(struct pci_dev *dev) -{ - /* - * Only disable the VPD capability for 5706, 5706S, 5708, - * 5708S and 5709 rev. A - */ - if ((dev->device == PCI_DEVICE_ID_NX2_5706) || - (dev->device == PCI_DEVICE_ID_NX2_5706S) || - (dev->device == PCI_DEVICE_ID_NX2_5708) || - (dev->device == PCI_DEVICE_ID_NX2_5708S) || - ((dev->device == PCI_DEVICE_ID_NX2_5709) && - (dev->revision & 0xf0) == 0x0)) { - if (dev->vpd) - dev->vpd->len = 0x80; - } -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM, - PCI_DEVICE_ID_NX2_5706, - quirk_brcm_570x_limit_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM, - PCI_DEVICE_ID_NX2_5706S, - quirk_brcm_570x_limit_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM, - PCI_DEVICE_ID_NX2_5708, - quirk_brcm_570x_limit_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM, - PCI_DEVICE_ID_NX2_5708S, - quirk_brcm_570x_limit_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM, - PCI_DEVICE_ID_NX2_5709, - quirk_brcm_570x_limit_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_BROADCOM, - PCI_DEVICE_ID_NX2_5709S, - quirk_brcm_570x_limit_vpd); - static void quirk_chelsio_extend_vpd(struct pci_dev *dev) { int chip = (dev->device & 0xf000) >> 12; From 26c0cf2a603d4ebf00d8c1baf714763fca31dba4 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 3 Feb 2021 09:48:03 +0100 Subject: [PATCH 0279/1379] PCI/VPD: Remove sysfs accessor size checking dead code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 104daa71b396 ("PCI: Determine actual VPD size on first access") attribute size is set to 0 (unlimited). Remove the dead code that checks for "bin_attr->size > 0". Link: https://lore.kernel.org/r/267eae86-f8a6-6792-a7f8-2c4fd51beedc@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- drivers/pci/vpd.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index ef5165eb3b62..ab81c7a5185f 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -403,13 +403,6 @@ static ssize_t read_vpd_attr(struct file *filp, struct kobject *kobj, { struct pci_dev *dev = to_pci_dev(kobj_to_dev(kobj)); - if (bin_attr->size > 0) { - if (off > bin_attr->size) - count = 0; - else if (count > bin_attr->size - off) - count = bin_attr->size - off; - } - return pci_read_vpd(dev, off, count, buf); } @@ -419,13 +412,6 @@ static ssize_t write_vpd_attr(struct file *filp, struct kobject *kobj, { struct pci_dev *dev = to_pci_dev(kobj_to_dev(kobj)); - if (bin_attr->size > 0) { - if (off > bin_attr->size) - count = 0; - else if (count > bin_attr->size - off) - count = bin_attr->size - off; - } - return pci_write_vpd(dev, off, count, buf); } From 0dae52282a5eede01339480f7273a545ca4447c2 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Fri, 8 Jan 2021 23:46:50 +0100 Subject: [PATCH 0280/1379] 9p: Constify static struct v9fs_attr_group The only usage of v9fs_attr_group is to pass its address to sysfs_create_group() and sysfs_create_group(), both which takes pointers to const struct attribute_group. Make it const to allow the compiler to put it in read-only memory. Link: http://lkml.kernel.org/r/20210108224650.25872-1-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Signed-off-by: Dominique Martinet --- fs/9p/v9fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 39def020a074..cdb99507ef33 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -583,7 +583,7 @@ static struct attribute *v9fs_attrs[] = { NULL, }; -static struct attribute_group v9fs_attr_group = { +static const struct attribute_group v9fs_attr_group = { .attrs = v9fs_attrs, }; From f8b139e2f24112f4e21f1eb02c7fc7600fea4b8d Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 30 Mar 2021 21:06:32 +0800 Subject: [PATCH 0281/1379] fs: 9p: fix v9fs_file_open writeback fid error check IS_ERR() and PTR_ERR() use wrong pointer, it should be writeback_fid, fix it. Link: http://lkml.kernel.org/r/20210330130632.1054357-1-yangyingliang@huawei.com Fixes: 5bfe97d7382b ("9p: Fix writeback fid incorrectly being attached to dentry") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang [Dominique: adjusted commit summary] Signed-off-by: Dominique Martinet --- fs/9p/vfs_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 649f04f112dc..59c32c9b799f 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -86,8 +86,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) * to work. */ writeback_fid = v9fs_writeback_fid(file_dentry(file)); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); + if (IS_ERR(writeback_fid)) { + err = PTR_ERR(writeback_fid); mutex_unlock(&v9inode->v_mutex); goto out_error; } From 5911d2d1d1a38b26585383478bd71d9254e48bdf Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 27 Mar 2021 17:57:06 +0800 Subject: [PATCH 0282/1379] f2fs: introduce gc_merge mount option In this patch, we will add two new mount options: "gc_merge" and "nogc_merge", when background_gc is on, "gc_merge" option can be set to let background GC thread to handle foreground GC requests, it can eliminate the sluggish issue caused by slow foreground GC operation when GC is triggered from a process with limited I/O and CPU resources. Original idea is from Xiang. Signed-off-by: Gao Xiang Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 6 ++++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/gc.c | 26 ++++++++++++++++++++++---- fs/f2fs/gc.h | 6 ++++++ fs/f2fs/segment.c | 15 +++++++++++++-- fs/f2fs/super.c | 19 +++++++++++++++++-- 6 files changed, 65 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 35ed01a5fbc9..63c0c49b726d 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -110,6 +110,12 @@ background_gc=%s Turn on/off cleaning operations, namely garbage on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. +gc_merge When background_gc is on, this option can be enabled to + let background GC thread to handle foreground GC requests, + it can eliminate the sluggish issue caused by slow foreground + GC operation when GC is triggered from a process with limited + I/O and CPU resources. +nogc_merge Disable GC merge feature. disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fe380bcf8d4d..87d734f5589d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 +#define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a2ca483f9855..5c48825fd12d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -31,19 +31,24 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; set_freezable(); do { - bool sync_mode; + bool sync_mode, foreground = false; wait_event_interruptible_timeout(*wq, kthread_should_stop() || freezing(current) || + waitqueue_active(fggc_wq) || gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) + foreground = true; + /* give it a try one time */ if (gc_th->gc_wake) gc_th->gc_wake = 0; @@ -90,7 +95,10 @@ static int gc_thread_func(void *data) goto do_gc; } - if (!down_write_trylock(&sbi->gc_lock)) { + if (foreground) { + down_write(&sbi->gc_lock); + goto do_gc; + } else if (!down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } @@ -107,14 +115,22 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - stat_inc_bggc_count(sbi->stat_info); + if (!foreground) + stat_inc_bggc_count(sbi->stat_info); sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; + /* foreground GC was been triggered via f2fs_balance_fs() */ + if (foreground) + sync_mode = false; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, true, false, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; + if (foreground) + wake_up_all(&gc_th->fggc_wq); + trace_f2fs_background_gc(sbi->sb, wait_ms, prefree_segments(sbi), free_segments(sbi)); @@ -148,6 +164,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + init_waitqueue_head(&sbi->gc_thread->fggc_wq); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { @@ -165,6 +182,7 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); + wake_up_all(&gc_th->fggc_wq); kfree(gc_th); sbi->gc_thread = NULL; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 0c8dae12dc51..3fe145e8e594 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -42,6 +42,12 @@ struct f2fs_gc_kthread { /* for changing gc mode */ unsigned int gc_wake; + + /* for GC_MERGE mount option */ + wait_queue_head_t fggc_wq; /* + * caller of f2fs_balance_fs() + * will wait on this wait queue. + */ }; struct gc_inode_list { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 24ad45f5e335..31ccea1378fa 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -503,8 +503,19 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * dir/node pages without enough free segments. */ if (has_not_enough_free_secs(sbi, 0, 0)) { - down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, false, NULL_SEGNO); + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + down_write(&sbi->gc_lock); + f2fs_gc(sbi, false, false, false, NULL_SEGNO); + } } } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b48281642e98..954b1fe97d67 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -151,6 +151,8 @@ enum { Opt_compress_chksum, Opt_compress_mode, Opt_atgc, + Opt_gc_merge, + Opt_nogc_merge, Opt_err, }; @@ -223,6 +225,8 @@ static match_table_t f2fs_tokens = { {Opt_compress_chksum, "compress_chksum"}, {Opt_compress_mode, "compress_mode=%s"}, {Opt_atgc, "atgc"}, + {Opt_gc_merge, "gc_merge"}, + {Opt_nogc_merge, "nogc_merge"}, {Opt_err, NULL}, }; @@ -1073,6 +1077,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_atgc: set_opt(sbi, ATGC); break; + case Opt_gc_merge: + set_opt(sbi, GC_MERGE); + break; + case Opt_nogc_merge: + clear_opt(sbi, GC_MERGE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1675,6 +1685,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); + if (test_opt(sbi, GC_MERGE)) + seq_puts(seq, ",gc_merge"); + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, NORECOVERY)) @@ -2038,7 +2051,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * option. Also sync the filesystem. */ if ((*flags & SB_RDONLY) || - F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) { + (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && + !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { f2fs_stop_gc_thread(sbi); need_restart_gc = true; @@ -4012,7 +4026,8 @@ reset_checkpoint: * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) { + if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF || + test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) From 23738e74472f9c5f3a05a68724a2ccfba97d283d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 31 Mar 2021 11:16:32 +0800 Subject: [PATCH 0283/1379] f2fs: fix to restrict mount condition on readonly block device When we mount an unclean f2fs image in a readonly block device, let's make mount() succeed only when there is no recoverable data in that image, otherwise after mount(), file fsyned won't be recovered as user expected. Fixes: 938a184265d7 ("f2fs: give a warning only for readonly partition") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 954b1fe97d67..14239e2b7ae7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3966,10 +3966,18 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) - f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - else - f2fs_info(sbi, "write access unavailable, skipping recovery"); + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + err = f2fs_recover_fsync_data(sbi, true); + if (err > 0) { + err = -EROFS; + f2fs_err(sbi, "Need to recover fsync data, but " + "write access unavailable, please try " + "mount w/ disable_roll_forward or norecovery"); + } + if (err < 0) + goto free_meta; + } + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } From d4707d79fae08c8996a1ba45965a491045a22dda Mon Sep 17 00:00:00 2001 From: Dejin Zheng Date: Sun, 28 Mar 2021 22:41:18 +0800 Subject: [PATCH 0284/1379] PCI: xgene: Fix cfg resource mapping In commit e2dcd20b1645 a change was made to use devm_platform_ioremap_resource_byname() to simplify code and remove the res variable; this was wrong since the res variable is still needed and as an outcome the port->cfg_addr gets an erroneous address. Revert the change going back to original behaviour. Link: https://lore.kernel.org/r/20210328144118.305074-1-zhengdejin5@gmail.com Fixes: e2dcd20b1645a ("PCI: controller: Convert to devm_platform_ioremap_resource_byname()") Reported-by: dann.frazier@canonical.com Tested-by: dann frazier Signed-off-by: Dejin Zheng Signed-off-by: Lorenzo Pieralisi Cc: stable@vger.kernel.org # v5.9+ --- drivers/pci/controller/pci-xgene.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/pci-xgene.c b/drivers/pci/controller/pci-xgene.c index 2afdc865253e..7f503dd4ff81 100644 --- a/drivers/pci/controller/pci-xgene.c +++ b/drivers/pci/controller/pci-xgene.c @@ -354,7 +354,8 @@ static int xgene_pcie_map_reg(struct xgene_pcie_port *port, if (IS_ERR(port->csr_base)) return PTR_ERR(port->csr_base); - port->cfg_base = devm_platform_ioremap_resource_byname(pdev, "cfg"); + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg"); + port->cfg_base = devm_ioremap_resource(dev, res); if (IS_ERR(port->cfg_base)) return PTR_ERR(port->cfg_base); port->cfg_addr = res->start; From be1ee45d51384161681ecf21085a42d316ae25f7 Mon Sep 17 00:00:00 2001 From: Yi Zhuang Date: Wed, 31 Mar 2021 17:34:14 +0800 Subject: [PATCH 0285/1379] f2fs: Fix a hungtask problem in atomic write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the cache writing process, if it is an atomic file, increase the page count of F2FS_WB_CP_DATA, otherwise increase the page count of F2FS_WB_DATA. When you step into the hook branch due to insufficient memory in f2fs_write_begin, f2fs_drop_inmem_pages_all will be called to traverse all atomic inodes and clear the FI_ATOMIC_FILE mark of all atomic files. In f2fs_drop_inmem_pages,first acquire the inmem_lock , revoke all the inmem_pages, and then clear the FI_ATOMIC_FILE mark. Before this mark is cleared, other threads may hold inmem_lock to add inmem_pages to the inode that has just been emptied inmem_pages, and increase the page count of F2FS_WB_CP_DATA. When the IO returns, it is found that the FI_ATOMIC_FILE flag is cleared by f2fs_drop_inmem_pages_all, and f2fs_is_atomic_file returns false,which causes the page count of F2FS_WB_DATA to be decremented. The page count of F2FS_WB_CP_DATA cannot be cleared. Finally, hungtask is triggered in f2fs_wait_on_all_pages because get_pages will never return zero. process A: process B: f2fs_drop_inmem_pages_all ->f2fs_drop_inmem_pages of inode#1 ->mutex_lock(&fi->inmem_lock) ->__revoke_inmem_pages of inode#1 f2fs_ioc_commit_atomic_write ->mutex_unlock(&fi->inmem_lock) ->f2fs_commit_inmem_pages of inode#1 ->mutex_lock(&fi->inmem_lock) ->__f2fs_commit_inmem_pages ->f2fs_do_write_data_page ->f2fs_outplace_write_data ->do_write_page ->f2fs_submit_page_write ->inc_page_count(sbi, F2FS_WB_CP_DATA ) ->mutex_unlock(&fi->inmem_lock) ->spin_lock(&sbi->inode_lock[ATOMIC_FILE]); ->clear_inode_flag(inode, FI_ATOMIC_FILE) ->spin_unlock(&sbi->inode_lock[ATOMIC_FILE]) f2fs_write_end_io ->dec_page_count(sbi, F2FS_WB_DATA ); We can fix the problem by putting the action of clearing the FI_ATOMIC_FILE mark into the inmem_lock lock. This operation can ensure that no one will submit the inmem pages before the FI_ATOMIC_FILE mark is cleared, so that there will be no atomic writes waiting for writeback. Fixes: 57864ae5ce3a ("f2fs: limit # of inmemory pages") Signed-off-by: Yi Zhuang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 31ccea1378fa..c517e689a9a3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -324,23 +324,27 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - while (!list_empty(&fi->inmem_pages)) { + do { mutex_lock(&fi->inmem_lock); + if (list_empty(&fi->inmem_pages)) { + fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + sbi->atomic_files--; + } + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + mutex_unlock(&fi->inmem_lock); + break; + } __revoke_inmem_pages(inode, &fi->inmem_pages, true, false, true); mutex_unlock(&fi->inmem_lock); - } - - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } while (1); } void f2fs_drop_inmem_page(struct inode *inode, struct page *page) From 1fd3dde5e270ad08f1406f921c9a2cda154fcea9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 30 Mar 2021 12:43:16 -0500 Subject: [PATCH 0286/1379] PCI: Add pci_disable_parity() Add pci_disable_parity() to disable reporting of parity errors for a device by clearing PCI_COMMAND_PARITY. The device will still set PCI_STATUS_DETECTED_PARITY when it detects a parity error or receives a Poisoned TLP, but it will not set PCI_STATUS_PARITY, which means it will not assert PERR# (conventional PCI) or report Poisoned TLPs (PCIe). Based-on: https://lore.kernel.org/linux-arm-kernel/d375987c-ea4f-dd98-4ef8-99b2fbfe7c33@gmail.com/ Based-on-patch-by: Heiner Kallweit Link: https://lore.kernel.org/r/20210330174318.1289680-2-helgaas@kernel.org Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 17 +++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 18 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 16a17215f633..b1845e5e5c8f 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4453,6 +4453,23 @@ void pci_clear_mwi(struct pci_dev *dev) } EXPORT_SYMBOL(pci_clear_mwi); +/** + * pci_disable_parity - disable parity checking for device + * @dev: the PCI device to operate on + * + * Disable parity checking for device @dev + */ +void pci_disable_parity(struct pci_dev *dev) +{ + u16 cmd; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + if (cmd & PCI_COMMAND_PARITY) { + cmd &= ~PCI_COMMAND_PARITY; + pci_write_config_word(dev, PCI_COMMAND, cmd); + } +} + /** * pci_intx - enables/disables PCI INTx for device dev * @pdev: the PCI device to operate on diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..4eaa773115da 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1201,6 +1201,7 @@ int __must_check pci_set_mwi(struct pci_dev *dev); int __must_check pcim_set_mwi(struct pci_dev *dev); int pci_try_set_mwi(struct pci_dev *dev); void pci_clear_mwi(struct pci_dev *dev); +void pci_disable_parity(struct pci_dev *dev); void pci_intx(struct pci_dev *dev, int enable); bool pci_check_and_mask_intx(struct pci_dev *dev); bool pci_check_and_unmask_intx(struct pci_dev *dev); From d06a113fec57b2c012b94241af4a1bc5656458a2 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 30 Mar 2021 12:43:17 -0500 Subject: [PATCH 0287/1379] IB/mthca: Disable parity reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For Mellanox Tavor devices, we previously set dev->broken_parity_status, which does not change the device's behavior; it merely prevents the EDAC PCI error reporting from warning about Master Data Parity Error, Signaled System Error, or Detected Parity Error for this device. Instead, disable Parity Error Response so the device doesn't report parity errors in the first place. [bhelgaas: split out pci_disable_parity(), commit log, keep quirk static] Link: https://lore.kernel.org/r/20210330174318.1289680-3-helgaas@kernel.org Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- drivers/pci/quirks.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 653660e3ba9e..6aa9df411604 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -206,16 +206,11 @@ DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_BRIDGE_HOST, 8, quirk_mmio_always_on); /* - * The Mellanox Tavor device gives false positive parity errors. Mark this - * device with a broken_parity_status to allow PCI scanning code to "skip" - * this now blacklisted device. + * The Mellanox Tavor device gives false positive parity errors. Disable + * parity error reporting. */ -static void quirk_mellanox_tavor(struct pci_dev *dev) -{ - dev->broken_parity_status = 1; /* This device gives false positives */ -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR, quirk_mellanox_tavor); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE, quirk_mellanox_tavor); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR, pci_disable_parity); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR_BRIDGE, pci_disable_parity); /* * Deal with broken BIOSes that neglect to enable passive release, From 0a0b5f4b43671f8f128eb438edacee0a1d113385 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 30 Mar 2021 12:43:18 -0500 Subject: [PATCH 0288/1379] ARM: iop32x: disable N2100 PCI parity reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the N2100, instead of just marking the r8169 chips as having broken_parity_status, disable parity error reporting for them entirely. This was the only relevant place that set broken_parity_status, so we no longer need to check for it in the r8169 error interrupt handler. [bhelgaas: squash into one patch, commit log] Link: https://lore.kernel.org/r/20210330174318.1289680-4-helgaas@kernel.org Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- arch/arm/mach-iop32x/n2100.c | 8 ++++---- drivers/net/ethernet/realtek/r8169_main.c | 14 -------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c index 78b9a5ee41c9..bf99e718f8b8 100644 --- a/arch/arm/mach-iop32x/n2100.c +++ b/arch/arm/mach-iop32x/n2100.c @@ -116,16 +116,16 @@ static struct hw_pci n2100_pci __initdata = { }; /* - * Both r8169 chips on the n2100 exhibit PCI parity problems. Set - * the ->broken_parity_status flag for both ports so that the r8169 - * driver knows it should ignore error interrupts. + * Both r8169 chips on the n2100 exhibit PCI parity problems. Turn + * off parity reporting for both ports so we don't get error interrupts + * for them. */ static void n2100_fixup_r8169(struct pci_dev *dev) { if (dev->bus->number == 0 && (dev->devfn == PCI_DEVFN(1, 0) || dev->devfn == PCI_DEVFN(2, 0))) - dev->broken_parity_status = 1; + pci_disable_parity(dev); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_REALTEK, PCI_ANY_ID, n2100_fixup_r8169); diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index f704da3f214c..a6aff0d993eb 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4358,20 +4358,6 @@ static void rtl8169_pcierr_interrupt(struct net_device *dev) if (net_ratelimit()) netdev_err(dev, "PCI error (cmd = 0x%04x, status_errs = 0x%04x)\n", pci_cmd, pci_status_errs); - /* - * The recovery sequence below admits a very elaborated explanation: - * - it seems to work; - * - I did not see what else could be done; - * - it makes iop3xx happy. - * - * Feel free to adjust to your needs. - */ - if (pdev->broken_parity_status) - pci_cmd &= ~PCI_COMMAND_PARITY; - else - pci_cmd |= PCI_COMMAND_SERR | PCI_COMMAND_PARITY; - - pci_write_config_word(pdev, PCI_COMMAND, pci_cmd); rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING); } From 1decdb335c366fc0a1bae0db55c138c613cc9a1f Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 30 Mar 2021 11:40:55 +0800 Subject: [PATCH 0289/1379] tracing: Remove duplicate struct declaration in trace_events.h struct trace_array is declared twice. One has been declared at forward declaration. Remove the duplicate. Link: https://lkml.kernel.org/r/20210330034056.2266969-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 36e27c1f42e0..ad413b382a3c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -404,7 +404,6 @@ trace_get_fields(struct trace_event_call *event_call) return event_call->class->get_fields(event_call); } -struct trace_array; struct trace_subsystem_dir; enum { From f3ef7202ef7c705d640d1aeec3b286a641ac9186 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Mon, 29 Mar 2021 16:03:31 +0300 Subject: [PATCH 0290/1379] tracing: Remove unused argument from "ring_buffer_time_stamp() The "cpu" parameter is not being used by the function. Link: https://lkml.kernel.org/r/20210329130331.199402-1-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 2 +- kernel/trace/trace.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 057b7ed4fe24..dac53fd3afea 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -181,7 +181,7 @@ unsigned long ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cp unsigned long ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu); unsigned long ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu); -u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu); +u64 ring_buffer_time_stamp(struct trace_buffer *buffer); void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer, int cpu, u64 *ts); void ring_buffer_set_clock(struct trace_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f4216df58e31..2c0ee6484990 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1080,7 +1080,7 @@ static inline u64 rb_time_stamp(struct trace_buffer *buffer) return ts << DEBUG_SHIFT; } -u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu) +u64 ring_buffer_time_stamp(struct trace_buffer *buffer) { u64 time; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3834de91fb97..507a30bf26e4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -771,7 +771,7 @@ static u64 buffer_ftrace_now(struct array_buffer *buf, int cpu) if (!buf->buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(buf->buffer, cpu); + ts = ring_buffer_time_stamp(buf->buffer); ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts); return ts; @@ -7174,7 +7174,7 @@ static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file) u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_event *rbe) { if (rbe == this_cpu_read(trace_buffered_event)) - return ring_buffer_time_stamp(buffer, smp_processor_id()); + return ring_buffer_time_stamp(buffer); return ring_buffer_event_time_stamp(buffer, rbe); } @@ -8088,7 +8088,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { @@ -8097,7 +8097,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", - ring_buffer_time_stamp(trace_buf->buffer, cpu)); + ring_buffer_time_stamp(trace_buf->buffer)); } cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); From 693978527c17e6234918f40b157c740f5632c102 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 25 Mar 2021 19:57:51 +0100 Subject: [PATCH 0291/1379] PCI/ACPI: Fix acpi_pci_set_power_state() debug message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If PCI_D3cold is passed to acpi_pci_set_power_state() as the second argument and there is no ACPI D3cold support for the given device, the debug message will state that the device power state has been changed to D3cold, while in fact it will be D3hot, because acpi_device_set_power() falls back to D3hot automatically if D3cold is not supported without returning an error. To address this issue, modify the debug message to print the current power state of the target PCI device's ACPI companion instead of printing the target power state, which may not reflect the real final power state of the device. Link: https://lore.kernel.org/r/4319486.LvFx2qVVIh@kreacher Signed-off-by: Rafael J. Wysocki Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- drivers/pci/pci-acpi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 53502a751914..36bc23e21759 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -1021,7 +1021,7 @@ static int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) if (!error) pci_dbg(dev, "power state changed by ACPI to %s\n", - acpi_power_state_string(state_conv[state])); + acpi_power_state_string(adev->power.state)); return error; } From 2726bf3ff2520dba61fafc90a055640f7ad54e05 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 22 Mar 2021 18:53:49 -0700 Subject: [PATCH 0292/1379] swiotlb: Make SWIOTLB_NO_FORCE perform no allocation When SWIOTLB_NO_FORCE is used, there should really be no allocations of default_nslabs to occur since we are not going to use those slabs. If a platform was somehow setting swiotlb_no_force and a later call to swiotlb_init() was to be made we would still be proceeding with allocating the default SWIOTLB size (64MB), whereas if swiotlb=noforce was set on the kernel command line we would have only allocated 2KB. This would be inconsistent and the point of initializing default_nslabs to 1, was intended to allocate the minimum amount of memory possible, so simply remove that minimal allocation period. Signed-off-by: Florian Fainelli Reviewed-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 539c76beb52e..0a5b6f7e75bc 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -83,12 +83,10 @@ setup_io_tlb_npages(char *str) } if (*str == ',') ++str; - if (!strcmp(str, "force")) { + if (!strcmp(str, "force")) swiotlb_force = SWIOTLB_FORCE; - } else if (!strcmp(str, "noforce")) { + else if (!strcmp(str, "noforce")) swiotlb_force = SWIOTLB_NO_FORCE; - default_nslabs = 1; - } return 0; } @@ -174,6 +172,9 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) struct io_tlb_mem *mem; size_t alloc_size; + if (swiotlb_force == SWIOTLB_NO_FORCE) + return 0; + /* protect against double initialization */ if (WARN_ON_ONCE(io_tlb_default_mem)) return -ENOMEM; @@ -211,6 +212,9 @@ swiotlb_init(int verbose) size_t bytes = PAGE_ALIGN(default_nslabs << IO_TLB_SHIFT); void *tlb; + if (swiotlb_force == SWIOTLB_NO_FORCE) + return; + /* Get IO TLB memory from the low pages */ tlb = memblock_alloc_low(bytes, PAGE_SIZE); if (!tlb) @@ -240,6 +244,9 @@ swiotlb_late_init_with_default_size(size_t default_size) unsigned int order; int rc = 0; + if (swiotlb_force == SWIOTLB_NO_FORCE) + return 0; + /* * Get IO TLB memory from the low pages */ @@ -276,6 +283,9 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) unsigned long bytes = nslabs << IO_TLB_SHIFT, i; struct io_tlb_mem *mem; + if (swiotlb_force == SWIOTLB_NO_FORCE) + return 0; + /* protect against double initialization */ if (WARN_ON_ONCE(io_tlb_default_mem)) return -ENOMEM; From db42523b4f3e83ff86b53cdda219a9767c8b047f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 1 Apr 2021 16:14:17 -0400 Subject: [PATCH 0293/1379] ftrace: Store the order of pages allocated in ftrace_page Instead of saving the size of the records field of the ftrace_page, store the order it uses to allocate the pages, as that is what is needed to know in order to free the pages. This simplifies the code. Link: https://lore.kernel.org/lkml/CAHk-=whyMxheOqXAORt9a7JK9gc9eHTgCJ55Pgs4p=X3RrQubQ@mail.gmail.com/ Signed-off-by: Linus Torvalds [ change log written by Steven Rostedt ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 22ecaaa13baa..f7bef5a9ada9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1090,7 +1090,7 @@ struct ftrace_page { struct ftrace_page *next; struct dyn_ftrace *records; int index; - int size; + int order; }; #define ENTRY_SIZE sizeof(struct dyn_ftrace) @@ -3181,7 +3181,7 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) ftrace_number_of_groups++; cnt = (PAGE_SIZE << order) / ENTRY_SIZE; - pg->size = cnt; + pg->order = order; if (cnt > count) cnt = count; @@ -3194,7 +3194,6 @@ ftrace_allocate_pages(unsigned long num_to_init) { struct ftrace_page *start_pg; struct ftrace_page *pg; - int order; int cnt; if (!num_to_init) @@ -3230,13 +3229,13 @@ ftrace_allocate_pages(unsigned long num_to_init) free_pages: pg = start_pg; while (pg) { - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } start_pg = pg->next; kfree(pg); pg = start_pg; - ftrace_number_of_pages -= 1 << order; ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); @@ -6188,6 +6187,7 @@ static int ftrace_process_locs(struct module *mod, p = start; pg = start_pg; while (p < end) { + unsigned long end_offset; addr = ftrace_call_adjust(*p++); /* * Some architecture linkers will pad between @@ -6198,7 +6198,8 @@ static int ftrace_process_locs(struct module *mod, if (!addr) continue; - if (pg->index == pg->size) { + end_offset = (pg->index+1) * sizeof(pg->records[0]); + if (end_offset > PAGE_SIZE << pg->order) { /* We should have allocated enough */ if (WARN_ON(!pg->next)) break; @@ -6367,7 +6368,6 @@ void ftrace_release_mod(struct module *mod) struct ftrace_page **last_pg; struct ftrace_page *tmp_page = NULL; struct ftrace_page *pg; - int order; mutex_lock(&ftrace_lock); @@ -6418,12 +6418,12 @@ void ftrace_release_mod(struct module *mod) /* Needs to be called outside of ftrace_lock */ clear_mod_from_hashes(pg); - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } tmp_page = pg->next; kfree(pg); - ftrace_number_of_pages -= 1 << order; ftrace_number_of_groups--; } } @@ -6741,7 +6741,6 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) struct ftrace_mod_map *mod_map = NULL; struct ftrace_init_func *func, *func_next; struct list_head clear_hash; - int order; INIT_LIST_HEAD(&clear_hash); @@ -6779,10 +6778,10 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) ftrace_update_tot_cnt--; if (!pg->index) { *last_pg = pg->next; - order = get_count_order(pg->size / ENTRIES_PER_PAGE); - if (order >= 0) - free_pages((unsigned long)pg->records, order); - ftrace_number_of_pages -= 1 << order; + if (pg->records) { + free_pages((unsigned long)pg->records, pg->order); + ftrace_number_of_pages -= 1 << pg->order; + } ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); From ceaaa12904df07d07ea8975abbf04c4d60e46956 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 1 Apr 2021 16:40:32 -0400 Subject: [PATCH 0294/1379] ftrace: Simplify the calculation of page number for ftrace_page->records some more Commit b40c6eabfcd40 ("ftrace: Simplify the calculation of page number for ftrace_page->records") simplified the calculation of the number of pages needed for each page group without having any empty pages, but it can be simplified even further. Link: https://lore.kernel.org/lkml/CAHk-=wjt9b7kxQ2J=aDNKbR1QBMB3Hiqb_hYcZbKsxGRSEb+gQ@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f7bef5a9ada9..057e962ca5ce 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3156,15 +3156,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) if (WARN_ON(!count)) return -EINVAL; + /* We want to fill as much as possible, with no empty pages */ pages = DIV_ROUND_UP(count, ENTRIES_PER_PAGE); - order = get_count_order(pages); - - /* - * We want to fill as much as possible. No more than a page - * may be empty. - */ - if (!is_power_of_2(pages)) - order--; + order = fls(pages) - 1; again: pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order); From b5d15199a26f6dce624b43c82764cdb3827e7c89 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Apr 2021 17:25:20 -0700 Subject: [PATCH 0295/1379] f2fs: set checkpoint_merge by default Once we introduced checkpoint_merge, we've seen some contention w/o the option. In order to avoid it, let's set it by default. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 14239e2b7ae7..c15800c3cdb1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1839,6 +1839,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); + set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); From 42e4eefb089f12ea900062ecdcc7ca10c3423a05 Mon Sep 17 00:00:00 2001 From: Hao Fang Date: Tue, 30 Mar 2021 14:33:48 +0800 Subject: [PATCH 0296/1379] dma-mapping: benchmark: use the correct HiSilicon copyright s/Hisilicon/HiSilicon/g. It should use capital S, according to https://www.hisilicon.com/en/terms-of-use. Signed-off-by: Hao Fang Acked-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 2 +- tools/testing/selftests/dma/dma_map_benchmark.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index e0e64f8b0739..00d6549a5495 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020 Hisilicon Limited. + * Copyright (C) 2020 HiSilicon Limited. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index fb23ce9617ea..b492bed0936d 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020 Hisilicon Limited. + * Copyright (C) 2020 HiSilicon Limited. */ #include From ca947482b0b30443e6da1f0f5ba7244e34a4f65a Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Thu, 18 Mar 2021 17:29:30 +0800 Subject: [PATCH 0297/1379] dma-mapping: benchmark: Add support for multi-pages map/unmap Currently it only support one page map/unmap once a time for dma-map benchmark, but there are some other scenaries which need to support for multi-page map/unmap: for those multi-pages interfaces such as dma_alloc_coherent() and dma_map_sg(), the time spent on multi-pages map/unmap is not the time of a single page * npages (not linear) as it may use block description instead of page description when it is satified with the size such as 2M/1G, and also it can send a single TLB invalidation command to invalidate multi-pages instead of multi-times when RIL is enabled (which will short the time of unmap). So it is necessary to add support for multi-pages map/unmap. Add a parameter "-g" to support multi-pages map/unmap. Signed-off-by: Xiang Chen Acked-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 21 ++++++++++++------- .../testing/selftests/dma/dma_map_benchmark.c | 20 ++++++++++++++---- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 00d6549a5495..9b9af1bd6be3 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -38,7 +38,8 @@ struct map_benchmark { __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u8 expansion[80]; /* For future use */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; struct map_benchmark_data { @@ -58,9 +59,11 @@ static int map_benchmark_thread(void *data) void *buf; dma_addr_t dma_addr; struct map_benchmark_data *map = data; + int npages = map->bparam.granule; + u64 size = npages * PAGE_SIZE; int ret = 0; - buf = (void *)__get_free_page(GFP_KERNEL); + buf = alloc_pages_exact(size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -76,10 +79,10 @@ static int map_benchmark_thread(void *data) * 66 means evertything goes well! 66 is lucky. */ if (map->dir != DMA_FROM_DEVICE) - memset(buf, 0x66, PAGE_SIZE); + memset(buf, 0x66, size); map_stime = ktime_get(); - dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir); + dma_addr = dma_map_single(map->dev, buf, size, map->dir); if (unlikely(dma_mapping_error(map->dev, dma_addr))) { pr_err("dma_map_single failed on %s\n", dev_name(map->dev)); @@ -93,7 +96,7 @@ static int map_benchmark_thread(void *data) ndelay(map->bparam.dma_trans_ns); unmap_stime = ktime_get(); - dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir); + dma_unmap_single(map->dev, dma_addr, size, map->dir); unmap_etime = ktime_get(); unmap_delta = ktime_sub(unmap_etime, unmap_stime); @@ -112,7 +115,7 @@ static int map_benchmark_thread(void *data) } out: - free_page((unsigned long)buf); + free_pages_exact(buf, size); return ret; } @@ -203,7 +206,6 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, struct map_benchmark_data *map = file->private_data; void __user *argp = (void __user *)arg; u64 old_dma_mask; - int ret; if (copy_from_user(&map->bparam, argp, sizeof(map->bparam))) @@ -234,6 +236,11 @@ static long map_benchmark_ioctl(struct file *file, unsigned int cmd, return -EINVAL; } + if (map->bparam.granule < 1 || map->bparam.granule > 1024) { + pr_err("invalid granule size\n"); + return -EINVAL; + } + switch (map->bparam.dma_dir) { case DMA_MAP_BIDIRECTIONAL: map->dir = DMA_BIDIRECTIONAL; diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index b492bed0936d..485dff51bad2 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -40,7 +40,8 @@ struct map_benchmark { __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u8 expansion[80]; /* For future use */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ }; int main(int argc, char **argv) @@ -51,11 +52,13 @@ int main(int argc, char **argv) int threads = 1, seconds = 20, node = -1; /* default dma mask 32bit, bidirectional DMA */ int bits = 32, xdelay = 0, dir = DMA_MAP_BIDIRECTIONAL; + /* default granule 1 PAGESIZE */ + int granule = 1; int cmd = DMA_MAP_BENCHMARK; char *p; - while ((opt = getopt(argc, argv, "t:s:n:b:d:x:")) != -1) { + while ((opt = getopt(argc, argv, "t:s:n:b:d:x:g:")) != -1) { switch (opt) { case 't': threads = atoi(optarg); @@ -75,6 +78,9 @@ int main(int argc, char **argv) case 'x': xdelay = atoi(optarg); break; + case 'g': + granule = atoi(optarg); + break; default: return -1; } @@ -110,6 +116,11 @@ int main(int argc, char **argv) exit(1); } + if (granule < 1 || granule > 1024) { + fprintf(stderr, "invalid granule size\n"); + exit(1); + } + fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR); if (fd == -1) { perror("open"); @@ -123,14 +134,15 @@ int main(int argc, char **argv) map.dma_bits = bits; map.dma_dir = dir; map.dma_trans_ns = xdelay; + map.granule = granule; if (ioctl(fd, cmd, &map)) { perror("ioctl"); exit(1); } - printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s\n", - threads, seconds, node, dir[directions]); + printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s granule: %d\n", + threads, seconds, node, dir[directions], granule); printf("average map latency(us):%.1f standard deviation:%.1f\n", map.avg_map_100ns/10.0, map.map_stddev/10.0); printf("average unmap latency(us):%.1f standard deviation:%.1f\n", From a7f3d3d3600c8ed119eb0d2483de0062ce2e3707 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 26 Mar 2021 22:03:05 +0100 Subject: [PATCH 0298/1379] dma-mapping: add unlikely hint to error path in dma_mapping_error Zillions of drivers use the unlikely() hint when checking the result of dma_mapping_error(). This is an inline function anyway, so we can move the hint into the function and remove it from drivers over time. Signed-off-by: Heiner Kallweit Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index e9d19b974f26..183e7103a66d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -95,7 +95,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { debug_dma_mapping_error(dev, dma_addr); - if (dma_addr == DMA_MAPPING_ERROR) + if (unlikely(dma_addr == DMA_MAPPING_ERROR)) return -ENOMEM; return 0; } From 86438186a7b372728c74055d441549e17915cddb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Feb 2021 14:22:51 -0500 Subject: [PATCH 0299/1379] NFSv4: Simplify nfs4_retry_setlk() Simplify the code that retries locks based on notification events. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 54 ++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c65c4b41e2c1..bca1726bcd55 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7250,22 +7250,22 @@ nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, #ifdef CONFIG_NFS_V4_1 struct nfs4_lock_waiter { - struct task_struct *task; struct inode *inode; - struct nfs_lowner *owner; + struct nfs_lowner owner; + wait_queue_entry_t wait; }; static int nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, void *key) { - int ret; - struct nfs4_lock_waiter *waiter = wait->private; + struct nfs4_lock_waiter *waiter = + container_of(wait, struct nfs4_lock_waiter, wait); /* NULL key means to wake up everyone */ if (key) { struct cb_notify_lock_args *cbnl = key; struct nfs_lowner *lowner = &cbnl->cbnl_owner, - *wowner = waiter->owner; + *wowner = &waiter->owner; /* Only wake if the callback was for the same owner. */ if (lowner->id != wowner->id || lowner->s_dev != wowner->s_dev) @@ -7276,53 +7276,45 @@ nfs4_wake_lock_waiter(wait_queue_entry_t *wait, unsigned int mode, int flags, vo return 0; } - /* override "private" so we can use default_wake_function */ - wait->private = waiter->task; - ret = woken_wake_function(wait, mode, flags, key); - if (ret) - list_del_init(&wait->entry); - wait->private = waiter; - return ret; + return woken_wake_function(wait, mode, flags, key); } static int nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - int status = -ERESTARTSYS; struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; struct nfs_server *server = NFS_SERVER(state->inode); struct nfs_client *clp = server->nfs_client; wait_queue_head_t *q = &clp->cl_lock_waitq; - struct nfs_lowner owner = { .clientid = clp->cl_clientid, - .id = lsp->ls_seqid.owner_id, - .s_dev = server->s_dev }; - struct nfs4_lock_waiter waiter = { .task = current, - .inode = state->inode, - .owner = &owner}; - wait_queue_entry_t wait; + struct nfs4_lock_waiter waiter = { + .inode = state->inode, + .owner = { .clientid = clp->cl_clientid, + .id = lsp->ls_seqid.owner_id, + .s_dev = server->s_dev }, + }; + int status; /* Don't bother with waitqueue if we don't expect a callback */ if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) return nfs4_retry_setlk_simple(state, cmd, request); - init_wait(&wait); - wait.private = &waiter; - wait.func = nfs4_wake_lock_waiter; + init_wait(&waiter.wait); + waiter.wait.func = nfs4_wake_lock_waiter; + add_wait_queue(q, &waiter.wait); - while(!signalled()) { - add_wait_queue(q, &wait); + do { status = nfs4_proc_setlk(state, cmd, request); - if ((status != -EAGAIN) || IS_SETLK(cmd)) { - finish_wait(q, &wait); + if (status != -EAGAIN || IS_SETLK(cmd)) break; - } status = -ERESTARTSYS; freezer_do_not_count(); - wait_woken(&wait, TASK_INTERRUPTIBLE, NFS4_LOCK_MAXTIMEOUT); + wait_woken(&waiter.wait, TASK_INTERRUPTIBLE, + NFS4_LOCK_MAXTIMEOUT); freezer_count(); - finish_wait(q, &wait); - } + } while (!signalled()); + + remove_wait_queue(q, &waiter.wait); return status; } From d737e5d418706abf32f6de68c3e09958516d422f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Feb 2021 16:04:15 -0500 Subject: [PATCH 0300/1379] SUNRPC: Set TCP_CORK until the transmit queue is empty When we have multiple RPC requests queued up, it makes sense to set the TCP_CORK option while the transmit queue is non-empty. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 2 ++ net/sunrpc/xprtsock.c | 5 ++++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index d2e97ee802af..d81fe8b364d0 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -247,6 +247,7 @@ struct rpc_xprt { struct rpc_task * snd_task; /* Task blocked in send */ struct list_head xmit_queue; /* Send queue */ + atomic_long_t xmit_queuelen; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 691ccf8049a4..a853f75d4968 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1352,6 +1352,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task) list_add_tail(&req->rq_xmit, &xprt->xmit_queue); INIT_LIST_HEAD(&req->rq_xmit2); out: + atomic_long_inc(&xprt->xmit_queuelen); set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); spin_unlock(&xprt->queue_lock); } @@ -1381,6 +1382,7 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task) } } else list_del(&req->rq_xmit2); + atomic_long_dec(&req->rq_xprt->xmit_queuelen); } /** diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e35760f238a4..a64f5ed1edb4 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1018,6 +1018,7 @@ static int xs_tcp_send_request(struct rpc_rqst *req) * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ req->rq_xtime = ktime_get(); + tcp_sock_set_cork(transport->inet, true); while (1) { status = xprt_sock_sendmsg(transport->sock, &msg, xdr, transport->xmit.offset, rm, &sent); @@ -1032,6 +1033,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req) if (likely(req->rq_bytes_sent >= msglen)) { req->rq_xmit_bytes_sent += transport->xmit.offset; transport->xmit.offset = 0; + if (atomic_long_read(&xprt->xmit_queuelen) == 1) + tcp_sock_set_cork(transport->inet, false); return 0; } @@ -2163,6 +2166,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) } xs_tcp_set_socket_timeouts(xprt, sock); + tcp_sock_set_nodelay(sk); write_lock_bh(&sk->sk_callback_lock); @@ -2177,7 +2181,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* socket options */ sock_reset_flag(sk, SOCK_LINGER); - tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; xprt_clear_connected(xprt); From 6453bcd0d8bd67fce9e0f566250caf864b5032f6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 4 Mar 2021 20:29:50 -0500 Subject: [PATCH 0301/1379] NFS: Fix up incorrect documentation Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 8 ++++---- fs/nfs/io.c | 2 +- fs/nfs/nfs4state.c | 2 +- fs/nfs/pagelist.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 04bf8066980c..6a29de964268 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -114,7 +114,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) return ret; } /** - * nfs_have_delegation - check if inode has a delegation, mark it + * nfs4_have_delegation - check if inode has a delegation, mark it * NFS_DELEGATION_REFERENCED if there is one. * @inode: inode to check * @flags: delegation types to check for @@ -674,7 +674,7 @@ void nfs_inode_evict_delegation(struct inode *inode) } /** - * nfs_inode_return_delegation - synchronously return a delegation + * nfs4_inode_return_delegation - synchronously return a delegation * @inode: inode to process * * This routine will always flush any dirty data to disk on the @@ -697,7 +697,7 @@ int nfs4_inode_return_delegation(struct inode *inode) } /** - * nfs_inode_return_delegation_on_close - asynchronously return a delegation + * nfs4_inode_return_delegation_on_close - asynchronously return a delegation * @inode: inode to process * * This routine is called on file close in order to determine if the @@ -811,7 +811,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp) } /** - * nfs_super_return_all_delegations - return delegations for one superblock + * nfs_server_return_all_delegations - return delegations for one superblock * @server: pointer to nfs_server to process * */ diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 5088fda9b453..b5551ed8f648 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -104,7 +104,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) } /** - * nfs_end_io_direct - declare the file is being used for direct i/o + * nfs_start_io_direct - declare the file is being used for direct i/o * @inode: file inode * * Declare that a direct I/O operation is about to start, and ensure diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3a51351bdc6a..9db9b11acb2a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -645,7 +645,7 @@ void nfs4_purge_state_owners(struct nfs_server *server, struct list_head *head) } /** - * nfs4_purge_state_owners - Release all cached state owners + * nfs4_free_state_owners - Release all cached state owners * @head: resulting list of state owners * * Frees a list of state owners that was generated by diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 78c9c4bdef2b..6c20b28d9d7c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -577,7 +577,7 @@ static void nfs_clear_request(struct nfs_page *req) } /** - * nfs_release_request - Release the count on an NFS read/write request + * nfs_free_request - Release the count on an NFS read/write request * @req: request to release * * Note: Should never be called with the spinlock held! @@ -1152,7 +1152,7 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc, } /** - * nfs_pageio_add_request - Attempt to coalesce a request into a page list. + * __nfs_pageio_add_request - Attempt to coalesce a request into a page list. * @desc: destination io descriptor * @req: request * From ee3707ae2c1f1327ad5188836b7ab62ed2c93b28 Mon Sep 17 00:00:00 2001 From: Nagendra S Tomar Date: Tue, 16 Mar 2021 10:25:14 +0000 Subject: [PATCH 0302/1379] nfs: Subsequent READDIR calls should carry non-zero cookieverifier If the loop in nfs_readdir_xdr_to_array() runs more than once, subsequent READDIR RPCs may wrongly carry a zero cookie verifier and non-zero cookie. Make sure subsequent calls to READDIR carry the cookie verifier returned by the first call. Signed-off-by: Nagendra S Tomar Fixes: b593c09f83a2 ("NFS: Improve handling of directory verifiers") Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fc4f490f2d78..08a1e2e31d0b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -866,6 +866,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, break; } + verf_arg = verf_res; + status = nfs_readdir_page_filler(desc, entry, pages, pglen, arrays, narrays); } while (!status && nfs_readdir_page_needs_filling(page)); From 13884ff2bef01df37c450c6dd09122f92333dccc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 16 Mar 2021 07:57:40 -0400 Subject: [PATCH 0303/1379] NFS: Fix handling of cookie verifier in uncached_readdir() If we're doing uncached readdir(), then the readdir cookie could be different from the one cached in the nfs_inode. We should therefore ensure that we save that one in the struct nfs_open_dir_context. Fixes: 35df59d3ef69 ("NFS: Reduce number of RPC calls when doing uncached readdir") Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 08a1e2e31d0b..2cf2a7d92faf 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -976,10 +976,10 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) /* * Once we've found the start of the dirent within a page: fill 'er up... */ -static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) +static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, + const __be32 *verf) { struct file *file = desc->file; - struct nfs_inode *nfsi = NFS_I(file_inode(file)); struct nfs_cache_array *array; unsigned int i = 0; @@ -993,7 +993,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) desc->eof = true; break; } - memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf)); + memcpy(desc->verf, verf, sizeof(desc->verf)); if (i < (array->size-1)) desc->dir_cookie = array->array[i+1].cookie; else @@ -1050,7 +1050,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { desc->page = arrays[i]; - nfs_do_filldir(desc); + nfs_do_filldir(desc, verf); } desc->page = NULL; @@ -1071,6 +1071,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; int res; @@ -1124,7 +1125,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } if (res == -ETOOSMALL && desc->plus) { - clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); nfs_zap_caches(inode); desc->page_index = 0; desc->plus = false; @@ -1134,7 +1135,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; - nfs_do_filldir(desc); + nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); } while (!desc->eof); From f892c41c14e0fa3d78ce37de1d5c8161ed13bf08 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 17 Mar 2021 08:46:19 -0400 Subject: [PATCH 0304/1379] NFS: Only change the cookie verifier if the directory page cache is empty The cached NFSv3/v4 readdir cookies are associated with a verifier, which is checked by the server on subsequent calls to readdir, and is only expected to change when the cookies (and hence also the page cache contents) are considered invalid. We therefore do have to store the verifier, but only when the page cache is empty. Fixes: b593c09f83a2 ("NFS: Improve handling of directory verifiers") Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2cf2a7d92faf..0cd7c59a6601 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -929,7 +929,12 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) } return res; } - memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); + /* + * Set the cookie verifier if the page cache was empty + */ + if (desc->page_index == 0) + memcpy(nfsi->cookieverf, verf, + sizeof(nfsi->cookieverf)); } res = nfs_readdir_search_array(desc); if (res == 0) { From c09f11ef35955785f92369e25819bf0629df2e59 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 1 Mar 2021 16:19:30 -0800 Subject: [PATCH 0305/1379] NFS: fs_context: validate UDP retrans to prevent shift out-of-bounds Fix shift out-of-bounds in xprt_calc_majortimeo(). This is caused by a garbage timeout (retrans) mount option being passed to nfs mount, in this case from syzkaller. If the protocol is XPRT_TRANSPORT_UDP, then 'retrans' is a shift value for a 64-bit long integer, so 'retrans' cannot be >= 64. If it is >= 64, fail the mount and return an error. Fixes: 9954bf92c0cd ("NFS: Move mount parameterisation bits into their own file") Reported-by: syzbot+ba2e91df8f74809417fa@syzkaller.appspotmail.com Reported-by: syzbot+f3a0fa110fd630ab56c8@syzkaller.appspotmail.com Signed-off-by: Randy Dunlap Cc: Trond Myklebust Cc: Anna Schumaker Cc: linux-nfs@vger.kernel.org Cc: David Howells Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 971a9251c1d9..902db1262d2b 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -973,6 +973,15 @@ static int nfs23_parse_monolithic(struct fs_context *fc, memset(mntfh->data + mntfh->size, 0, sizeof(mntfh->data) - mntfh->size); + /* + * for proto == XPRT_TRANSPORT_UDP, which is what uses + * to_exponential, implying shift: limit the shift value + * to BITS_PER_LONG (majortimeo is unsigned long) + */ + if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */ + if (data->retrans >= 64) /* shift value is too large */ + goto out_invalid_data; + /* * Translate to nfs_fs_context, which nfs_fill_super * can deal with. @@ -1073,6 +1082,9 @@ out_no_address: out_invalid_fh: return nfs_invalf(fc, "NFS: invalid root filehandle"); + +out_invalid_data: + return nfs_invalf(fc, "NFS: invalid binary mount data"); } #if IS_ENABLED(CONFIG_NFS_V4) From c2508730d6bb62196783b3de28a09782c97d4365 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Mon, 15 Mar 2021 06:54:10 +0530 Subject: [PATCH 0306/1379] nfs: Fix a typo in the file nfs42xattr.c s/attribues/attributes/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 6c2ce799150f..1c4d2a05b401 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -168,7 +168,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry) * make it easier to copy the value after an RPC, even if * the value will not be passed up to application (e.g. * for a 'query' getxattr with NULL buffer). - * @len: Length of the value. Can be 0 for zero-length attribues. + * @len: Length of the value. Can be 0 for zero-length attributes. * @value and @pages will be NULL if @len is 0. */ static struct nfs4_xattr_entry * From 90ff57bf4df2825ace0f61befdc3f87c79838ec2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 30 Mar 2021 09:01:50 -0400 Subject: [PATCH 0307/1379] NFS: Fix up the support for CONFIG_NFS_DISABLE_UDP_SUPPORT Rather than removing the support in nfs_init_timeout_values(), we should just fix up the validation checks in the mount option parsers. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 -- fs/nfs/fs_context.c | 54 +++++++++++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ff5c4d0d6d13..399a8eb15397 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -476,7 +476,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_maxval = to->to_initval; to->to_exponential = 0; break; -#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT case XPRT_TRANSPORT_UDP: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; @@ -487,7 +486,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; -#endif default: BUG(); } diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 902db1262d2b..cdf32b9a6c35 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -283,20 +283,40 @@ static int nfs_verify_server_address(struct sockaddr *addr) return 0; } +#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT +static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) +{ + return true; +} +#else +static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) +{ + if (ctx->version == 4) + return true; + return false; +} +#endif + /* * Sanity check the NFS transport protocol. - * */ -static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx) +static int nfs_validate_transport_protocol(struct fs_context *fc, + struct nfs_fs_context *ctx) { switch (ctx->nfs_server.protocol) { case XPRT_TRANSPORT_UDP: + if (nfs_server_transport_udp_invalid(ctx)) + goto out_invalid_transport_udp; + break; case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: break; default: ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; } + return 0; +out_invalid_transport_udp: + return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); } /* @@ -305,8 +325,6 @@ static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx) */ static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx) { - nfs_validate_transport_protocol(ctx); - if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP || ctx->mount_server.protocol == XPRT_TRANSPORT_TCP) return; @@ -929,6 +947,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, struct nfs_fh *mntfh = ctx->mntfh; struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; + int ret; if (data == NULL) goto out_no_data; @@ -1054,6 +1073,10 @@ static int nfs23_parse_monolithic(struct fs_context *fc, goto generic; } + ret = nfs_validate_transport_protocol(fc, ctx); + if (ret) + return ret; + ctx->skip_reconfig_option_check = true; return 0; @@ -1155,6 +1178,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc, { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + int ret; char *c; if (!data) { @@ -1227,9 +1251,9 @@ static int nfs4_parse_monolithic(struct fs_context *fc, ctx->acdirmin = data->acdirmin; ctx->acdirmax = data->acdirmax; ctx->nfs_server.protocol = data->proto; - nfs_validate_transport_protocol(ctx); - if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; + ret = nfs_validate_transport_protocol(fc, ctx); + if (ret) + return ret; done: ctx->skip_reconfig_option_check = true; return 0; @@ -1240,9 +1264,6 @@ out_inval_auth: out_no_address: return nfs_invalf(fc, "NFS4: mount program didn't pass remote address"); - -out_invalid_transport_udp: - return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); } #endif @@ -1307,6 +1328,10 @@ static int nfs_fs_context_validate(struct fs_context *fc) if (!nfs_verify_server_address(sap)) goto out_no_address; + ret = nfs_validate_transport_protocol(fc, ctx); + if (ret) + return ret; + if (ctx->version == 4) { if (IS_ENABLED(CONFIG_NFS_V4)) { if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) @@ -1315,9 +1340,6 @@ static int nfs_fs_context_validate(struct fs_context *fc) port = NFS_PORT; max_namelen = NFS4_MAXNAMLEN; max_pathlen = NFS4_MAXPATHLEN; - nfs_validate_transport_protocol(ctx); - if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL | NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); @@ -1326,10 +1348,6 @@ static int nfs_fs_context_validate(struct fs_context *fc) } } else { nfs_set_mount_transport_protocol(ctx); -#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT - if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; -#endif if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) port = NFS_RDMA_PORT; } @@ -1363,8 +1381,6 @@ out_no_device_name: out_v4_not_compiled: nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel"); return -EPROTONOSUPPORT; -out_invalid_transport_udp: - return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); out_no_address: return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); out_mountproto_mismatch: From 6b996476f364009e9be43e98f5bca11e5ec95b2d Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Mon, 22 Mar 2021 13:29:04 +0800 Subject: [PATCH 0308/1379] sunrpc: honor rpc_task's timeout value in rpcb_create() Currently rpcbind client is created without setting rpc timeout (thus using the default value). But if the rpc_task already has a customized timeout in its tk_client field, it's also ignored. Let's use the same timeout setting in rpc_task->tk_client->cl_timeout for rpcbind connection. Signed-off-by: Eryu Guan Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 38fe2ce8a5aa..647b323cc1d5 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -344,13 +344,15 @@ static struct rpc_clnt *rpcb_create(struct net *net, const char *nodename, const char *hostname, struct sockaddr *srvaddr, size_t salen, int proto, u32 version, - const struct cred *cred) + const struct cred *cred, + const struct rpc_timeout *timeo) { struct rpc_create_args args = { .net = net, .protocol = proto, .address = srvaddr, .addrsize = salen, + .timeout = timeo, .servername = hostname, .nodename = nodename, .program = &rpcb_program, @@ -705,7 +707,8 @@ void rpcb_getport_async(struct rpc_task *task) clnt->cl_nodename, xprt->servername, sap, salen, xprt->prot, bind_version, - clnt->cl_cred); + clnt->cl_cred, + task->tk_client->cl_timeout); if (IS_ERR(rpcb_clnt)) { status = PTR_ERR(rpcb_clnt); goto bailout_nofree; From c9301cb35b59ad7d733a7332f3aefd4da1382468 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 23 Mar 2021 10:57:13 +0800 Subject: [PATCH 0309/1379] nfs: hornor timeo and retrans option when mounting NFSv3 Mounting NFSv3 uses default timeout parameters specified by underlying sunrpc transport, and mount options like 'timeo' and 'retrans', unlike NFSv4, are not honored. But sometimes we want to set non-default timeout value when mounting NFSv3, so pass 'timeo' and 'retrans' to nfs_mount() and fill the 'timeout' field of struct rpc_create_args before creating RPC connection. This is also consistent with NFSv4 behavior. Note that this only sets the timeout value of rpc connection to mountd, but the timeout of rpcbind connection should be set as well. A later patch will fix the rpcbind part. Signed-off-by: Eryu Guan Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 +- fs/nfs/mount_clnt.c | 14 +++++++++----- fs/nfs/super.c | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7b644d6c09e4..cf0d7db24d44 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -180,7 +180,7 @@ struct nfs_mount_request { struct net *net; }; -extern int nfs_mount(struct nfs_mount_request *info); +extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans); extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index dda5c3e65d8d..c5e3b6b3366a 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -136,14 +136,16 @@ struct mnt_fhstatus { /** * nfs_mount - Obtain an NFS file handle for the given host and path * @info: pointer to mount request arguments + * @timeo: deciseconds the mount waits for a response before it retries + * @retrans: number of times the mount retries a request * - * Uses default timeout parameters specified by underlying transport. On - * successful return, the auth_flavs list and auth_flav_len will be populated - * with the list from the server or a faked-up list if the server didn't - * provide one. + * Uses timeout parameters specified by caller. On successful return, the + * auth_flavs list and auth_flav_len will be populated with the list from the + * server or a faked-up list if the server didn't provide one. */ -int nfs_mount(struct nfs_mount_request *info) +int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans) { + struct rpc_timeout mnt_timeout; struct mountres result = { .fh = info->fh, .auth_count = info->auth_flav_len, @@ -158,6 +160,7 @@ int nfs_mount(struct nfs_mount_request *info) .protocol = info->protocol, .address = info->sap, .addrsize = info->salen, + .timeout = &mnt_timeout, .servername = info->hostname, .program = &mnt_program, .version = info->version, @@ -177,6 +180,7 @@ int nfs_mount(struct nfs_mount_request *info) if (info->noresvport) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + nfs_init_timeout_values(&mnt_timeout, info->protocol, timeo, retrans); mnt_clnt = rpc_create(&args); if (IS_ERR(mnt_clnt)) goto out_clnt_err; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 94885c6f8f54..13a650750f04 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -867,7 +867,7 @@ static int nfs_request_mount(struct fs_context *fc, * Now ask the mount server to map our export path * to a file handle. */ - status = nfs_mount(&request); + status = nfs_mount(&request, ctx->timeo, ctx->retrans); if (status != 0) { dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", request.hostname, status); From 98b5cee37389b899de044bc4aac56e6ff33dbd4d Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 22 Mar 2021 14:37:01 -0400 Subject: [PATCH 0310/1379] SUNRPC: Ensure the transport backchannel association If the server sends CB_ calls on a connection that is not associated with the backchannel, refuse to process the call and shut down the connection. This avoids a NULL dereference crash in xprt_complete_bc_request(). There's not much more we can do in this situation unless we want to look into allowing all connections to be associated with the fore and back channel. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a64f5ed1edb4..47aa47a2b07c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -558,6 +558,10 @@ xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) struct rpc_rqst *req; ssize_t ret; + /* Is this transport associated with the backchannel? */ + if (!xprt->bc_serv) + return -ESHUTDOWN; + /* Look up and lock the request corresponding to the given XID */ req = xprt_lookup_bc_request(xprt, transport->recv.xid); if (!req) { From b876d708316bf9b6b9678eb2beb289b93cfe6369 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Mar 2021 15:32:21 -0400 Subject: [PATCH 0311/1379] NFS: fix nfs_fetch_iversion() The change attribute is always set by all NFS client versions so get rid of the open-coded version. Fixes: 3cc55f4434b4 ("nfs: use change attribute for NFS re-exports") Signed-off-by: Trond Myklebust --- fs/nfs/export.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/nfs/export.c b/fs/nfs/export.c index f2b34cfe286c..b347e3ce0cc8 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -171,17 +171,10 @@ static u64 nfs_fetch_iversion(struct inode *inode) { struct nfs_server *server = NFS_SERVER(inode); - /* Is this the right call?: */ - nfs_revalidate_inode(server, inode); - /* - * Also, note we're ignoring any returned error. That seems to be - * the practice for cache consistency information elsewhere in - * the server, but I'm not sure why. - */ - if (server->nfs_client->rpc_ops->version >= 4) - return inode_peek_iversion_raw(inode); - else - return time_to_chattr(&inode->i_ctime); + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_REVAL_PAGECACHE)) + __nfs_revalidate_inode(server, inode); + return inode_peek_iversion_raw(inode); } const struct export_operations nfs_export_ops = { From beab450d8ea93cdf4c6cb7714bdc31a9e0f34738 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 26 Mar 2021 16:23:39 -0400 Subject: [PATCH 0312/1379] NFS: Fix fscache invalidation in nfs_set_cache_invalid() Ensure that we invalidate the fscache before we strip the NFS_INO_INVALID_DATA flag. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a7fb076a5f44..ff737be559dc 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -223,11 +223,11 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; + if (flags & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); if (inode->i_mapping->nrpages == 0) flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); nfsi->cache_validity |= flags; - if (flags & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); } EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); From a303b0ac920d807cb7da4f1cd85759fbe44fa654 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 1 Apr 2021 11:01:53 +0800 Subject: [PATCH 0313/1379] f2fs: fix to avoid GC/mmap race with f2fs_truncate() It missed to hold i_gc_rwsem and i_map_sem around f2fs_truncate() in f2fs_file_write_iter() to avoid racing with background GC and mmap, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index dc79694e512c..f3ca63b55843 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4443,8 +4443,13 @@ write: clear_inode_flag(inode, FI_NO_PREALLOC); /* if we couldn't write data, we should deallocate blocks. */ - if (preallocated && i_size_read(inode) < target_size) + if (preallocated && i_size_read(inode) < target_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); From 25ae837e61dee712b4b1df36602ebfe724b2a0b6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 2 Apr 2021 17:22:23 +0800 Subject: [PATCH 0314/1379] f2fs: fix to avoid accessing invalid fio in f2fs_allocate_data_block() Callers may pass fio parameter with NULL value to f2fs_allocate_data_block(), so we should make sure accessing fio's field after fio's validation check. Fixes: f608c38c59c6 ("f2fs: clean up parameter of f2fs_allocate_data_block()") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c517e689a9a3..44897cfecb1e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3417,12 +3417,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, f2fs_inode_chksum_set(sbi, page); } - if (F2FS_IO_ALIGNED(sbi)) - fio->retry = false; - if (fio) { struct f2fs_bio_info *io; + if (F2FS_IO_ALIGNED(sbi)) + fio->retry = false; + INIT_LIST_HEAD(&fio->list); fio->in_list = true; io = sbi->write_io[fio->type] + fio->temp; From c35b8d5e757e0fd0144890b7b536f7b756f3a648 Mon Sep 17 00:00:00 2001 From: Sahitya Tummala Date: Tue, 6 Apr 2021 14:39:16 +0530 Subject: [PATCH 0315/1379] f2fs: fix the periodic wakeups of discard thread Fix the unnecessary periodic wakeups of discard thread that happens under below two conditions - 1. When f2fs is heavily utilized over 80%, the current discard policy sets the max sleep timeout of discard thread as 50ms (DEF_MIN_DISCARD_ISSUE_TIME). But this is set even when there are no pending discard commands to be issued. 2. In the issue_discard_thread() path when there are no pending discard commands, it fails to reset the wait_ms to max timeout value. Signed-off-by: Sahitya Tummala Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 44897cfecb1e..5bd0e1d7a00c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1114,6 +1114,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int discard_type, unsigned int granularity) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; @@ -1133,7 +1135,9 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; - dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + if (atomic_read(&dcc->discard_cmd_cnt)) + dpolicy->max_interval = + DEF_MIN_DISCARD_ISSUE_TIME; } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1749,8 +1753,15 @@ static int issue_discard_thread(void *data) set_freezable(); do { - __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, - dcc->discard_granularity); + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + else + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, + dcc->discard_granularity); + + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || @@ -1777,10 +1788,6 @@ static int issue_discard_thread(void *data) if (!atomic_read(&dcc->discard_cmd_cnt)) continue; - if (sbi->gc_mode == GC_URGENT_HIGH || - !f2fs_available_free_memory(sbi, DISCARD_CACHE)) - __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); - sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); From 213e12205232886902507ba4e2af12d79b7cbbab Mon Sep 17 00:00:00 2001 From: Bharat Kumar Gogada Date: Mon, 22 Feb 2021 14:17:31 +0530 Subject: [PATCH 0316/1379] PCI: xilinx-nwl: Enable coherent PCIe DMA traffic using CCI Add support for routing PCIe DMA traffic coherently when Cache Coherent Interconnect (CCI) is enabled in the system. The "dma-coherent" property is used to determine if CCI is enabled or not. Refer to https://developer.arm.com/documentation/ddi0470/k/preface for the CCI specification. Link: https://lore.kernel.org/r/20210222084732.21521-1-bharat.kumar.gogada@xilinx.com Signed-off-by: Bharat Kumar Gogada Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pcie-xilinx-nwl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pci/controller/pcie-xilinx-nwl.c b/drivers/pci/controller/pcie-xilinx-nwl.c index 07e36661bbc2..8689311c5ef6 100644 --- a/drivers/pci/controller/pcie-xilinx-nwl.c +++ b/drivers/pci/controller/pcie-xilinx-nwl.c @@ -26,6 +26,7 @@ /* Bridge core config registers */ #define BRCFG_PCIE_RX0 0x00000000 +#define BRCFG_PCIE_RX1 0x00000004 #define BRCFG_INTERRUPT 0x00000010 #define BRCFG_PCIE_RX_MSG_FILTER 0x00000020 @@ -128,6 +129,7 @@ #define NWL_ECAM_VALUE_DEFAULT 12 #define CFG_DMA_REG_BAR GENMASK(2, 0) +#define CFG_PCIE_CACHE GENMASK(7, 0) #define INT_PCI_MSI_NR (2 * 32) @@ -675,6 +677,11 @@ static int nwl_pcie_bridge_init(struct nwl_pcie *pcie) nwl_bridge_writel(pcie, CFG_ENABLE_MSG_FILTER_MASK, BRCFG_PCIE_RX_MSG_FILTER); + /* This routes the PCIe DMA traffic to go through CCI path */ + if (of_dma_is_coherent(dev->of_node)) + nwl_bridge_writel(pcie, nwl_bridge_readl(pcie, BRCFG_PCIE_RX1) | + CFG_PCIE_CACHE, BRCFG_PCIE_RX1); + err = nwl_wait_for_link(pcie); if (err) return err; From 1c4422f22605ec0f4455400c52a31898edcda425 Mon Sep 17 00:00:00 2001 From: Bharat Kumar Gogada Date: Mon, 22 Feb 2021 14:17:32 +0530 Subject: [PATCH 0317/1379] PCI: xilinx-nwl: Add optional "dma-coherent" property Add optional dma-coherent property to support coherent PCIe DMA traffic. Link: https://lore.kernel.org/r/20210222084732.21521-2-bharat.kumar.gogada@xilinx.com Signed-off-by: Bharat Kumar Gogada Signed-off-by: Lorenzo Pieralisi --- Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt b/Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt index 01bf7fdf4c19..2d677e90a7e2 100644 --- a/Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt +++ b/Documentation/devicetree/bindings/pci/xilinx-nwl-pcie.txt @@ -33,6 +33,8 @@ Required properties: - #address-cells: specifies the number of cells needed to encode an address. The value must be 0. +Optional properties: +- dma-coherent: present if DMA operations are coherent Example: ++++++++ From 2531fdbf8bfc22b0a2554bb3e3772dd8105d74ad Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 7 Apr 2021 23:14:50 -0700 Subject: [PATCH 0318/1379] Input: gpio-keys - fix crash when disabliing GPIO-less buttons My brain-damaged adjustments to Paul's patch caused crashes in gpio_keys_disable_button() when driver is used in GPIO-less (i.e. purely interrupt-driven) setups, because I mixed together debounce and release timers when they are in fact separate: Unable to handle kernel NULL pointer dereference at virtual address 0000000c ... PC is at hrtimer_active+0xc/0x98 LR is at hrtimer_try_to_cancel+0x24/0x140 ... [] (hrtimer_active) from [] (hrtimer_try_to_cancel+0x24/0x140) [] (hrtimer_try_to_cancel) from [] (hrtimer_cancel+0x14/0x4c) [] (hrtimer_cancel) from [] (gpio_keys_attr_store_helper+0x1b8/0x1d8 [gpio_keys]) [] (gpio_keys_attr_store_helper [gpio_keys]) from [] (gpio_keys_store_disabled_keys+0x18/0x24 [gpio_keys]) [] (gpio_keys_store_disabled_keys [gpio_keys]) from [] (kernfs_fop_write_iter+0x10c/0x1cc) [] (kernfs_fop_write_iter) from [] (vfs_write+0x2ac/0x404) [] (vfs_write) from [] (ksys_write+0x64/0xdc) [] (ksys_write) from [] (ret_fast_syscall+0x0/0x58) Let's fix it up. Fixes: c9efb0ba281e ("Input: gpio-keys - use hrtimer for software debounce, if possible") Reported-by: Tony Lindgren Tested-by: Tony Lindgren Link: https://lore.kernel.org/r/YG1DFFgojSVfdpaz@google.com Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/gpio_keys.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index fe8fc76ee22e..8dbf1e69c90a 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -125,6 +125,18 @@ static const unsigned long *get_bm_events_by_type(struct input_dev *dev, return (type == EV_KEY) ? dev->keybit : dev->swbit; } +static void gpio_keys_quiesce_key(void *data) +{ + struct gpio_button_data *bdata = data; + + if (!bdata->gpiod) + hrtimer_cancel(&bdata->release_timer); + if (bdata->debounce_use_hrtimer) + hrtimer_cancel(&bdata->debounce_timer); + else + cancel_delayed_work_sync(&bdata->work); +} + /** * gpio_keys_disable_button() - disables given GPIO button * @bdata: button data for button to be disabled @@ -145,12 +157,7 @@ static void gpio_keys_disable_button(struct gpio_button_data *bdata) * Disable IRQ and associated timer/work structure. */ disable_irq(bdata->irq); - - if (bdata->debounce_use_hrtimer) - hrtimer_cancel(&bdata->release_timer); - else - cancel_delayed_work_sync(&bdata->work); - + gpio_keys_quiesce_key(bdata); bdata->disabled = true; } } @@ -492,16 +499,6 @@ out: return IRQ_HANDLED; } -static void gpio_keys_quiesce_key(void *data) -{ - struct gpio_button_data *bdata = data; - - if (bdata->debounce_use_hrtimer) - hrtimer_cancel(&bdata->debounce_timer); - else - cancel_delayed_work_sync(&bdata->work); -} - static int gpio_keys_setup_key(struct platform_device *pdev, struct input_dev *input, struct gpio_keys_drvdata *ddata, @@ -635,7 +632,6 @@ static int gpio_keys_setup_key(struct platform_device *pdev, } bdata->release_delay = button->debounce_interval; - bdata->debounce_use_hrtimer = true; hrtimer_init(&bdata->release_timer, CLOCK_REALTIME, HRTIMER_MODE_REL_HARD); bdata->release_timer.function = gpio_keys_irq_timer; From 3bbfd319034ddce59e023837a4aa11439460509b Mon Sep 17 00:00:00 2001 From: Feilong Lin Date: Thu, 25 Mar 2021 15:26:00 +0800 Subject: [PATCH 0319/1379] ACPI / hotplug / PCI: Fix reference count leak in enable_slot() In enable_slot(), if pci_get_slot() returns NULL, we clear the SLOT_ENABLED flag. When pci_get_slot() finds a device, it increments the device's reference count. In this case, we did not call pci_dev_put() to decrement the reference count, so the memory of the device (struct pci_dev type) will eventually leak. Call pci_dev_put() to decrement its reference count when pci_get_slot() returns a PCI device. Link: https://lore.kernel.org/r/b411af88-5049-a1c6-83ac-d104a1f429be@huawei.com Signed-off-by: Feilong Lin Signed-off-by: Zhiqiang Liu Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki --- drivers/pci/hotplug/acpiphp_glue.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 3365c93abf0e..f031302ad401 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -533,6 +533,7 @@ static void enable_slot(struct acpiphp_slot *slot, bool bridge) slot->flags &= ~SLOT_ENABLED; continue; } + pci_dev_put(dev); } } From 5859c926d1f052ee61b5815b14658875c14f6243 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Thu, 8 Apr 2021 15:26:58 +0800 Subject: [PATCH 0320/1379] PCI: tegra: Fix runtime PM imbalance in pex_ep_event_pex_rst_deassert() pm_runtime_get_sync() will increase the runtime PM counter even it returns an error. Thus a pairing decrement is needed to prevent refcount leak. Fix this by replacing this API with pm_runtime_resume_and_get(), which will not change the runtime PM counter on error. Link: https://lore.kernel.org/r/20210408072700.15791-1-dinghao.liu@zju.edu.cn Signed-off-by: Dinghao Liu Signed-off-by: Lorenzo Pieralisi Acked-by: Thierry Reding --- drivers/pci/controller/dwc/pcie-tegra194.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 18acd48e8e9b..9b6799258fa4 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -1645,7 +1645,7 @@ static void pex_ep_event_pex_rst_deassert(struct tegra_pcie_dw *pcie) if (pcie->ep_state == EP_STATE_ENABLED) return; - ret = pm_runtime_get_sync(dev); + ret = pm_runtime_resume_and_get(dev); if (ret < 0) { dev_err(dev, "Failed to get runtime sync for PCIe dev: %d\n", ret); From 1a7a6e8072ea0e4582de2da63a9088841fde798e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 6 Apr 2021 09:30:36 +0200 Subject: [PATCH 0321/1379] pwm: Clarify which state pwm_get_state() returns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Given that lowlevel drivers usually cannot implement exactly what a consumer requests with pwm_apply_state() there is some rounding involved. pwm_get_state() returns the setting that was requested most recently by the consumer (opposed to what was actually implemented in hardware in reply to the last request). Clarify this in the function kerneldoc. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- Documentation/driver-api/pwm.rst | 6 +++++- include/linux/pwm.h | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst index ab62f1bb0366..381f3c46cdac 100644 --- a/Documentation/driver-api/pwm.rst +++ b/Documentation/driver-api/pwm.rst @@ -55,7 +55,11 @@ several parameter at once. For example, if you see pwm_config() and pwm_{enable,disable}() calls in the same function, this probably means you should switch to pwm_apply_state(). -The PWM user API also allows one to query the PWM state with pwm_get_state(). +The PWM user API also allows one to query the last applied PWM state with +pwm_get_last_applied_state(). Note this is different to what the driver has +actually implemented if the request cannot be implemented exactly with the +hardware in use. There is currently no way for consumers to get the actually +implemented settings. In addition to the PWM state, the PWM API also exposes PWM arguments, which are the reference PWM config one should use on this PWM. diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 8f4eefd129aa..5bb90af4997e 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -91,6 +91,11 @@ struct pwm_device { * pwm_get_state() - retrieve the current PWM state * @pwm: PWM device * @state: state to fill with the current PWM state + * + * The returned PWM state represents the state that was applied by a previous call to + * pwm_apply_state(). Drivers may have to slightly tweak that state before programming it to + * hardware. If pwm_apply_state() was never called, this returns either the current hardware + * state (if supported) or the default settings. */ static inline void pwm_get_state(const struct pwm_device *pwm, struct pwm_state *state) From 89c6f314602efeecfe8d9093571b3bbfe1029ab5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 24 Mar 2021 20:56:35 +0100 Subject: [PATCH 0322/1379] pwm: atmel: Free resources only after pwmchip_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before pwmchip_remove() returns the PWM is expected to be functional. So remove the pwmchip before disabling the clock. Signed-off-by: Uwe Kleine-König Acked-by: Claudiu Beznea Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index a4d0be6b265b..d49da708337f 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -450,10 +450,12 @@ static int atmel_pwm_remove(struct platform_device *pdev) { struct atmel_pwm_chip *atmel_pwm = platform_get_drvdata(pdev); + pwmchip_remove(&atmel_pwm->chip); + clk_unprepare(atmel_pwm->clk); mutex_destroy(&atmel_pwm->isr_lock); - return pwmchip_remove(&atmel_pwm->chip); + return 0; } static struct platform_driver atmel_pwm_driver = { From d4ac3917bca64a4f630dc6f7ba47fe71dbf0273e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 24 Mar 2021 21:01:34 +0100 Subject: [PATCH 0323/1379] pwm: bcm-iproc: Free resources only after pwmchip_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before pwmchip_remove() returns the PWM is expected to be functional. So remove the pwmchip before disabling the clock. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-bcm-iproc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-bcm-iproc.c b/drivers/pwm/pwm-bcm-iproc.c index 529a66ab692d..edd2ce1760ab 100644 --- a/drivers/pwm/pwm-bcm-iproc.c +++ b/drivers/pwm/pwm-bcm-iproc.c @@ -253,9 +253,11 @@ static int iproc_pwmc_remove(struct platform_device *pdev) { struct iproc_pwmc *ip = platform_get_drvdata(pdev); + pwmchip_remove(&ip->chip); + clk_disable_unprepare(ip->clk); - return pwmchip_remove(&ip->chip); + return 0; } static const struct of_device_id bcm_iproc_pwmc_dt[] = { From 3c817469a53d93bbae52f8ead207dc0b9aeebae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Thu, 25 Mar 2021 09:29:31 +0100 Subject: [PATCH 0324/1379] pwm: bcm2835: Free resources only after pwmchip_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before pwmchip_remove() returns the PWM is expected to be functional. So remove the pwmchip before disabling the clock. Signed-off-by: Uwe Kleine-König Acked-by: Florian Fainelli Signed-off-by: Thierry Reding --- drivers/pwm/pwm-bcm2835.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c index e4b54675b356..fc240d5b8121 100644 --- a/drivers/pwm/pwm-bcm2835.c +++ b/drivers/pwm/pwm-bcm2835.c @@ -179,9 +179,11 @@ static int bcm2835_pwm_remove(struct platform_device *pdev) { struct bcm2835_pwm *pc = platform_get_drvdata(pdev); + pwmchip_remove(&pc->chip); + clk_disable_unprepare(pc->clk); - return pwmchip_remove(&pc->chip); + return 0; } static const struct of_device_id bcm2835_pwm_of_match[] = { From 819e82460ac858cdca38f748829979602a7708ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 26 Mar 2021 09:18:04 +0100 Subject: [PATCH 0325/1379] pwm: bcm-kona: Don't modify HW state in .remove callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A consumer is expected to disable a PWM before calling pwm_put(). And if they didn't there is hopefully a good reason (or the consumer needs fixing.) Also if disabling an enabled PWM was the right thing to do, this should better be done in the framework instead of in each low level driver. So drop the hardware modification from the .remove() callback. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-bcm-kona.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/pwm/pwm-bcm-kona.c b/drivers/pwm/pwm-bcm-kona.c index f09a31042859..800b9edf2e71 100644 --- a/drivers/pwm/pwm-bcm-kona.c +++ b/drivers/pwm/pwm-bcm-kona.c @@ -310,11 +310,6 @@ static int kona_pwmc_probe(struct platform_device *pdev) static int kona_pwmc_remove(struct platform_device *pdev) { struct kona_pwmc *kp = platform_get_drvdata(pdev); - unsigned int chan; - - for (chan = 0; chan < kp->chip.npwm; chan++) - if (pwm_is_enabled(&kp->chip.pwms[chan])) - clk_disable_unprepare(kp->clk); return pwmchip_remove(&kp->chip); } From d58a484e7cf00412b8f7c17cd60caaa9fb4c2b42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Sat, 27 Mar 2021 22:24:28 +0100 Subject: [PATCH 0326/1379] pwm: lpc18xx-sct: Free resources only after pwmchip_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before pwmchip_remove() returns the PWM is expected to be functional. So remove the pwmchip before disabling the clock. Signed-off-by: Uwe Kleine-König Acked-by: Vladimir Zapolskiy Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpc18xx-sct.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-lpc18xx-sct.c b/drivers/pwm/pwm-lpc18xx-sct.c index 3f8e54ec28c6..b643ac61a2e7 100644 --- a/drivers/pwm/pwm-lpc18xx-sct.c +++ b/drivers/pwm/pwm-lpc18xx-sct.c @@ -441,13 +441,15 @@ static int lpc18xx_pwm_remove(struct platform_device *pdev) struct lpc18xx_pwm_chip *lpc18xx_pwm = platform_get_drvdata(pdev); u32 val; + pwmchip_remove(&lpc18xx_pwm->chip); + val = lpc18xx_pwm_readl(lpc18xx_pwm, LPC18XX_PWM_CTRL); lpc18xx_pwm_writel(lpc18xx_pwm, LPC18XX_PWM_CTRL, val | LPC18XX_PWM_CTRL_HALT); clk_disable_unprepare(lpc18xx_pwm->pwm_clk); - return pwmchip_remove(&lpc18xx_pwm->chip); + return 0; } static struct platform_driver lpc18xx_pwm_driver = { From 13ef0414c891658c7d4c1d13fbd58a8d59a09dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 29 Mar 2021 08:41:11 +0200 Subject: [PATCH 0327/1379] pwm: lpc3200: Don't modify HW state in .remove callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A consumer is expected to disable a PWM before calling pwm_put(). And if they didn't there is hopefully a good reason (or the consumer needs fixing). Also if disabling an enabled PWM was the right thing to do, this should better be done in the framework instead of in each low level driver. So drop the hardware modification from the .remove() callback. Signed-off-by: Uwe Kleine-König Acked-by: Vladimir Zapolskiy Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpc32xx.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/pwm/pwm-lpc32xx.c b/drivers/pwm/pwm-lpc32xx.c index c42cc314c170..2834a0f001d3 100644 --- a/drivers/pwm/pwm-lpc32xx.c +++ b/drivers/pwm/pwm-lpc32xx.c @@ -136,10 +136,6 @@ static int lpc32xx_pwm_probe(struct platform_device *pdev) static int lpc32xx_pwm_remove(struct platform_device *pdev) { struct lpc32xx_pwm_chip *lpc32xx = platform_get_drvdata(pdev); - unsigned int i; - - for (i = 0; i < lpc32xx->chip.npwm; i++) - pwm_disable(&lpc32xx->chip.pwms[i]); return pwmchip_remove(&lpc32xx->chip); } From a9ea2e793e5aee690b90d115dbb9229934d30f2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 30 Mar 2021 14:37:41 +0200 Subject: [PATCH 0328/1379] pwm: sti: Don't modify HW state in .remove callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A consumer is expected to disable a PWM before calling pwm_put(). And if they didn't there is hopefully a good reason (or the consumer needs fixing). Also if disabling an enabled PWM was the right thing to do, this should better be done in the framework instead of in each low level driver. So drop the hardware modification from the .remove() callback. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sti.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c index aa2b211d7ee3..3064b320df93 100644 --- a/drivers/pwm/pwm-sti.c +++ b/drivers/pwm/pwm-sti.c @@ -649,10 +649,7 @@ static int sti_pwm_probe(struct platform_device *pdev) static int sti_pwm_remove(struct platform_device *pdev) { struct sti_pwm_chip *pc = platform_get_drvdata(pdev); - unsigned int i; - for (i = 0; i < pc->cdata->pwm_num_devs; i++) - pwm_disable(&pc->chip.pwms[i]); clk_unprepare(pc->pwm_clk); clk_unprepare(pc->cpt_clk); From 0e719e8ca3946bac0034152309fe7565616bde50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 30 Mar 2021 14:37:42 +0200 Subject: [PATCH 0329/1379] pwm: sti: Free resources only after pwmchip_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before pwmchip_remove() returns the PWM is expected to be functional. So remove the pwmchip before disabling the clocks. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-sti.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c index 3064b320df93..f491d56254d7 100644 --- a/drivers/pwm/pwm-sti.c +++ b/drivers/pwm/pwm-sti.c @@ -650,11 +650,12 @@ static int sti_pwm_remove(struct platform_device *pdev) { struct sti_pwm_chip *pc = platform_get_drvdata(pdev); + pwmchip_remove(&pc->chip); clk_unprepare(pc->pwm_clk); clk_unprepare(pc->cpt_clk); - return pwmchip_remove(&pc->chip); + return 0; } static const struct of_device_id sti_pwm_of_match[] = { From 64d7d074acd52e1bdff621f2cb86c0aae9bcef80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 7 Apr 2021 10:01:53 +0200 Subject: [PATCH 0330/1379] pwm: lpss: Don't modify HW state in .remove callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A consumer is expected to disable a PWM before calling pwm_put(). And if they didn't there is hopefully a good reason (or the consumer needs fixing). Also if disabling an enabled PWM was the right thing to do, this should better be done in the framework instead of in each low level driver. So drop the hardware modification from the .remove() callback. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpss.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c index 9ea6a176cbe5..58b4031524af 100644 --- a/drivers/pwm/pwm-lpss.c +++ b/drivers/pwm/pwm-lpss.c @@ -254,12 +254,6 @@ EXPORT_SYMBOL_GPL(pwm_lpss_probe); int pwm_lpss_remove(struct pwm_lpss_chip *lpwm) { - int i; - - for (i = 0; i < lpwm->info->npwm; i++) { - if (pwm_is_enabled(&lpwm->chip.pwms[i])) - pm_runtime_put(lpwm->chip.dev); - } return pwmchip_remove(&lpwm->chip); } EXPORT_SYMBOL_GPL(pwm_lpss_remove); From 26594c6bbb60c6bc87e3762a86ceece57d164c66 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sat, 6 Mar 2021 05:36:24 -0800 Subject: [PATCH 0331/1379] rpmsg: qcom_glink_native: fix error return code of qcom_glink_rx_data() When idr_find() returns NULL to intent, no error return code of qcom_glink_rx_data() is assigned. To fix this bug, ret is assigned with -ENOENT in this case. Fixes: 64f95f87920d ("rpmsg: glink: Use the local intents when receiving data") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Link: https://lore.kernel.org/r/20210306133624.17237-1-baijiaju1990@gmail.com Signed-off-by: Bjorn Andersson --- drivers/rpmsg/qcom_glink_native.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rpmsg/qcom_glink_native.c b/drivers/rpmsg/qcom_glink_native.c index edae0c9a8236..05533c71b10e 100644 --- a/drivers/rpmsg/qcom_glink_native.c +++ b/drivers/rpmsg/qcom_glink_native.c @@ -857,6 +857,7 @@ static int qcom_glink_rx_data(struct qcom_glink *glink, size_t avail) dev_err(glink->dev, "no intent found for channel %s intent %d", channel->name, liid); + ret = -ENOENT; goto advance_rx; } } From 0349a070881f7e3b4472d886989db092ed3ccac8 Mon Sep 17 00:00:00 2001 From: Raphael Norwitz Date: Thu, 8 Apr 2021 19:05:27 +0000 Subject: [PATCH 0332/1379] PCI: Delay after FLR of Intel DC P4510 NVMe Like the Intel DC P3700 NVMe, the Intel P4510 NVMe exhibits a timeout failure when the driver tries to interact with the device too soon after an FLR. Add a device-specific reset method that delays 250ms after doing an FLR. Link: https://lore.kernel.org/r/20210408190521.16897-1-raphael.norwitz@nutanix.com Signed-off-by: Alay Shah Signed-off-by: Suresh Gumpula Signed-off-by: Raphael Norwitz Signed-off-by: Bjorn Helgaas Reviewed-by: Alex Williamson --- drivers/pci/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 653660e3ba9e..5a8c059b848d 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3922,6 +3922,7 @@ static const struct pci_dev_reset_methods pci_dev_reset_methods[] = { reset_ivb_igd }, { PCI_VENDOR_ID_SAMSUNG, 0xa804, nvme_disable_and_flr }, { PCI_VENDOR_ID_INTEL, 0x0953, delay_250ms_after_flr }, + { PCI_VENDOR_ID_INTEL, 0x0a54, delay_250ms_after_flr }, { PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID, reset_chelsio_generic_dev }, { 0 } From 65299e8bfb24774e6340e93ae49f6626598917c8 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 9 Apr 2021 22:29:07 -0700 Subject: [PATCH 0333/1379] Input: elants_i2c - do not bind to i2c-hid compatible ACPI instantiated devices Several users have been reporting that elants_i2c gives several errors during probe and that their touchscreen does not work on their Lenovo AMD based laptops with a touchscreen with a ELAN0001 ACPI hardware-id: [ 0.550596] elants_i2c i2c-ELAN0001:00: i2c-ELAN0001:00 supply vcc33 not found, using dummy regulator [ 0.551836] elants_i2c i2c-ELAN0001:00: i2c-ELAN0001:00 supply vccio not found, using dummy regulator [ 0.560932] elants_i2c i2c-ELAN0001:00: elants_i2c_send failed (77 77 77 77): -121 [ 0.562427] elants_i2c i2c-ELAN0001:00: software reset failed: -121 [ 0.595925] elants_i2c i2c-ELAN0001:00: elants_i2c_send failed (77 77 77 77): -121 [ 0.597974] elants_i2c i2c-ELAN0001:00: software reset failed: -121 [ 0.621893] elants_i2c i2c-ELAN0001:00: elants_i2c_send failed (77 77 77 77): -121 [ 0.622504] elants_i2c i2c-ELAN0001:00: software reset failed: -121 [ 0.632650] elants_i2c i2c-ELAN0001:00: elants_i2c_send failed (4d 61 69 6e): -121 [ 0.634256] elants_i2c i2c-ELAN0001:00: boot failed: -121 [ 0.699212] elants_i2c i2c-ELAN0001:00: invalid 'hello' packet: 00 00 ff ff [ 1.630506] elants_i2c i2c-ELAN0001:00: Failed to read fw id: -121 [ 1.645508] elants_i2c i2c-ELAN0001:00: unknown packet 00 00 ff ff Despite these errors, the elants_i2c driver stays bound to the device (it returns 0 from its probe method despite the errors), blocking the i2c-hid driver from binding. Manually unbinding the elants_i2c driver and binding the i2c-hid driver makes the touchscreen work. Check if the ACPI-fwnode for the touchscreen contains one of the i2c-hid compatiblity-id strings and if it has the I2C-HID spec's DSM to get the HID descriptor address, If it has both then make elants_i2c not bind, so that the i2c-hid driver can bind. This assumes that non of the (older) elan touchscreens which actually need the elants_i2c driver falsely advertise an i2c-hid compatiblity-id + DSM in their ACPI-fwnodes. If some of them actually do have this false advertising, then this change may lead to regressions. While at it also drop the unnecessary DEVICE_NAME prefixing of the "I2C check functionality error", dev_err already outputs the driver-name. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=207759 Acked-by: Benjamin Tissoires Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20210405202756.16830-1-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/elants_i2c.c | 44 ++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/drivers/input/touchscreen/elants_i2c.c b/drivers/input/touchscreen/elants_i2c.c index 4c2b579f6c8b..36cf5694bfcc 100644 --- a/drivers/input/touchscreen/elants_i2c.c +++ b/drivers/input/touchscreen/elants_i2c.c @@ -38,6 +38,7 @@ #include #include #include +#include #include /* Device, Driver information */ @@ -1334,6 +1335,40 @@ static void elants_i2c_power_off(void *_data) } } +#ifdef CONFIG_ACPI +static const struct acpi_device_id i2c_hid_ids[] = { + {"ACPI0C50", 0 }, + {"PNP0C50", 0 }, + { }, +}; + +static const guid_t i2c_hid_guid = + GUID_INIT(0x3CDFF6F7, 0x4267, 0x4555, + 0xAD, 0x05, 0xB3, 0x0A, 0x3D, 0x89, 0x38, 0xDE); + +static bool elants_acpi_is_hid_device(struct device *dev) +{ + acpi_handle handle = ACPI_HANDLE(dev); + union acpi_object *obj; + + if (acpi_match_device_ids(ACPI_COMPANION(dev), i2c_hid_ids)) + return false; + + obj = acpi_evaluate_dsm_typed(handle, &i2c_hid_guid, 1, 1, NULL, ACPI_TYPE_INTEGER); + if (obj) { + ACPI_FREE(obj); + return true; + } + + return false; +} +#else +static bool elants_acpi_is_hid_device(struct device *dev) +{ + return false; +} +#endif + static int elants_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) { @@ -1342,9 +1377,14 @@ static int elants_i2c_probe(struct i2c_client *client, unsigned long irqflags; int error; + /* Don't bind to i2c-hid compatible devices, these are handled by the i2c-hid drv. */ + if (elants_acpi_is_hid_device(&client->dev)) { + dev_warn(&client->dev, "This device appears to be an I2C-HID device, not binding\n"); + return -ENODEV; + } + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { - dev_err(&client->dev, - "%s: i2c check functionality error\n", DEVICE_NAME); + dev_err(&client->dev, "I2C check functionality error\n"); return -ENXIO; } From e479187748a8f151a85116a7091c599b121fdea5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 9 Apr 2021 22:29:49 -0700 Subject: [PATCH 0334/1379] Input: silead - add workaround for x86 BIOS-es which bring the chip up in a stuck state Some buggy BIOS-es bring up the touchscreen-controller in a stuck state where it blocks the I2C bus. Specifically this happens on the Jumper EZpad 7 tablet model. After much poking at this problem I have found that the following steps are necessary to unstuck the chip / bus: 1. Turn off the Silead chip. 2. Try to do an I2C transfer with the chip, this will fail in response to which the I2C-bus-driver will call: i2c_recover_bus() which will unstuck the I2C-bus. Note the unstuck-ing of the I2C bus only works if we first drop the chip of the bus by turning it off. 3. Turn the chip back on. On the x86/ACPI systems were this problem is seen, step 1. and 3. require making ACPI calls and dealing with ACPI Power Resources. This commit adds a workaround which runtime-suspends the chip to turn it off, leaving it up to the ACPI subsystem to deal with all the ACPI specific details. There is no good way to detect this bug, so the workaround gets activated by a new "silead,stuck-controller-bug" boolean device-property. Since this is only used on x86/ACPI, this will be set by model specific device-props set by drivers/platform/x86/touchscreen_dmi.c. Therefor this new device-property is not documented in the DT-bindings. Dmesg will contain the following messages on systems where the workaround is activated: [ 54.309029] silead_ts i2c-MSSL1680:00: [Firmware Bug]: Stuck I2C bus: please ignore the next 'controller timed out' error [ 55.373593] i2c_designware 808622C1:04: controller timed out [ 55.582186] silead_ts i2c-MSSL1680:00: Silead chip ID: 0x80360000 Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20210405202745.16777-1-hdegoede@redhat.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/silead.c | 44 +++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/drivers/input/touchscreen/silead.c b/drivers/input/touchscreen/silead.c index 32725d7422de..1ee760bac0cf 100644 --- a/drivers/input/touchscreen/silead.c +++ b/drivers/input/touchscreen/silead.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -335,10 +336,8 @@ static int silead_ts_get_id(struct i2c_client *client) error = i2c_smbus_read_i2c_block_data(client, SILEAD_REG_ID, sizeof(chip_id), (u8 *)&chip_id); - if (error < 0) { - dev_err(&client->dev, "Chip ID read error %d\n", error); + if (error < 0) return error; - } data->chip_id = le32_to_cpu(chip_id); dev_info(&client->dev, "Silead chip ID: 0x%8X", data->chip_id); @@ -351,12 +350,49 @@ static int silead_ts_setup(struct i2c_client *client) int error; u32 status; + /* + * Some buggy BIOS-es bring up the chip in a stuck state where it + * blocks the I2C bus. The following steps are necessary to + * unstuck the chip / bus: + * 1. Turn off the Silead chip. + * 2. Try to do an I2C transfer with the chip, this will fail in + * response to which the I2C-bus-driver will call: + * i2c_recover_bus() which will unstuck the I2C-bus. Note the + * unstuck-ing of the I2C bus only works if we first drop the + * chip off the bus by turning it off. + * 3. Turn the chip back on. + * + * On the x86/ACPI systems were this problem is seen, step 1. and + * 3. require making ACPI calls and dealing with ACPI Power + * Resources. The workaround below runtime-suspends the chip to + * turn it off, leaving it up to the ACPI subsystem to deal with + * this. + */ + + if (device_property_read_bool(&client->dev, + "silead,stuck-controller-bug")) { + pm_runtime_set_active(&client->dev); + pm_runtime_enable(&client->dev); + pm_runtime_allow(&client->dev); + + pm_runtime_suspend(&client->dev); + + dev_warn(&client->dev, FW_BUG "Stuck I2C bus: please ignore the next 'controller timed out' error\n"); + silead_ts_get_id(client); + + /* The forbid will also resume the device */ + pm_runtime_forbid(&client->dev); + pm_runtime_disable(&client->dev); + } + silead_ts_set_power(client, SILEAD_POWER_OFF); silead_ts_set_power(client, SILEAD_POWER_ON); error = silead_ts_get_id(client); - if (error) + if (error) { + dev_err(&client->dev, "Chip ID read error %d\n", error); return error; + } error = silead_ts_init(client); if (error) From 2911ce35faf3df41eb09610cc5c55796fe69104b Mon Sep 17 00:00:00 2001 From: Vincent Knecht Date: Fri, 9 Apr 2021 13:16:49 -0700 Subject: [PATCH 0335/1379] dt-bindings: input/touchscreen: add bindings for msg2638 This adds dts bindings for the mstar msg2638 touchscreen. Reviewed-by: Linus Walleij Signed-off-by: Vincent Knecht Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20210305153815.126937-1-vincent.knecht@mailoo.org Signed-off-by: Dmitry Torokhov --- .../input/touchscreen/mstar,msg2638.yaml | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml diff --git a/Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml b/Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml new file mode 100644 index 000000000000..3a42c23faf6f --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/mstar,msg2638.yaml @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/touchscreen/mstar,msg2638.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MStar msg2638 touchscreen controller Bindings + +maintainers: + - Vincent Knecht + +allOf: + - $ref: touchscreen.yaml# + +properties: + compatible: + const: mstar,msg2638 + + reg: + const: 0x26 + + interrupts: + maxItems: 1 + + reset-gpios: + maxItems: 1 + + vdd-supply: + description: Power supply regulator for the chip + + vddio-supply: + description: Power supply regulator for the I2C bus + + touchscreen-size-x: true + touchscreen-size-y: true + +additionalProperties: false + +required: + - compatible + - reg + - interrupts + - reset-gpios + - touchscreen-size-x + - touchscreen-size-y + +examples: + - | + #include + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + touchscreen@26 { + compatible = "mstar,msg2638"; + reg = <0x26>; + interrupt-parent = <&msmgpio>; + interrupts = <13 IRQ_TYPE_EDGE_FALLING>; + reset-gpios = <&msmgpio 100 GPIO_ACTIVE_LOW>; + pinctrl-names = "default"; + pinctrl-0 = <&ts_int_reset_default>; + vdd-supply = <&pm8916_l17>; + vddio-supply = <&pm8916_l5>; + touchscreen-size-x = <2048>; + touchscreen-size-y = <2048>; + }; + }; + +... From cbdb24e59e7fc1943bc72bab4a7b477d298a2d80 Mon Sep 17 00:00:00 2001 From: Vincent Knecht Date: Fri, 9 Apr 2021 13:29:03 -0700 Subject: [PATCH 0336/1379] Input: add MStar MSG2638 touchscreen driver Add support for the msg2638 touchscreen IC from MStar. Firmware handling, wakeup gestures and other specialties are not supported. This driver reuses zinitix.c structure, while the checksum and irq handler functions are based on out-of-tree driver for Alcatel Idol 3 (4.7"). Signed-off-by: Vincent Knecht Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20210305153815.126937-2-vincent.knecht@mailoo.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/Kconfig | 12 + drivers/input/touchscreen/Makefile | 1 + drivers/input/touchscreen/msg2638.c | 337 ++++++++++++++++++++++++++++ 3 files changed, 350 insertions(+) create mode 100644 drivers/input/touchscreen/msg2638.c diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index aead3ad6ba6a..db1a4c78eec3 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -590,6 +590,18 @@ config TOUCHSCREEN_MELFAS_MIP4 To compile this driver as a module, choose M here: the module will be called melfas_mip4. +config TOUCHSCREEN_MSG2638 + tristate "MStar msg2638 touchscreen support" + depends on I2C + depends on GPIOLIB || COMPILE_TEST + help + Say Y here if you have an I2C touchscreen using MStar msg2638. + + If unsure, say N. + + To compile this driver as a module, choose M here: the + module will be called msg2638. + config TOUCHSCREEN_MTOUCH tristate "MicroTouch serial touchscreens" select SERIO diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index 80cd241b4c1b..ee1d3d3fd918 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_TOUCHSCREEN_MCS5000) += mcs5000_ts.o obj-$(CONFIG_TOUCHSCREEN_MELFAS_MIP4) += melfas_mip4.o obj-$(CONFIG_TOUCHSCREEN_MIGOR) += migor_ts.o obj-$(CONFIG_TOUCHSCREEN_MMS114) += mms114.o +obj-$(CONFIG_TOUCHSCREEN_MSG2638) += msg2638.o obj-$(CONFIG_TOUCHSCREEN_MTOUCH) += mtouch.o obj-$(CONFIG_TOUCHSCREEN_MK712) += mk712.o obj-$(CONFIG_TOUCHSCREEN_HP600) += hp680_ts_input.o diff --git a/drivers/input/touchscreen/msg2638.c b/drivers/input/touchscreen/msg2638.c new file mode 100644 index 000000000000..75536bc88969 --- /dev/null +++ b/drivers/input/touchscreen/msg2638.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Driver for MStar msg2638 touchscreens + * + * Copyright (c) 2021 Vincent Knecht + * + * Checksum and IRQ handler based on mstar_drv_common.c and + * mstar_drv_mutual_fw_control.c + * Copyright (c) 2006-2012 MStar Semiconductor, Inc. + * + * Driver structure based on zinitix.c by Michael Srba + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODE_DATA_RAW 0x5A + +#define MAX_SUPPORTED_FINGER_NUM 5 + +#define CHIP_ON_DELAY_MS 15 +#define FIRMWARE_ON_DELAY_MS 50 +#define RESET_DELAY_MIN_US 10000 +#define RESET_DELAY_MAX_US 11000 + +struct packet { + u8 xy_hi; /* higher bits of x and y coordinates */ + u8 x_low; + u8 y_low; + u8 pressure; +}; + +struct touch_event { + u8 mode; + struct packet pkt[MAX_SUPPORTED_FINGER_NUM]; + u8 proximity; + u8 checksum; +}; + +struct msg2638_ts_data { + struct i2c_client *client; + struct input_dev *input_dev; + struct touchscreen_properties prop; + struct regulator_bulk_data supplies[2]; + struct gpio_desc *reset_gpiod; +}; + +static u8 msg2638_checksum(u8 *data, u32 length) +{ + s32 sum = 0; + u32 i; + + for (i = 0; i < length; i++) + sum += data[i]; + + return (u8)((-sum) & 0xFF); +} + +static irqreturn_t msg2638_ts_irq_handler(int irq, void *msg2638_handler) +{ + struct msg2638_ts_data *msg2638 = msg2638_handler; + struct i2c_client *client = msg2638->client; + struct input_dev *input = msg2638->input_dev; + struct touch_event touch_event; + u32 len = sizeof(touch_event); + struct i2c_msg msg[] = { + { + .addr = client->addr, + .flags = I2C_M_RD, + .len = sizeof(touch_event), + .buf = (u8 *)&touch_event, + }, + }; + struct packet *p; + u16 x, y; + int ret; + int i; + + ret = i2c_transfer(client->adapter, msg, ARRAY_SIZE(msg)); + if (ret != ARRAY_SIZE(msg)) { + dev_err(&client->dev, + "Failed I2C transfer in irq handler: %d\n", + ret < 0 ? ret : -EIO); + goto out; + } + + if (touch_event.mode != MODE_DATA_RAW) + goto out; + + if (msg2638_checksum((u8 *)&touch_event, len - 1) != + touch_event.checksum) { + dev_err(&client->dev, "Failed checksum!\n"); + goto out; + } + + for (i = 0; i < MAX_SUPPORTED_FINGER_NUM; i++) { + p = &touch_event.pkt[i]; + + /* Ignore non-pressed finger data */ + if (p->xy_hi == 0xFF && p->x_low == 0xFF && p->y_low == 0xFF) + continue; + + x = (((p->xy_hi & 0xF0) << 4) | p->x_low); + y = (((p->xy_hi & 0x0F) << 8) | p->y_low); + + input_mt_slot(input, i); + input_mt_report_slot_state(input, MT_TOOL_FINGER, true); + touchscreen_report_pos(input, &msg2638->prop, x, y, true); + } + + input_mt_sync_frame(msg2638->input_dev); + input_sync(msg2638->input_dev); + +out: + return IRQ_HANDLED; +} + +static void msg2638_reset(struct msg2638_ts_data *msg2638) +{ + gpiod_set_value_cansleep(msg2638->reset_gpiod, 1); + usleep_range(RESET_DELAY_MIN_US, RESET_DELAY_MAX_US); + gpiod_set_value_cansleep(msg2638->reset_gpiod, 0); + msleep(FIRMWARE_ON_DELAY_MS); +} + +static int msg2638_start(struct msg2638_ts_data *msg2638) +{ + int error; + + error = regulator_bulk_enable(ARRAY_SIZE(msg2638->supplies), + msg2638->supplies); + if (error) { + dev_err(&msg2638->client->dev, + "Failed to enable regulators: %d\n", error); + return error; + } + + msleep(CHIP_ON_DELAY_MS); + + msg2638_reset(msg2638); + + enable_irq(msg2638->client->irq); + + return 0; +} + +static int msg2638_stop(struct msg2638_ts_data *msg2638) +{ + int error; + + disable_irq(msg2638->client->irq); + + error = regulator_bulk_disable(ARRAY_SIZE(msg2638->supplies), + msg2638->supplies); + if (error) { + dev_err(&msg2638->client->dev, + "Failed to disable regulators: %d\n", error); + return error; + } + + return 0; +} + +static int msg2638_input_open(struct input_dev *dev) +{ + struct msg2638_ts_data *msg2638 = input_get_drvdata(dev); + + return msg2638_start(msg2638); +} + +static void msg2638_input_close(struct input_dev *dev) +{ + struct msg2638_ts_data *msg2638 = input_get_drvdata(dev); + + msg2638_stop(msg2638); +} + +static int msg2638_init_input_dev(struct msg2638_ts_data *msg2638) +{ + struct device *dev = &msg2638->client->dev; + struct input_dev *input_dev; + int error; + + input_dev = devm_input_allocate_device(dev); + if (!input_dev) { + dev_err(dev, "Failed to allocate input device.\n"); + return -ENOMEM; + } + + input_set_drvdata(input_dev, msg2638); + msg2638->input_dev = input_dev; + + input_dev->name = "MStar TouchScreen"; + input_dev->phys = "input/ts"; + input_dev->id.bustype = BUS_I2C; + input_dev->open = msg2638_input_open; + input_dev->close = msg2638_input_close; + + input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_X); + input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_Y); + + touchscreen_parse_properties(input_dev, true, &msg2638->prop); + if (!msg2638->prop.max_x || !msg2638->prop.max_y) { + dev_err(dev, "touchscreen-size-x and/or touchscreen-size-y not set in properties\n"); + return -EINVAL; + } + + error = input_mt_init_slots(input_dev, MAX_SUPPORTED_FINGER_NUM, + INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED); + if (error) { + dev_err(dev, "Failed to initialize MT slots: %d\n", error); + return error; + } + + error = input_register_device(input_dev); + if (error) { + dev_err(dev, "Failed to register input device: %d\n", error); + return error; + } + + return 0; +} + +static int msg2638_ts_probe(struct i2c_client *client) +{ + struct device *dev = &client->dev; + struct msg2638_ts_data *msg2638; + int error; + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + dev_err(dev, "Failed to assert adapter's support for plain I2C.\n"); + return -ENXIO; + } + + msg2638 = devm_kzalloc(dev, sizeof(*msg2638), GFP_KERNEL); + if (!msg2638) + return -ENOMEM; + + msg2638->client = client; + i2c_set_clientdata(client, msg2638); + + msg2638->supplies[0].supply = "vdd"; + msg2638->supplies[1].supply = "vddio"; + error = devm_regulator_bulk_get(dev, ARRAY_SIZE(msg2638->supplies), + msg2638->supplies); + if (error) { + dev_err(dev, "Failed to get regulators: %d\n", error); + return error; + } + + msg2638->reset_gpiod = devm_gpiod_get(dev, "reset", GPIOD_OUT_LOW); + if (IS_ERR(msg2638->reset_gpiod)) { + error = PTR_ERR(msg2638->reset_gpiod); + dev_err(dev, "Failed to request reset GPIO: %d\n", error); + return error; + } + + error = msg2638_init_input_dev(msg2638); + if (error) { + dev_err(dev, "Failed to initialize input device: %d\n", error); + return error; + } + + error = devm_request_threaded_irq(dev, client->irq, + NULL, msg2638_ts_irq_handler, + IRQF_ONESHOT | IRQF_NO_AUTOEN, + client->name, msg2638); + if (error) { + dev_err(dev, "Failed to request IRQ: %d\n", error); + return error; + } + + return 0; +} + +static int __maybe_unused msg2638_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct msg2638_ts_data *msg2638 = i2c_get_clientdata(client); + + mutex_lock(&msg2638->input_dev->mutex); + + if (input_device_enabled(msg2638->input_dev)) + msg2638_stop(msg2638); + + mutex_unlock(&msg2638->input_dev->mutex); + + return 0; +} + +static int __maybe_unused msg2638_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct msg2638_ts_data *msg2638 = i2c_get_clientdata(client); + int ret = 0; + + mutex_lock(&msg2638->input_dev->mutex); + + if (input_device_enabled(msg2638->input_dev)) + ret = msg2638_start(msg2638); + + mutex_unlock(&msg2638->input_dev->mutex); + + return ret; +} + +static SIMPLE_DEV_PM_OPS(msg2638_pm_ops, msg2638_suspend, msg2638_resume); + +static const struct of_device_id msg2638_of_match[] = { + { .compatible = "mstar,msg2638" }, + { } +}; +MODULE_DEVICE_TABLE(of, msg2638_of_match); + +static struct i2c_driver msg2638_ts_driver = { + .probe_new = msg2638_ts_probe, + .driver = { + .name = "MStar-TS", + .pm = &msg2638_pm_ops, + .of_match_table = msg2638_of_match, + }, +}; +module_i2c_driver(msg2638_ts_driver); + +MODULE_AUTHOR("Vincent Knecht "); +MODULE_DESCRIPTION("MStar MSG2638 touchscreen driver"); +MODULE_LICENSE("GPL v2"); From 7c06272f0ed87f13f5ab1a81f18ad4a173da3556 Mon Sep 17 00:00:00 2001 From: Joe Hung Date: Fri, 9 Apr 2021 15:24:15 -0700 Subject: [PATCH 0337/1379] dt-bindings: input: touchscreen: ilitek_ts_i2c: Add bindings Add binding documentation for ILITEK touch devices. Signed-off-by: Joe Hung Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20210324122601.125873-1-joe_hung@ilitek.com Signed-off-by: Dmitry Torokhov --- .../input/touchscreen/ilitek_ts_i2c.yaml | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml diff --git a/Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml b/Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml new file mode 100644 index 000000000000..a190e7baac31 --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/ilitek_ts_i2c.yaml @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/touchscreen/ilitek_ts_i2c.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Ilitek I2C Touchscreen Controller + +maintainers: + - Dmitry Torokhov + +allOf: + - $ref: touchscreen.yaml# + +properties: + compatible: + enum: + - ilitek,ili2130 + - ilitek,ili2131 + - ilitek,ili2132 + - ilitek,ili2316 + - ilitek,ili2322 + - ilitek,ili2323 + - ilitek,ili2326 + - ilitek,ili2520 + - ilitek,ili2521 + + reg: + const: 0x41 + + interrupts: + maxItems: 1 + + reset-gpios: + maxItems: 1 + + wakeup-source: + type: boolean + description: touchscreen can be used as a wakeup source. + + touchscreen-size-x: true + touchscreen-size-y: true + touchscreen-inverted-x: true + touchscreen-inverted-y: true + touchscreen-swapped-x-y: true + +additionalProperties: false + +required: + - compatible + - reg + - interrupts + - reset-gpios + +examples: + - | + #include + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + + touchscreen@41 { + compatible = "ilitek,ili2520"; + reg = <0x41>; + + interrupt-parent = <&gpio1>; + interrupts = <7 IRQ_TYPE_LEVEL_LOW>; + reset-gpios = <&gpio1 8 GPIO_ACTIVE_LOW>; + touchscreen-inverted-y; + wakeup-source; + }; + }; From 42370681bd46d2162093d40eb453695495483733 Mon Sep 17 00:00:00 2001 From: Joe Hung Date: Fri, 9 Apr 2021 15:24:31 -0700 Subject: [PATCH 0338/1379] Input: Add support for ILITEK Lego Series Add support for ILITEK Lego series of touch devices. Lego series includes ILITEK 213X/23XX/25XX. Tested/passed with evaluation board with ILI2520/2322 IC. Signed-off-by: Joe Hung Link: https://lore.kernel.org/r/20210324122601.125873-2-joe_hung@ilitek.com Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/Kconfig | 12 + drivers/input/touchscreen/Makefile | 1 + drivers/input/touchscreen/ilitek_ts_i2c.c | 690 ++++++++++++++++++++++ 3 files changed, 703 insertions(+) create mode 100644 drivers/input/touchscreen/ilitek_ts_i2c.c diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index db1a4c78eec3..5f76192164c1 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -426,6 +426,18 @@ config TOUCHSCREEN_ILI210X To compile this driver as a module, choose M here: the module will be called ili210x. +config TOUCHSCREEN_ILITEK + tristate "Ilitek I2C 213X/23XX/25XX/Lego Series Touch ICs" + depends on I2C + help + Say Y here if you have touchscreen with ILITEK touch IC, + it supports 213X/23XX/25XX and other Lego series. + + If unsure, say N. + + To compile this driver as a module, choose M here: the + module will be called ilitek_ts_i2c. + config TOUCHSCREEN_IPROC tristate "IPROC touch panel driver support" depends on ARCH_BCM_IPROC || COMPILE_TEST diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index ee1d3d3fd918..92267fe53cdb 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_TOUCHSCREEN_FUJITSU) += fujitsu_ts.o obj-$(CONFIG_TOUCHSCREEN_GOODIX) += goodix.o obj-$(CONFIG_TOUCHSCREEN_HIDEEP) += hideep.o obj-$(CONFIG_TOUCHSCREEN_ILI210X) += ili210x.o +obj-$(CONFIG_TOUCHSCREEN_ILITEK) += ilitek_ts_i2c.o obj-$(CONFIG_TOUCHSCREEN_IMX6UL_TSC) += imx6ul_tsc.o obj-$(CONFIG_TOUCHSCREEN_INEXIO) += inexio.o obj-$(CONFIG_TOUCHSCREEN_IPROC) += bcm_iproc_tsc.o diff --git a/drivers/input/touchscreen/ilitek_ts_i2c.c b/drivers/input/touchscreen/ilitek_ts_i2c.c new file mode 100644 index 000000000000..c5d259c76adc --- /dev/null +++ b/drivers/input/touchscreen/ilitek_ts_i2c.c @@ -0,0 +1,690 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ILITEK Touch IC driver for 23XX, 25XX and Lego series + * + * Copyright (C) 2011 ILI Technology Corporation. + * Copyright (C) 2020 Luca Hsu + * Copyright (C) 2021 Joe Hung + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define ILITEK_TS_NAME "ilitek_ts" +#define BL_V1_8 0x108 +#define BL_V1_7 0x107 +#define BL_V1_6 0x106 + +#define ILITEK_TP_CMD_GET_TP_RES 0x20 +#define ILITEK_TP_CMD_GET_SCRN_RES 0x21 +#define ILITEK_TP_CMD_SET_IC_SLEEP 0x30 +#define ILITEK_TP_CMD_SET_IC_WAKE 0x31 +#define ILITEK_TP_CMD_GET_FW_VER 0x40 +#define ILITEK_TP_CMD_GET_PRL_VER 0x42 +#define ILITEK_TP_CMD_GET_MCU_VER 0x61 +#define ILITEK_TP_CMD_GET_IC_MODE 0xC0 + +#define REPORT_COUNT_ADDRESS 61 +#define ILITEK_SUPPORT_MAX_POINT 40 + +struct ilitek_protocol_info { + u16 ver; + u8 ver_major; +}; + +struct ilitek_ts_data { + struct i2c_client *client; + struct gpio_desc *reset_gpio; + struct input_dev *input_dev; + struct touchscreen_properties prop; + + const struct ilitek_protocol_map *ptl_cb_func; + struct ilitek_protocol_info ptl; + + char product_id[30]; + u16 mcu_ver; + u8 ic_mode; + u8 firmware_ver[8]; + + s32 reset_time; + s32 screen_max_x; + s32 screen_max_y; + s32 screen_min_x; + s32 screen_min_y; + s32 max_tp; +}; + +struct ilitek_protocol_map { + u16 cmd; + const char *name; + int (*func)(struct ilitek_ts_data *ts, u16 cmd, u8 *inbuf, u8 *outbuf); +}; + +enum ilitek_cmds { + /* common cmds */ + GET_PTL_VER = 0, + GET_FW_VER, + GET_SCRN_RES, + GET_TP_RES, + GET_IC_MODE, + GET_MCU_VER, + SET_IC_SLEEP, + SET_IC_WAKE, + + /* ALWAYS keep at the end */ + MAX_CMD_CNT +}; + +/* ILITEK I2C R/W APIs */ +static int ilitek_i2c_write_and_read(struct ilitek_ts_data *ts, + u8 *cmd, int write_len, int delay, + u8 *data, int read_len) +{ + int error; + struct i2c_client *client = ts->client; + struct i2c_msg msgs[] = { + { + .addr = client->addr, + .flags = 0, + .len = write_len, + .buf = cmd, + }, + { + .addr = client->addr, + .flags = I2C_M_RD, + .len = read_len, + .buf = data, + }, + }; + + if (delay == 0 && write_len > 0 && read_len > 0) { + error = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs)); + if (error < 0) + return error; + } else { + if (write_len > 0) { + error = i2c_transfer(client->adapter, msgs, 1); + if (error < 0) + return error; + } + if (delay > 0) + mdelay(delay); + + if (read_len > 0) { + error = i2c_transfer(client->adapter, msgs + 1, 1); + if (error < 0) + return error; + } + } + + return 0; +} + +/* ILITEK ISR APIs */ +static void ilitek_touch_down(struct ilitek_ts_data *ts, unsigned int id, + unsigned int x, unsigned int y) +{ + struct input_dev *input = ts->input_dev; + + input_mt_slot(input, id); + input_mt_report_slot_state(input, MT_TOOL_FINGER, true); + + touchscreen_report_pos(input, &ts->prop, x, y, true); +} + +static int ilitek_process_and_report_v6(struct ilitek_ts_data *ts) +{ + int error = 0; + u8 buf[512]; + int packet_len = 5; + int packet_max_point = 10; + int report_max_point; + int i, count; + struct input_dev *input = ts->input_dev; + struct device *dev = &ts->client->dev; + unsigned int x, y, status, id; + + error = ilitek_i2c_write_and_read(ts, NULL, 0, 0, buf, 64); + if (error) { + dev_err(dev, "get touch info failed, err:%d\n", error); + goto err_sync_frame; + } + + report_max_point = buf[REPORT_COUNT_ADDRESS]; + if (report_max_point > ts->max_tp) { + dev_err(dev, "FW report max point:%d > panel info. max:%d\n", + report_max_point, ts->max_tp); + error = -EINVAL; + goto err_sync_frame; + } + + count = DIV_ROUND_UP(report_max_point, packet_max_point); + for (i = 1; i < count; i++) { + error = ilitek_i2c_write_and_read(ts, NULL, 0, 0, + buf + i * 64, 64); + if (error) { + dev_err(dev, "get touch info. failed, cnt:%d, err:%d\n", + count, error); + goto err_sync_frame; + } + } + + for (i = 0; i < report_max_point; i++) { + status = buf[i * packet_len + 1] & 0x40; + if (!status) + continue; + + id = buf[i * packet_len + 1] & 0x3F; + + x = get_unaligned_le16(buf + i * packet_len + 2); + y = get_unaligned_le16(buf + i * packet_len + 4); + + if (x > ts->screen_max_x || x < ts->screen_min_x || + y > ts->screen_max_y || y < ts->screen_min_y) { + dev_warn(dev, "invalid position, X[%d,%u,%d], Y[%d,%u,%d]\n", + ts->screen_min_x, x, ts->screen_max_x, + ts->screen_min_y, y, ts->screen_max_y); + continue; + } + + ilitek_touch_down(ts, id, x, y); + } + +err_sync_frame: + input_mt_sync_frame(input); + input_sync(input); + return error; +} + +/* APIs of cmds for ILITEK Touch IC */ +static int api_protocol_set_cmd(struct ilitek_ts_data *ts, + u16 idx, u8 *inbuf, u8 *outbuf) +{ + u16 cmd; + int error; + + if (idx >= MAX_CMD_CNT) + return -EINVAL; + + cmd = ts->ptl_cb_func[idx].cmd; + error = ts->ptl_cb_func[idx].func(ts, cmd, inbuf, outbuf); + if (error) + return error; + + return 0; +} + +static int api_protocol_get_ptl_ver(struct ilitek_ts_data *ts, + u16 cmd, u8 *inbuf, u8 *outbuf) +{ + int error; + u8 buf[64]; + + buf[0] = cmd; + error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 3); + if (error) + return error; + + ts->ptl.ver = get_unaligned_be16(outbuf); + ts->ptl.ver_major = outbuf[0]; + + return 0; +} + +static int api_protocol_get_mcu_ver(struct ilitek_ts_data *ts, + u16 cmd, u8 *inbuf, u8 *outbuf) +{ + int error; + u8 buf[64]; + + buf[0] = cmd; + error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 32); + if (error) + return error; + + ts->mcu_ver = get_unaligned_le16(outbuf); + memset(ts->product_id, 0, sizeof(ts->product_id)); + memcpy(ts->product_id, outbuf + 6, 26); + + return 0; +} + +static int api_protocol_get_fw_ver(struct ilitek_ts_data *ts, + u16 cmd, u8 *inbuf, u8 *outbuf) +{ + int error; + u8 buf[64]; + + buf[0] = cmd; + error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 8); + if (error) + return error; + + memcpy(ts->firmware_ver, outbuf, 8); + + return 0; +} + +static int api_protocol_get_scrn_res(struct ilitek_ts_data *ts, + u16 cmd, u8 *inbuf, u8 *outbuf) +{ + int error; + u8 buf[64]; + + buf[0] = cmd; + error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 8); + if (error) + return error; + + ts->screen_min_x = get_unaligned_le16(outbuf); + ts->screen_min_y = get_unaligned_le16(outbuf + 2); + ts->screen_max_x = get_unaligned_le16(outbuf + 4); + ts->screen_max_y = get_unaligned_le16(outbuf + 6); + + return 0; +} + +static int api_protocol_get_tp_res(struct ilitek_ts_data *ts, + u16 cmd, u8 *inbuf, u8 *outbuf) +{ + int error; + u8 buf[64]; + + buf[0] = cmd; + error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 15); + if (error) + return error; + + ts->max_tp = outbuf[8]; + if (ts->max_tp > ILITEK_SUPPORT_MAX_POINT) { + dev_err(&ts->client->dev, "Invalid MAX_TP:%d from FW\n", + ts->max_tp); + return -EINVAL; + } + + return 0; +} + +static int api_protocol_get_ic_mode(struct ilitek_ts_data *ts, + u16 cmd, u8 *inbuf, u8 *outbuf) +{ + int error; + u8 buf[64]; + + buf[0] = cmd; + error = ilitek_i2c_write_and_read(ts, buf, 1, 5, outbuf, 2); + if (error) + return error; + + ts->ic_mode = outbuf[0]; + return 0; +} + +static int api_protocol_set_ic_sleep(struct ilitek_ts_data *ts, + u16 cmd, u8 *inbuf, u8 *outbuf) +{ + u8 buf[64]; + + buf[0] = cmd; + return ilitek_i2c_write_and_read(ts, buf, 1, 0, NULL, 0); +} + +static int api_protocol_set_ic_wake(struct ilitek_ts_data *ts, + u16 cmd, u8 *inbuf, u8 *outbuf) +{ + u8 buf[64]; + + buf[0] = cmd; + return ilitek_i2c_write_and_read(ts, buf, 1, 0, NULL, 0); +} + +static const struct ilitek_protocol_map ptl_func_map[] = { + /* common cmds */ + [GET_PTL_VER] = { + ILITEK_TP_CMD_GET_PRL_VER, "GET_PTL_VER", + api_protocol_get_ptl_ver + }, + [GET_FW_VER] = { + ILITEK_TP_CMD_GET_FW_VER, "GET_FW_VER", + api_protocol_get_fw_ver + }, + [GET_SCRN_RES] = { + ILITEK_TP_CMD_GET_SCRN_RES, "GET_SCRN_RES", + api_protocol_get_scrn_res + }, + [GET_TP_RES] = { + ILITEK_TP_CMD_GET_TP_RES, "GET_TP_RES", + api_protocol_get_tp_res + }, + [GET_IC_MODE] = { + ILITEK_TP_CMD_GET_IC_MODE, "GET_IC_MODE", + api_protocol_get_ic_mode + }, + [GET_MCU_VER] = { + ILITEK_TP_CMD_GET_MCU_VER, "GET_MOD_VER", + api_protocol_get_mcu_ver + }, + [SET_IC_SLEEP] = { + ILITEK_TP_CMD_SET_IC_SLEEP, "SET_IC_SLEEP", + api_protocol_set_ic_sleep + }, + [SET_IC_WAKE] = { + ILITEK_TP_CMD_SET_IC_WAKE, "SET_IC_WAKE", + api_protocol_set_ic_wake + }, +}; + +/* Probe APIs */ +static void ilitek_reset(struct ilitek_ts_data *ts, int delay) +{ + if (ts->reset_gpio) { + gpiod_set_value(ts->reset_gpio, 1); + mdelay(10); + gpiod_set_value(ts->reset_gpio, 0); + mdelay(delay); + } +} + +static int ilitek_protocol_init(struct ilitek_ts_data *ts) +{ + int error; + u8 outbuf[64]; + + ts->ptl_cb_func = ptl_func_map; + ts->reset_time = 600; + + error = api_protocol_set_cmd(ts, GET_PTL_VER, NULL, outbuf); + if (error) + return error; + + /* Protocol v3 is not support currently */ + if (ts->ptl.ver_major == 0x3 || + ts->ptl.ver == BL_V1_6 || + ts->ptl.ver == BL_V1_7) + return -EINVAL; + + return 0; +} + +static int ilitek_read_tp_info(struct ilitek_ts_data *ts, bool boot) +{ + u8 outbuf[256]; + int error; + + error = api_protocol_set_cmd(ts, GET_PTL_VER, NULL, outbuf); + if (error) + return error; + + error = api_protocol_set_cmd(ts, GET_MCU_VER, NULL, outbuf); + if (error) + return error; + + error = api_protocol_set_cmd(ts, GET_FW_VER, NULL, outbuf); + if (error) + return error; + + if (boot) { + error = api_protocol_set_cmd(ts, GET_SCRN_RES, NULL, + outbuf); + if (error) + return error; + } + + error = api_protocol_set_cmd(ts, GET_TP_RES, NULL, outbuf); + if (error) + return error; + + error = api_protocol_set_cmd(ts, GET_IC_MODE, NULL, outbuf); + if (error) + return error; + + return 0; +} + +static int ilitek_input_dev_init(struct device *dev, struct ilitek_ts_data *ts) +{ + int error; + struct input_dev *input; + + input = devm_input_allocate_device(dev); + if (!input) + return -ENOMEM; + + ts->input_dev = input; + input->name = ILITEK_TS_NAME; + input->id.bustype = BUS_I2C; + + __set_bit(INPUT_PROP_DIRECT, input->propbit); + + input_set_abs_params(input, ABS_MT_POSITION_X, + ts->screen_min_x, ts->screen_max_x, 0, 0); + input_set_abs_params(input, ABS_MT_POSITION_Y, + ts->screen_min_y, ts->screen_max_y, 0, 0); + + touchscreen_parse_properties(input, true, &ts->prop); + + error = input_mt_init_slots(input, ts->max_tp, + INPUT_MT_DIRECT | INPUT_MT_DROP_UNUSED); + if (error) { + dev_err(dev, "initialize MT slots failed, err:%d\n", error); + return error; + } + + error = input_register_device(input); + if (error) { + dev_err(dev, "register input device failed, err:%d\n", error); + return error; + } + + return 0; +} + +static irqreturn_t ilitek_i2c_isr(int irq, void *dev_id) +{ + struct ilitek_ts_data *ts = dev_id; + int error; + + error = ilitek_process_and_report_v6(ts); + if (error < 0) { + dev_err(&ts->client->dev, "[%s] err:%d\n", __func__, error); + return IRQ_NONE; + } + + return IRQ_HANDLED; +} + +static ssize_t firmware_version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ilitek_ts_data *ts = i2c_get_clientdata(client); + + return scnprintf(buf, PAGE_SIZE, + "fw version: [%02X%02X.%02X%02X.%02X%02X.%02X%02X]\n", + ts->firmware_ver[0], ts->firmware_ver[1], + ts->firmware_ver[2], ts->firmware_ver[3], + ts->firmware_ver[4], ts->firmware_ver[5], + ts->firmware_ver[6], ts->firmware_ver[7]); +} +static DEVICE_ATTR_RO(firmware_version); + +static ssize_t product_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ilitek_ts_data *ts = i2c_get_clientdata(client); + + return scnprintf(buf, PAGE_SIZE, "product id: [%04X], module: [%s]\n", + ts->mcu_ver, ts->product_id); +} +static DEVICE_ATTR_RO(product_id); + +static struct attribute *ilitek_sysfs_attrs[] = { + &dev_attr_firmware_version.attr, + &dev_attr_product_id.attr, + NULL +}; + +static struct attribute_group ilitek_attrs_group = { + .attrs = ilitek_sysfs_attrs, +}; + +static int ilitek_ts_i2c_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct ilitek_ts_data *ts; + struct device *dev = &client->dev; + int error; + + if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { + dev_err(dev, "i2c check functionality failed\n"); + return -ENXIO; + } + + ts = devm_kzalloc(dev, sizeof(*ts), GFP_KERNEL); + if (!ts) + return -ENOMEM; + + ts->client = client; + i2c_set_clientdata(client, ts); + + ts->reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW); + if (IS_ERR(ts->reset_gpio)) { + error = PTR_ERR(ts->reset_gpio); + dev_err(dev, "request gpiod failed: %d", error); + return error; + } + + ilitek_reset(ts, 1000); + + error = ilitek_protocol_init(ts); + if (error) { + dev_err(dev, "protocol init failed: %d", error); + return error; + } + + error = ilitek_read_tp_info(ts, true); + if (error) { + dev_err(dev, "read tp info failed: %d", error); + return error; + } + + error = ilitek_input_dev_init(dev, ts); + if (error) { + dev_err(dev, "input dev init failed: %d", error); + return error; + } + + error = devm_request_threaded_irq(dev, ts->client->irq, + NULL, ilitek_i2c_isr, IRQF_ONESHOT, + "ilitek_touch_irq", ts); + if (error) { + dev_err(dev, "request threaded irq failed: %d\n", error); + return error; + } + + error = devm_device_add_group(dev, &ilitek_attrs_group); + if (error) { + dev_err(dev, "sysfs create group failed: %d\n", error); + return error; + } + + return 0; +} + +static int __maybe_unused ilitek_suspend(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ilitek_ts_data *ts = i2c_get_clientdata(client); + int error; + + disable_irq(client->irq); + + if (!device_may_wakeup(dev)) { + error = api_protocol_set_cmd(ts, SET_IC_SLEEP, NULL, NULL); + if (error) + return error; + } + + return 0; +} + +static int __maybe_unused ilitek_resume(struct device *dev) +{ + struct i2c_client *client = to_i2c_client(dev); + struct ilitek_ts_data *ts = i2c_get_clientdata(client); + int error; + + if (!device_may_wakeup(dev)) { + error = api_protocol_set_cmd(ts, SET_IC_WAKE, NULL, NULL); + if (error) + return error; + + ilitek_reset(ts, ts->reset_time); + } + + enable_irq(client->irq); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(ilitek_pm_ops, ilitek_suspend, ilitek_resume); + +static const struct i2c_device_id ilitek_ts_i2c_id[] = { + { ILITEK_TS_NAME, 0 }, + { }, +}; +MODULE_DEVICE_TABLE(i2c, ilitek_ts_i2c_id); + +#ifdef CONFIG_ACPI +static const struct acpi_device_id ilitekts_acpi_id[] = { + { "ILTK0001", 0 }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, ilitekts_acpi_id); +#endif + +#ifdef CONFIG_OF +static const struct of_device_id ilitek_ts_i2c_match[] = { + {.compatible = "ilitek,ili2130",}, + {.compatible = "ilitek,ili2131",}, + {.compatible = "ilitek,ili2132",}, + {.compatible = "ilitek,ili2316",}, + {.compatible = "ilitek,ili2322",}, + {.compatible = "ilitek,ili2323",}, + {.compatible = "ilitek,ili2326",}, + {.compatible = "ilitek,ili2520",}, + {.compatible = "ilitek,ili2521",}, + { }, +}; +MODULE_DEVICE_TABLE(of, ilitek_ts_i2c_match); +#endif + +static struct i2c_driver ilitek_ts_i2c_driver = { + .driver = { + .name = ILITEK_TS_NAME, + .pm = &ilitek_pm_ops, + .of_match_table = of_match_ptr(ilitek_ts_i2c_match), + .acpi_match_table = ACPI_PTR(ilitekts_acpi_id), + }, + .probe = ilitek_ts_i2c_probe, + .id_table = ilitek_ts_i2c_id, +}; +module_i2c_driver(ilitek_ts_i2c_driver); + +MODULE_AUTHOR("ILITEK"); +MODULE_DESCRIPTION("ILITEK I2C Touchscreen Driver"); +MODULE_LICENSE("GPL"); From a811ecf8f1dbae02b7d54d6e2e33cc6bce1f1200 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 9 Apr 2021 23:05:32 -0700 Subject: [PATCH 0339/1379] Input: mms114 - convert bindings to YAML and extend This converts the Melfas MMS114 bindings and extend like this: - Require nodename touchscreen@ (this seems to be the case for all in-tree DTS files) - Add the mms134s and mms136 compatibles - Add the avdd and vdd power supplies - Define the I2C clock frequency property Signed-off-by: Linus Walleij Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20210330201233.2360006-1-linus.walleij@linaro.org Signed-off-by: Dmitry Torokhov --- .../input/touchscreen/melfas,mms114.yaml | 87 +++++++++++++++++++ .../bindings/input/touchscreen/mms114.txt | 42 --------- 2 files changed, 87 insertions(+), 42 deletions(-) create mode 100644 Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml delete mode 100644 Documentation/devicetree/bindings/input/touchscreen/mms114.txt diff --git a/Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml b/Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml new file mode 100644 index 000000000000..62366886fb3e --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/melfas,mms114.yaml @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/touchscreen/melfas,mms114.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Melfas MMS114 family touchscreen controller bindings + +maintainers: + - Linus Walleij + +allOf: + - $ref: touchscreen.yaml# + +properties: + $nodename: + pattern: "^touchscreen(@.*)?$" + + compatible: + items: + - enum: + - melfas,mms114 + - melfas,mms134s + - melfas,mms136 + - melfas,mms152 + - melfas,mms345l + + reg: + description: I2C address + + clock-frequency: + description: I2C client clock frequency, defined for host + minimum: 100000 + maximum: 400000 + + interrupts: + maxItems: 1 + + avdd-supply: + description: Analog power supply regulator on AVDD pin + + vdd-supply: + description: Digital power supply regulator on VDD pin + + touchscreen-size-x: true + touchscreen-size-y: true + touchscreen-fuzz-x: true + touchscreen-fuzz-y: true + touchscreen-fuzz-pressure: true + touchscreen-inverted-x: true + touchscreen-inverted-y: true + touchscreen-swapped-x-y: true + touchscreen-max-pressure: true + +additionalProperties: false + +required: + - compatible + - reg + - interrupts + - touchscreen-size-x + - touchscreen-size-y + +examples: + - | + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + touchscreen@48 { + compatible = "melfas,mms114"; + reg = <0x48>; + interrupt-parent = <&gpio>; + interrupts = <39 IRQ_TYPE_EDGE_FALLING>; + avdd-supply = <&ldo1_reg>; + vdd-supply = <&ldo2_reg>; + touchscreen-size-x = <720>; + touchscreen-size-y = <1280>; + touchscreen-fuzz-x = <10>; + touchscreen-fuzz-y = <10>; + touchscreen-fuzz-pressure = <10>; + touchscreen-inverted-x; + touchscreen-inverted-y; + }; + }; + +... diff --git a/Documentation/devicetree/bindings/input/touchscreen/mms114.txt b/Documentation/devicetree/bindings/input/touchscreen/mms114.txt deleted file mode 100644 index 707234cfd7e6..000000000000 --- a/Documentation/devicetree/bindings/input/touchscreen/mms114.txt +++ /dev/null @@ -1,42 +0,0 @@ -* MELFAS MMS114/MMS152/MMS345L touchscreen controller - -Required properties: -- compatible: should be one of: - - "melfas,mms114" - - "melfas,mms152" - - "melfas,mms345l" -- reg: I2C address of the chip -- interrupts: interrupt to which the chip is connected -- touchscreen-size-x: See [1] -- touchscreen-size-y: See [1] - -Optional properties: -- touchscreen-fuzz-x: See [1] -- touchscreen-fuzz-y: See [1] -- touchscreen-fuzz-pressure: See [1] -- touchscreen-inverted-x: See [1] -- touchscreen-inverted-y: See [1] -- touchscreen-swapped-x-y: See [1] - -[1]: Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt - -Example: - - i2c@00000000 { - /* ... */ - - touchscreen@48 { - compatible = "melfas,mms114"; - reg = <0x48>; - interrupts = <39 0>; - touchscreen-size-x = <720>; - touchscreen-size-y = <1280>; - touchscreen-fuzz-x = <10>; - touchscreen-fuzz-y = <10>; - touchscreen-fuzz-pressure = <10>; - touchscreen-inverted-x; - touchscreen-inverted-y; - }; - - /* ... */ - }; From 53fefdd1d3a3403d8c44e28898d1031d8763b913 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 9 Apr 2021 23:04:35 -0700 Subject: [PATCH 0340/1379] Input: mms114 - support MMS136 The Melfas MMS136 is similar to the other MMS variants but has event packages of 6 bytes rather than 8 as the others. The define is named FINGER_EVENT_SZ in the vendor drivers so I renamed it from MMS*_PACKET_SZ to MMS*_EVENT_SZ. After this patch, the touchscreen on the Samsung GT-I8530 works fine with PostmarketOS. Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20210404232619.3092682-1-linus.walleij@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/mms114.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/input/touchscreen/mms114.c b/drivers/input/touchscreen/mms114.c index 7043f57ea2dd..0efd1a1bb192 100644 --- a/drivers/input/touchscreen/mms114.c +++ b/drivers/input/touchscreen/mms114.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -// Melfas MMS114/MMS152 touchscreen device driver +// Melfas MMS114/MMS136/MMS152 touchscreen device driver // // Copyright (c) 2012 Samsung Electronics Co., Ltd. // Author: Joonyoung Shim @@ -44,7 +44,8 @@ #define MMS114_MAX_AREA 0xff #define MMS114_MAX_TOUCH 10 -#define MMS114_PACKET_NUM 8 +#define MMS114_EVENT_SIZE 8 +#define MMS136_EVENT_SIZE 6 /* Touch type */ #define MMS114_TYPE_NONE 0 @@ -53,6 +54,7 @@ enum mms_type { TYPE_MMS114 = 114, + TYPE_MMS136 = 136, TYPE_MMS152 = 152, TYPE_MMS345L = 345, }; @@ -209,7 +211,11 @@ static irqreturn_t mms114_interrupt(int irq, void *dev_id) if (packet_size <= 0) goto out; - touch_size = packet_size / MMS114_PACKET_NUM; + /* MMS136 has slightly different event size */ + if (data->type == TYPE_MMS136) + touch_size = packet_size / MMS136_EVENT_SIZE; + else + touch_size = packet_size / MMS114_EVENT_SIZE; error = __mms114_read_reg(data, MMS114_INFORMATION, packet_size, (u8 *)touch); @@ -275,6 +281,7 @@ static int mms114_get_version(struct mms114_data *data) break; case TYPE_MMS114: + case TYPE_MMS136: error = __mms114_read_reg(data, MMS114_TSP_REV, 6, buf); if (error) return error; @@ -297,8 +304,8 @@ static int mms114_setup_regs(struct mms114_data *data) if (error < 0) return error; - /* Only MMS114 has configuration and power on registers */ - if (data->type != TYPE_MMS114) + /* Only MMS114 and MMS136 have configuration and power on registers */ + if (data->type != TYPE_MMS114 && data->type != TYPE_MMS136) return 0; error = mms114_set_active(data, true); @@ -480,7 +487,7 @@ static int mms114_probe(struct i2c_client *client, 0, data->props.max_y, 0, 0); } - if (data->type == TYPE_MMS114) { + if (data->type == TYPE_MMS114 || data->type == TYPE_MMS136) { /* * The firmware handles movement and pressure fuzz, so * don't duplicate that in software. @@ -604,6 +611,9 @@ static const struct of_device_id mms114_dt_match[] = { { .compatible = "melfas,mms114", .data = (void *)TYPE_MMS114, + }, { + .compatible = "melfas,mms136", + .data = (void *)TYPE_MMS136, }, { .compatible = "melfas,mms152", .data = (void *)TYPE_MMS152, From 236798a1a95fa0c3f923d92d570ff656d2d8e8f5 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 9 Apr 2021 23:28:32 -0700 Subject: [PATCH 0341/1379] Input: apbps2 - remove useless variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following gcc warning: drivers/input/serio/apbps2.c:106:16: warning: variable ‘tmp’ set but not used [-Wunused-but-set-variable]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/1617958859-64707-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Dmitry Torokhov --- drivers/input/serio/apbps2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/input/serio/apbps2.c b/drivers/input/serio/apbps2.c index 594ac4e6f8ea..974d7bfae0a0 100644 --- a/drivers/input/serio/apbps2.c +++ b/drivers/input/serio/apbps2.c @@ -103,7 +103,6 @@ static int apbps2_open(struct serio *io) { struct apbps2_priv *priv = io->port_data; int limit; - unsigned long tmp; /* clear error flags */ iowrite32be(0, &priv->regs->status); @@ -111,7 +110,7 @@ static int apbps2_open(struct serio *io) /* Clear old data if available (unlikely) */ limit = 1024; while ((ioread32be(&priv->regs->status) & APBPS2_STATUS_DR) && --limit) - tmp = ioread32be(&priv->regs->data); + ioread32be(&priv->regs->data); /* Enable reciever and it's interrupt */ iowrite32be(APBPS2_CTRL_RE | APBPS2_CTRL_RI, &priv->regs->ctrl); From 39841136766651e487458d9ee1660fe86aa697f3 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 10 Apr 2021 00:31:12 -0700 Subject: [PATCH 0342/1379] Input: cyttsp - error message on boot mode exit error Provide a proper error message when attempting to exit boot loader mode and failing, which is something that happened to me. Reviewed-by: Javier Martinez Canillas Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20210408131153.3446138-5-linus.walleij@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/cyttsp_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c index c59aa6b8d257..4dc387659e2a 100644 --- a/drivers/input/touchscreen/cyttsp_core.c +++ b/drivers/input/touchscreen/cyttsp_core.c @@ -408,8 +408,10 @@ static int cyttsp_power_on(struct cyttsp *ts) if (GET_BOOTLOADERMODE(ts->bl_data.bl_status) && IS_VALID_APP(ts->bl_data.bl_status)) { error = cyttsp_exit_bl_mode(ts); - if (error) + if (error) { + dev_err(ts->dev, "failed to exit bootloader mode\n"); return error; + } } if (GET_HSTMODE(ts->bl_data.bl_file) != CY_OPERATE_MODE || From c61ac36fd52cb3015acd93af5da01f8f8350051f Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 10 Apr 2021 00:31:52 -0700 Subject: [PATCH 0343/1379] Input: cyttsp - reduce reset pulse timings The data sheet for CY8CTMA340 specifies that the reset pulse shall be at least 1 ms. Specify 1-2 ms with usleep_range() to cut some slack for the scheduler. Curiously the datasheet does not specify how long we have to wait after a hard reset until the chip is up, but I have found a vendor tree (Samsung GT-S7710) that has code for this touch screen and there this is set to 5 ms so I use this with the same 1 ms fuzz. Reviewed-by: Javier Martinez Canillas Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20210408131153.3446138-6-linus.walleij@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/cyttsp_core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c index 4dc387659e2a..42e353f11976 100644 --- a/drivers/input/touchscreen/cyttsp_core.c +++ b/drivers/input/touchscreen/cyttsp_core.c @@ -229,10 +229,16 @@ static int cyttsp_set_sysinfo_regs(struct cyttsp *ts) static void cyttsp_hard_reset(struct cyttsp *ts) { if (ts->reset_gpio) { + /* + * According to the CY8CTMA340 datasheet page 21, the external + * reset pulse width should be >= 1 ms. The datasheet does not + * specify how long we have to wait after reset but a vendor + * tree specifies 5 ms here. + */ gpiod_set_value_cansleep(ts->reset_gpio, 1); - msleep(CY_DELAY_DFLT); + usleep_range(1000, 2000); gpiod_set_value_cansleep(ts->reset_gpio, 0); - msleep(CY_DELAY_DFLT); + usleep_range(5000, 6000); } } From 0bffa508d1365794b7688b9a2d1ad5af63434b58 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 10 Apr 2021 00:32:46 -0700 Subject: [PATCH 0344/1379] Input: cyttsp - drop the phys path When I test to use the CY8CTMA340 with PostmarketOS I don't have any problem whatsoever in dropping this phys path, it finds and uses the touchscreen just as well. I suppose it is because userspace is using modern input libraries. I challenge the maintainers to point out a valid and still used userspace that actually need this. I say we drop it. Reviewed-by: Javier Martinez Canillas Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20210408131153.3446138-7-linus.walleij@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/cyttsp_core.c | 2 -- drivers/input/touchscreen/cyttsp_core.h | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c index 42e353f11976..b0477f55fb79 100644 --- a/drivers/input/touchscreen/cyttsp_core.c +++ b/drivers/input/touchscreen/cyttsp_core.c @@ -640,10 +640,8 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops, return ERR_PTR(error); init_completion(&ts->bl_ready); - snprintf(ts->phys, sizeof(ts->phys), "%s/input0", dev_name(dev)); input_dev->name = "Cypress TTSP TouchScreen"; - input_dev->phys = ts->phys; input_dev->id.bustype = bus_ops->bustype; input_dev->dev.parent = ts->dev; diff --git a/drivers/input/touchscreen/cyttsp_core.h b/drivers/input/touchscreen/cyttsp_core.h index 8c651336ac12..9bc4fe7e6ac5 100644 --- a/drivers/input/touchscreen/cyttsp_core.h +++ b/drivers/input/touchscreen/cyttsp_core.h @@ -114,7 +114,6 @@ struct cyttsp { struct device *dev; int irq; struct input_dev *input; - char phys[32]; const struct cyttsp_bus_ops *bus_ops; struct cyttsp_bootloader_data bl_data; struct cyttsp_sysinfo_data sysinfo_data; From ddfe7e1ce3d5ce5ae0b25d107ba9d26fe8a4923b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 10 Apr 2021 00:33:46 -0700 Subject: [PATCH 0345/1379] Input: cyttsp - set abs params for ABS_MT_TOUCH_MAJOR The driver is certainly reporting pressure in cyttsp_report_tchdata() with input_report_abs(input, ABS_MT_TOUCH_MAJOR, tch->z); so we should also advertise this capability. Reviewed-by: Javier Martinez Canillas Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20210408131153.3446138-8-linus.walleij@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/cyttsp_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c index b0477f55fb79..3d605f4cbed9 100644 --- a/drivers/input/touchscreen/cyttsp_core.c +++ b/drivers/input/touchscreen/cyttsp_core.c @@ -652,6 +652,9 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops, input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_X); input_set_capability(input_dev, EV_ABS, ABS_MT_POSITION_Y); + /* One byte for width 0..255 so this is the limit */ + input_set_abs_params(input_dev, ABS_MT_TOUCH_MAJOR, 0, 255, 0, 0); + touchscreen_parse_properties(input_dev, true, NULL); error = input_mt_init_slots(input_dev, CY_MAX_ID, 0); From dadf1fd8807e61b1b1744836c96f48eb9ad56f5b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 10 Apr 2021 00:34:58 -0700 Subject: [PATCH 0346/1379] Input: cyttsp - flag the device properly This device is certainly a very simple touchscreen so we set INPUT_MT_DIRECT. Reviewed-by: Javier Martinez Canillas Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20210408131153.3446138-9-linus.walleij@linaro.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/cyttsp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/cyttsp_core.c b/drivers/input/touchscreen/cyttsp_core.c index 3d605f4cbed9..106dd4962785 100644 --- a/drivers/input/touchscreen/cyttsp_core.c +++ b/drivers/input/touchscreen/cyttsp_core.c @@ -657,7 +657,7 @@ struct cyttsp *cyttsp_probe(const struct cyttsp_bus_ops *bus_ops, touchscreen_parse_properties(input_dev, true, NULL); - error = input_mt_init_slots(input_dev, CY_MAX_ID, 0); + error = input_mt_init_slots(input_dev, CY_MAX_ID, INPUT_MT_DIRECT); if (error) { dev_err(dev, "Unable to init MT slots.\n"); return ERR_PTR(error); From 5f029c045c948b6cb8ccfda614e73240c4a8363b Mon Sep 17 00:00:00 2001 From: Yi Zhuang Date: Tue, 6 Apr 2021 09:47:35 +0800 Subject: [PATCH 0347/1379] f2fs: clean up build warnings This patch combined the below three clean-up patches. - modify open brace '{' following function definitions - ERROR: spaces required around that ':' - ERROR: spaces required before the open parenthesis '(' - ERROR: spaces prohibited before that ',' - Made suggested modifications from checkpatch in reference to WARNING: Missing a blank line after declarations Signed-off-by: Yi Zhuang Signed-off-by: Jia Yang Signed-off-by: Jack Qiu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 1 + fs/f2fs/checkpoint.c | 1 + fs/f2fs/data.c | 2 ++ fs/f2fs/debug.c | 3 +++ fs/f2fs/dir.c | 1 + fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 6 +++++- fs/f2fs/inode.c | 1 + fs/f2fs/namei.c | 3 +++ fs/f2fs/node.c | 8 +++++--- fs/f2fs/recovery.c | 3 ++- fs/f2fs/segment.c | 18 +++++++++++++++--- fs/f2fs/super.c | 5 +++-- fs/f2fs/xattr.c | 1 + 14 files changed, 44 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 965037a9c205..239ad9453b99 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -29,6 +29,7 @@ static inline size_t f2fs_acl_size(int count) static inline int f2fs_acl_count(size_t size) { ssize_t s; + size -= sizeof(struct f2fs_acl_header); s = size - 4 * sizeof(struct f2fs_acl_entry_short); if (s < 0) { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f6169611270f..817d0bcb5c67 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -719,6 +719,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); + err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4bf7e79c8342..cf935474ffba 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1086,6 +1086,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -3765,6 +3766,7 @@ int f2fs_migrate_page(struct address_space *mapping, if (atomic_written) { struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) if (cur->page == page) { cur->page = newpage; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 91855d5721cd..c03949a7ccff 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -173,6 +173,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->util_invalid = 50 - si->util_free - si->util_valid; for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + si->curseg[i] = curseg->segno; si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); @@ -300,10 +301,12 @@ get_cache: si->page_mem = 0; if (sbi->node_inode) { unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e6270a867be1..ebf65c5fac40 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -473,6 +473,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; + lock_page(page); f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f3ca63b55843..d697c8900fa7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2582,7 +2582,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_extent = NULL, - .m_seg_type = NO_CHECK_TYPE , + .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; struct extent_info ei = {0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5c48825fd12d..8d1f17ab94d8 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -160,7 +160,7 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_wake= 0; + gc_th->gc_wake = 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -179,6 +179,7 @@ out: void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); @@ -858,6 +859,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) static void put_gc_inode(struct gc_inode_list *gc_list) { struct inode_entry *ie, *next_ie; + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); @@ -982,9 +984,11 @@ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) bidx = node_ofs - 1; } else if (node_ofs <= indirect_blks) { int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; } else { int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; } return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 5d2253d53f17..b401f08569f7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -666,6 +666,7 @@ retry: node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); + if (err == -ENOMEM) { cond_resched(); goto retry; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 17bd072a5d39..405d85dbf9f1 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -419,6 +419,7 @@ struct dentry *f2fs_get_parent(struct dentry *child) struct qstr dotdot = QSTR_INIT("..", 2); struct page *page; unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { if (IS_ERR(page)) return ERR_CAST(page); @@ -628,6 +629,7 @@ static const char *f2fs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *link = page_get_link(dentry, inode, done); + if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ do_delayed_call(done); @@ -766,6 +768,7 @@ out_fail: static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); + if (f2fs_empty_dir(inode)) return f2fs_unlink(dir, dentry); return -ENOTEMPTY; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 3eb724bb6594..e67ce5f13b98 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -470,6 +470,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* increment version no as node is removed */ if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); + nat_set_version(e, inc_node_version(version)); } @@ -1391,7 +1392,7 @@ repeat: goto out_err; } page_hit: - if(unlikely(nid != nid_of_node(page))) { + if (unlikely(nid != nid_of_node(page))) { f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), @@ -1783,7 +1784,7 @@ continue_unlock: out: if (nwritten) f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); - return ret ? -EIO: 0; + return ret ? -EIO : 0; } static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) @@ -2125,8 +2126,8 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i) { struct f2fs_nm_info *nm_i = NM_I(sbi); - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) return err; @@ -2991,6 +2992,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; + set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index da75d5d52f0a..422146c6d866 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -458,6 +458,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, /* Get the previous summary */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; goto got_it; @@ -875,5 +876,5 @@ out: #endif sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ - return ret ? ret: err; + return ret ? ret : err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 5bd0e1d7a00c..0cb1ca88d4aa 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1794,7 +1794,7 @@ static int issue_discard_thread(void *data) if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; - } else if (issued == -1){ + } else if (issued == -1) { wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); if (!wait_ms) wait_ms = dpolicy.mid_interval; @@ -2171,6 +2171,7 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); + se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); @@ -2362,6 +2363,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, { struct curseg_info *curseg = CURSEG_I(sbi, type); void *addr = curseg->sum_blk; + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); } @@ -3779,6 +3781,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; + s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; @@ -3841,6 +3844,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; + for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; @@ -3942,6 +3946,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blkoff; + seg_i = CURSEG_I(sbi, i); if (sbi->ckpt->alloc_type[i] == SSR) blkoff = sbi->blocks_per_seg; @@ -3978,6 +3983,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; + if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else @@ -4561,6 +4567,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); + __set_test_and_inuse(sbi, curseg_t->segno); } } @@ -4793,7 +4800,8 @@ static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, } static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, - void *data) { + void *data) +{ memcpy(data, zone, sizeof(struct blk_zone)); return 0; } @@ -4909,8 +4917,10 @@ struct check_zone_write_pointer_args { }; static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, - void *data) { + void *data) +{ struct check_zone_write_pointer_args *args; + args = (struct check_zone_write_pointer_args *)data; return check_zone_write_pointer(args->sbi, args->fdev, zone); @@ -5189,6 +5199,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->victim_secmap); } @@ -5233,6 +5244,7 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) static void destroy_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i = SM_I(sbi)->free_info; + if (!free_i) return; SM_I(sbi)->free_info = NULL; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c15800c3cdb1..5020152aa8fc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -559,6 +559,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; /* @@ -1892,7 +1893,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) ret = sync_filesystem(sbi->sb); if (ret || err) { - err = ret ? ret: err; + err = ret ? ret : err; goto restore_flag; } @@ -3757,7 +3758,7 @@ try_onemore: sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 1: NR_TEMP_TYPE; + int n = (i == META) ? 1 : NR_TEMP_TYPE; int j; sbi->write_io[i] = diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 490f843ec3bf..c8f34decbf8e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -488,6 +488,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, f2fs_wait_on_page_writeback(xpage, NODE, true, true); } else { struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { From ee1bf567c90df6fd04a3c31acb0995cb13f62c48 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 1 Apr 2021 17:53:36 +0800 Subject: [PATCH 0348/1379] dmaengine: qcom_hidma: remove unused code Fix the following clang warning: drivers/dma/qcom/hidma.c:94:20: warning: unused function 'to_hidma_desc' [-Wunused-function]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/1617270816-36400-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Vinod Koul --- drivers/dma/qcom/hidma.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/dma/qcom/hidma.c b/drivers/dma/qcom/hidma.c index 6c0f9eb8ecc6..23d64489d25f 100644 --- a/drivers/dma/qcom/hidma.c +++ b/drivers/dma/qcom/hidma.c @@ -90,12 +90,6 @@ static inline struct hidma_chan *to_hidma_chan(struct dma_chan *dmach) return container_of(dmach, struct hidma_chan, chan); } -static inline -struct hidma_desc *to_hidma_desc(struct dma_async_tx_descriptor *t) -{ - return container_of(t, struct hidma_desc, desc); -} - static void hidma_free(struct hidma_dev *dmadev) { INIT_LIST_HEAD(&dmadev->ddev.channels); From 1b6216a61e1d5b6683f2d3362ff76d183e66c29f Mon Sep 17 00:00:00 2001 From: Hao Fang Date: Thu, 1 Apr 2021 19:50:20 +0800 Subject: [PATCH 0349/1379] dmaengine: k3dma: use the correct HiSilicon copyright s/Hisilicon/HiSilicon/g. It should use capital S, according to the official website. Signed-off-by: Hao Fang Acked-by: Zhangfei Gao Link: https://lore.kernel.org/r/1617277820-26971-1-git-send-email-fanghao11@huawei.com Signed-off-by: Vinod Koul --- drivers/dma/k3dma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/k3dma.c b/drivers/dma/k3dma.c index d0b2e601e3e5..ecdaada95120 100644 --- a/drivers/dma/k3dma.c +++ b/drivers/dma/k3dma.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 - 2015 Linaro Ltd. - * Copyright (c) 2013 Hisilicon Limited. + * Copyright (c) 2013 HiSilicon Limited. */ #include #include @@ -1039,6 +1039,6 @@ static struct platform_driver k3_pdma_driver = { module_platform_driver(k3_pdma_driver); -MODULE_DESCRIPTION("Hisilicon k3 DMA Driver"); +MODULE_DESCRIPTION("HiSilicon k3 DMA Driver"); MODULE_ALIAS("platform:k3dma"); MODULE_LICENSE("GPL v2"); From 332d1a0373be32a3a3c152756bca45ff4f4e11b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 18:15:36 -0400 Subject: [PATCH 0350/1379] NFS: nfs4_bitmask_adjust() must not change the server global bitmasks As currently set, the calls to nfs4_bitmask_adjust() will end up overwriting the contents of the nfs_server cache_consistency_bitmask field. The intention here should be to modify a private copy of that mask in the close/delegreturn/write arguments. Fixes: 76bd5c016ef4 ("NFSv4: make cache consistency bitmask dynamic") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 56 +++++++++++++++++++++++++---------------- include/linux/nfs_xdr.h | 11 +++++--- 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bca1726bcd55..1cf98c40e3b2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -108,9 +108,10 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, const struct cred *, bool); #endif -static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode, - struct nfs_server *server, - struct nfs4_label *label); +static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], + const __u32 *src, struct inode *inode, + struct nfs_server *server, + struct nfs4_label *label); #ifdef CONFIG_NFS_V4_SECURITY_LABEL static inline struct nfs4_label * @@ -3591,6 +3592,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct inode *inode = calldata->inode; + struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; bool is_rdonly, is_wronly, is_rdwr; int call_close = 0; @@ -3647,8 +3649,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) { /* Close-to-open cache consistency revalidation */ if (!nfs4_have_delegation(inode, FMODE_READ)) { - calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; - nfs4_bitmask_adjust(calldata->arg.bitmask, inode, NFS_SERVER(inode), NULL); + nfs4_bitmask_set(calldata->arg.bitmask_store, + server->cache_consistency_bitmask, + inode, server, NULL); + calldata->arg.bitmask = calldata->arg.bitmask_store; } else calldata->arg.bitmask = NULL; } @@ -5416,19 +5420,17 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } -static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode, - struct nfs_server *server, - struct nfs4_label *label) +static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, + struct inode *inode, struct nfs_server *server, + struct nfs4_label *label) { - unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + unsigned int i; - if ((cache_validity & NFS_INO_INVALID_DATA) || - (cache_validity & NFS_INO_REVAL_PAGECACHE) || - (cache_validity & NFS_INO_REVAL_FORCED) || - (cache_validity & NFS_INO_INVALID_OTHER)) - nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode); + memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ); + if (cache_validity & (NFS_INO_INVALID_CHANGE | NFS_INO_REVAL_PAGECACHE)) + bitmask[0] |= FATTR4_WORD0_CHANGE; if (cache_validity & NFS_INO_INVALID_ATIME) bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; if (cache_validity & NFS_INO_INVALID_OTHER) @@ -5437,16 +5439,22 @@ static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode, FATTR4_WORD1_NUMLINKS; if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; - if (cache_validity & NFS_INO_INVALID_CHANGE) - bitmask[0] |= FATTR4_WORD0_CHANGE; if (cache_validity & NFS_INO_INVALID_CTIME) bitmask[1] |= FATTR4_WORD1_TIME_METADATA; if (cache_validity & NFS_INO_INVALID_MTIME) bitmask[1] |= FATTR4_WORD1_TIME_MODIFY; - if (cache_validity & NFS_INO_INVALID_SIZE) - bitmask[0] |= FATTR4_WORD0_SIZE; if (cache_validity & NFS_INO_INVALID_BLOCKS) bitmask[1] |= FATTR4_WORD1_SPACE_USED; + + if (nfs4_have_delegation(inode, FMODE_READ) && + !(cache_validity & NFS_INO_REVAL_FORCED)) + bitmask[0] &= ~FATTR4_WORD0_SIZE; + else if (cache_validity & + (NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE)) + bitmask[0] |= FATTR4_WORD0_SIZE; + + for (i = 0; i < NFS4_BITMASK_SZ; i++) + bitmask[i] &= server->attr_bitmask[i]; } static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, @@ -5459,8 +5467,10 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, hdr->args.bitmask = NULL; hdr->res.fattr = NULL; } else { - hdr->args.bitmask = server->cache_consistency_bitmask; - nfs4_bitmask_adjust(hdr->args.bitmask, hdr->inode, server, NULL); + nfs4_bitmask_set(hdr->args.bitmask_store, + server->cache_consistency_bitmask, + hdr->inode, server, NULL); + hdr->args.bitmask = hdr->args.bitmask_store; } if (!hdr->pgio_done_cb) @@ -6502,8 +6512,10 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; - data->args.bitmask = server->cache_consistency_bitmask; - nfs4_bitmask_adjust(data->args.bitmask, inode, server, NULL); + nfs4_bitmask_set(data->args.bitmask_store, + server->cache_consistency_bitmask, inode, server, + NULL); + data->args.bitmask = data->args.bitmask_store; nfs_copy_fh(&data->fh, NFS_FH(inode)); nfs4_stateid_copy(&data->stateid, stateid); data->res.fattr = &data->fattr; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3327239fa2f9..cc29dee508f7 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -15,6 +15,8 @@ #define NFS_DEF_FILE_IO_SIZE (4096U) #define NFS_MIN_FILE_IO_SIZE (1024U) +#define NFS_BITMASK_SZ 3 + struct nfs4_string { unsigned int len; char *data; @@ -525,7 +527,8 @@ struct nfs_closeargs { struct nfs_seqid * seqid; fmode_t fmode; u32 share_access; - u32 * bitmask; + const u32 * bitmask; + u32 bitmask_store[NFS_BITMASK_SZ]; struct nfs4_layoutreturn_args *lr_args; }; @@ -608,7 +611,8 @@ struct nfs4_delegreturnargs { struct nfs4_sequence_args seq_args; const struct nfs_fh *fhandle; const nfs4_stateid *stateid; - u32 * bitmask; + const u32 *bitmask; + u32 bitmask_store[NFS_BITMASK_SZ]; struct nfs4_layoutreturn_args *lr_args; }; @@ -648,7 +652,8 @@ struct nfs_pgio_args { union { unsigned int replen; /* used by read */ struct { - u32 * bitmask; /* used by write */ + const u32 * bitmask; /* used by write */ + u32 bitmask_store[NFS_BITMASK_SZ]; /* used by write */ enum nfs3_stable_how stable; /* used by write */ }; }; From e99812e1382f0bfb6149393262bc70645c9f537a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 28 Mar 2021 18:12:03 -0400 Subject: [PATCH 0351/1379] NFS: Fix attribute bitmask in _nfs42_proc_fallocate() We can't use nfs4_fattr_bitmap as a bitmask, because it hasn't been filtered to represent the attributes supported by the server. Instead, let's revert to using server->cache_consistency_bitmask after adding in the missing SPACE_USED attribute. Fixes: 913eca1aea87 ("NFS: Fallocate should use the nfs4_fattr_bitmap") Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 094024b0aca1..597005bc8a05 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -46,11 +46,12 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, { struct inode *inode = file_inode(filep); struct nfs_server *server = NFS_SERVER(inode); + u32 bitmask[3]; struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, .falloc_length = len, - .falloc_bitmask = nfs4_fattr_bitmap, + .falloc_bitmask = bitmask, }; struct nfs42_falloc_res res = { .falloc_server = server, @@ -68,6 +69,10 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, return status; } + memcpy(bitmask, server->cache_consistency_bitmask, sizeof(bitmask)); + if (server->attr_bitmask[1] & FATTR4_WORD1_SPACE_USED) + bitmask[1] |= FATTR4_WORD1_SPACE_USED; + res.falloc_fattr = nfs_alloc_fattr(); if (!res.falloc_fattr) return -ENOMEM; @@ -75,7 +80,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); if (status == 0) - status = nfs_post_op_update_inode(inode, res.falloc_fattr); + status = nfs_post_op_update_inode_force_wcc(inode, + res.falloc_fattr); kfree(res.falloc_fattr); return status; From 99f23783224355e7022ceea9b8d9f62c0fd01bd8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 28 Mar 2021 18:17:14 -0400 Subject: [PATCH 0352/1379] NFSv4.2: Always flush out writes in nfs42_proc_fallocate() Whether we're allocating or delallocating space, we should flush out the pending writes in order to avoid races with attribute updates. Fixes: 1e564d3dbd68 ("NFSv4.2: Fix a race in nfs42_proc_deallocate()") Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 597005bc8a05..704a5246ccb5 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -90,7 +90,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, loff_t offset, loff_t len) { - struct nfs_server *server = NFS_SERVER(file_inode(filep)); + struct inode *inode = file_inode(filep); + struct nfs_server *server = NFS_SERVER(inode); struct nfs4_exception exception = { }; struct nfs_lock_context *lock; int err; @@ -99,9 +100,13 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (IS_ERR(lock)) return PTR_ERR(lock); - exception.inode = file_inode(filep); + exception.inode = inode; exception.state = lock->open_context->state; + err = nfs_sync_inode(inode); + if (err) + goto out; + do { err = _nfs42_proc_fallocate(msg, filep, lock, offset, len); if (err == -ENOTSUPP) { @@ -110,7 +115,7 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, } err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); - +out: nfs_put_lock_context(lock); return err; } @@ -148,16 +153,13 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return -EOPNOTSUPP; inode_lock(inode); - err = nfs_sync_inode(inode); - if (err) - goto out_unlock; err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; -out_unlock: + inode_unlock(inode); return err; } From 9fdbfad1777cb4638f489eeb62d85432010c0031 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 29 Mar 2021 16:46:05 -0400 Subject: [PATCH 0353/1379] NFS: Deal correctly with attribute generation counter overflow We need to use unsigned long subtraction and then convert to signed in order to deal correcly with C overflow rules. Fixes: f5062003465c ("NFS: Set an attribute barrier on all updates") Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ff737be559dc..8de5b3b9da91 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1662,10 +1662,10 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle); */ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { - const struct nfs_inode *nfsi = NFS_I(inode); + unsigned long attr_gencount = NFS_I(inode)->attr_gencount; - return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || - ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); + return (long)(fattr->gencount - attr_gencount) > 0 || + (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0; } static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) @@ -2094,7 +2094,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attrtimeo_timestamp = now; } /* Set the barrier to be more recent than this fattr */ - if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0) + if ((long)(fattr->gencount - nfsi->attr_gencount) > 0) nfsi->attr_gencount = fattr->gencount; } From 8a27c7ccca3b0df83f13c591f58d68b057b67780 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 15:45:09 -0400 Subject: [PATCH 0354/1379] NFS: Fix up inode cache tracing Add missing enum definitions and missing entries for nfs_show_cache_validity(). Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 5a59dcdce0b2..cdba6eebe3cb 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -45,6 +45,9 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_CTIME); TRACE_DEFINE_ENUM(NFS_INO_INVALID_MTIME); TRACE_DEFINE_ENUM(NFS_INO_INVALID_SIZE); TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER); +TRACE_DEFINE_ENUM(NFS_INO_DATA_INVAL_DEFER); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_BLOCKS); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR); #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ @@ -60,6 +63,8 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER); { NFS_INO_INVALID_MTIME, "INVALID_MTIME" }, \ { NFS_INO_INVALID_SIZE, "INVALID_SIZE" }, \ { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \ + { NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \ + { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \ { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }) TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS); From 4eb6a8230b589eb2bb02c4ce7418032552ef6470 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 15:59:53 -0400 Subject: [PATCH 0355/1379] NFS: Mask out unsupported attributes in nfs_getattr() We don't currently support STATX_BTIME, so don't advertise it in the return values for nfs_getattr(). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8de5b3b9da91..c429a5375b92 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -815,6 +815,10 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, trace_nfs_getattr_enter(inode); + request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID | + STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME | + STATX_INO | STATX_SIZE | STATX_BLOCKS; + if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { nfs_readdirplus_parent_cache_hit(path->dentry); goto out_no_update; From 50c7a7994dd20af56e4d47e90af10bab71b71001 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 20:46:36 -0400 Subject: [PATCH 0356/1379] NFS: NFS_INO_REVAL_PAGECACHE should mark the change attribute invalid When we're looking to revalidate the page cache, we should just ensure that we mark the change attribute invalid. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c429a5375b92..a9b302b389a7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -219,7 +219,8 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) | NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_XATTR); - } + } else if (flags & NFS_INO_REVAL_PAGECACHE) + flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE; if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; From 4cdfeb648ac56ef0f961549528ccebd76042f1d3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Mar 2021 12:29:41 -0400 Subject: [PATCH 0357/1379] NFS: Fix up revalidation of space used Ensure that when the change attribute or the size change, we also remember to revalidate the space used. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index a9b302b389a7..8b2f4a5eaa42 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -565,12 +565,13 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; - if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + else if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } + } else if (fattr->size != 0) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); if (nfsi->cache_validity != 0) nfsi->cache_validity |= NFS_INO_REVAL_FORCED; @@ -711,7 +712,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & ATTR_SIZE) != 0) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME | + NFS_INO_INVALID_BLOCKS); nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } @@ -1933,6 +1935,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) save_cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE + | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); @@ -1990,6 +1993,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) (long long)cur_isize, (long long)new_isize); } + if (new_isize == 0 && + !(fattr->valid & (NFS_ATTR_FATTR_SPACE_USED | + NFS_ATTR_FATTR_BLOCKS_USED))) { + fattr->du.nfs3.used = 0; + fattr->valid |= NFS_ATTR_FATTR_SPACE_USED; + } } else { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_SIZE From e8764a6f96d3ca17be7466c7b0fddef64f4e3d14 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Mar 2021 13:55:09 -0400 Subject: [PATCH 0358/1379] NFS: Don't revalidate attributes that are not being asked for If the user doesn't set STATX_UID/GID/MODE, then don't care if they are known to be stale. Ditto if we're not being asked for the file size. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8b2f4a5eaa42..4982bc18ee26 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -857,12 +857,17 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /* Check whether the cached attributes are stale */ do_update |= force_sync || nfs_attribute_cache_expired(inode); cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); - do_update |= cache_validity & - (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL); + do_update |= cache_validity & NFS_INO_INVALID_CHANGE; if (request_mask & STATX_ATIME) do_update |= cache_validity & NFS_INO_INVALID_ATIME; - if (request_mask & (STATX_CTIME|STATX_MTIME)) - do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE; + if (request_mask & STATX_CTIME) + do_update |= cache_validity & NFS_INO_INVALID_CTIME; + if (request_mask & STATX_MTIME) + do_update |= cache_validity & NFS_INO_INVALID_MTIME; + if (request_mask & STATX_SIZE) + do_update |= cache_validity & NFS_INO_INVALID_SIZE; + if (request_mask & (STATX_UID | STATX_GID | STATX_MODE | STATX_NLINK)) + do_update |= cache_validity & NFS_INO_INVALID_OTHER; if (request_mask & STATX_BLOCKS) do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; if (do_update) { From 63cdd7edfd2871e63f4ca001ff6b8e1e166a74ae Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 24 Mar 2021 14:29:12 -0400 Subject: [PATCH 0359/1379] NFS: Fix up statx() results If statx has valid attributes available that weren't asked for, then return them and set the result mask appropriately. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4982bc18ee26..8c2d5f333e81 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -806,6 +806,28 @@ static bool nfs_need_revalidate_inode(struct inode *inode) return false; } +static u32 nfs_get_valid_attrmask(struct inode *inode) +{ + unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + u32 reply_mask = STATX_INO | STATX_TYPE; + + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + reply_mask |= STATX_ATIME; + if (!(cache_validity & NFS_INO_REVAL_PAGECACHE)) { + if (!(cache_validity & NFS_INO_INVALID_CTIME)) + reply_mask |= STATX_CTIME; + if (!(cache_validity & NFS_INO_INVALID_MTIME)) + reply_mask |= STATX_MTIME; + if (!(cache_validity & NFS_INO_INVALID_SIZE)) + reply_mask |= STATX_SIZE; + } + if (!(cache_validity & NFS_INO_INVALID_OTHER)) + reply_mask |= STATX_UID | STATX_GID | STATX_MODE | STATX_NLINK; + if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) + reply_mask |= STATX_BLOCKS; + return reply_mask; +} + int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { @@ -824,7 +846,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { nfs_readdirplus_parent_cache_hit(path->dentry); - goto out_no_update; + goto out_no_revalidate; } /* Flush out writes to the server in order to update c/mtime. */ @@ -870,6 +892,7 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, do_update |= cache_validity & NFS_INO_INVALID_OTHER; if (request_mask & STATX_BLOCKS) do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; + if (do_update) { /* Update the attribute cache */ if (!(server->flags & NFS_MOUNT_NOAC)) @@ -883,8 +906,8 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, nfs_readdirplus_parent_cache_hit(path->dentry); out_no_revalidate: /* Only return attributes that were revalidated. */ - stat->result_mask &= request_mask; -out_no_update: + stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; + generic_fillattr(&init_user_ns, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); if (S_ISDIR(inode->i_mode)) From 1f9f4328155a6944903e2364f38bb6ed1e1ea9e9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Apr 2021 20:08:00 -0400 Subject: [PATCH 0360/1379] NFS: nfs_setattr_update_inode() should clear the suid/sgid bits When we do a 'chown' or 'chgrp', the server will clear the suid/sgid bits. Ensure that we mirror that in nfs_setattr_update_inode(). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8c2d5f333e81..d34da63202cc 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -636,8 +636,7 @@ nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Optimization: if the end result is no change, don't RPC */ - attr->ia_valid &= NFS_VALID_ATTRS; - if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) + if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; trace_nfs_setattr_enter(inode); @@ -719,6 +718,13 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, } if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME; + if ((attr->ia_valid & ATTR_KILL_SUID) != 0 && + inode->i_mode & S_ISUID) + inode->i_mode &= ~S_ISUID; + if ((attr->ia_valid & ATTR_KILL_SGID) != 0 && + (inode->i_mode & (S_ISGID | S_IXGRP)) == + (S_ISGID | S_IXGRP)) + inode->i_mode &= ~S_ISGID; if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; From 1f3208b2d6975f31b9c7c6bf174b84fe9c97492f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 11:04:34 -0400 Subject: [PATCH 0361/1379] NFS: Add a cache validity flag argument to nfs_revalidate_inode() Add an argument to nfs_revalidate_inode() to allow callers to specify which attributes they need to check for validity. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/export.c | 6 +----- fs/nfs/inode.c | 25 +++++++------------------ fs/nfs/nfs3acl.c | 2 +- fs/nfs/nfs4proc.c | 6 +++--- include/linux/nfs_fs.h | 2 +- 6 files changed, 14 insertions(+), 29 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 0cd7c59a6601..e924d65c125e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -3006,7 +3006,7 @@ out_notsup: if (mask & MAY_NOT_BLOCK) return -ECHILD; - res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + res = nfs_revalidate_inode(inode, NFS_INO_INVALID_OTHER); if (res == 0) res = generic_permission(&init_user_ns, inode, mask); goto out; diff --git a/fs/nfs/export.c b/fs/nfs/export.c index b347e3ce0cc8..37a1a88df771 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -169,11 +169,7 @@ out: static u64 nfs_fetch_iversion(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); - - if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE | - NFS_INO_REVAL_PAGECACHE)) - __nfs_revalidate_inode(server, inode); + nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); return inode_peek_iversion_raw(inode); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d34da63202cc..b9aac408f03a 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -802,16 +802,6 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) dput(parent); } -static bool nfs_need_revalidate_inode(struct inode *inode) -{ - if (NFS_I(inode)->cache_validity & - (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) - return true; - if (nfs_attribute_cache_expired(inode)) - return true; - return false; -} - static u32 nfs_get_valid_attrmask(struct inode *inode) { unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); @@ -1004,7 +994,6 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) { struct nfs_inode *nfsi; struct inode *inode; - struct nfs_server *server; if (!(ctx->mode & FMODE_WRITE)) return; @@ -1020,10 +1009,10 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) return; if (!list_empty(&nfsi->open_files)) return; - server = NFS_SERVER(inode); - if (server->flags & NFS_MOUNT_NOCTO) + if (NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO) return; - nfs_revalidate_inode(server, inode); + nfs_revalidate_inode(inode, + NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); } EXPORT_SYMBOL_GPL(nfs_close_context); @@ -1278,16 +1267,16 @@ int nfs_attribute_cache_expired(struct inode *inode) /** * nfs_revalidate_inode - Revalidate the inode attributes - * @server: pointer to nfs_server struct * @inode: pointer to inode struct + * @flags: cache flags to check * * Updates inode attribute information by retrieving the data from the server. */ -int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) +int nfs_revalidate_inode(struct inode *inode, unsigned long flags) { - if (!nfs_need_revalidate_inode(inode)) + if (!nfs_check_cache_invalid(inode, flags)) return NFS_STALE(inode) ? -ESTALE : 0; - return __nfs_revalidate_inode(server, inode); + return __nfs_revalidate_inode(NFS_SERVER(inode), inode); } EXPORT_SYMBOL_GPL(nfs_revalidate_inode); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index bb386a691e69..9ec560aa4a50 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -65,7 +65,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) if (!nfs_server_capable(inode, NFS_CAP_ACLS)) return ERR_PTR(-EOPNOTSUPP); - status = nfs_revalidate_inode(server, inode); + status = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (status < 0) return ERR_PTR(status); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1cf98c40e3b2..6b990fe5bc1f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5868,7 +5868,7 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) if (!nfs4_server_supports_acls(server)) return -EOPNOTSUPP; - ret = nfs_revalidate_inode(server, inode); + ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (ret < 0) return ret; if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) @@ -7619,7 +7619,7 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, return -EACCES; } - ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (ret) return ret; @@ -7650,7 +7650,7 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) return 0; } - ret = nfs_revalidate_inode(NFS_SERVER(inode), inode); + ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); if (ret) return ret; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index eadaabd18dc7..624ffd47a9d4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -386,7 +386,7 @@ extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct user_namespace *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_attribute_cache_expired(struct inode *inode); -extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode); +extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_clear_invalid_mapping(struct address_space *mapping); extern bool nfs_mapping_need_revalidate_inode(struct inode *inode); From 13c0b082b6a90b1b87b5fb100983d05bcc75d9b6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 21:07:21 -0400 Subject: [PATCH 0362/1379] NFS: Replace use of NFS_INO_REVAL_PAGECACHE when checking cache validity When checking cache validity, be more specific than just 'we want to check the page cache validity'. In almost all cases, we want to check that change attribute, and possibly also the size. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 +- fs/nfs/inode.c | 41 ++++++++++++----------------------------- fs/nfs/nfs4proc.c | 5 ++--- fs/nfs/write.c | 2 +- 4 files changed, 16 insertions(+), 34 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 16ad5050e046..1fef107961bc 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -105,7 +105,7 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) if (filp->f_flags & O_DIRECT) goto force_reval; - if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE)) + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_SIZE)) goto force_reval; return 0; force_reval: diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b9aac408f03a..aec402d048eb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -164,34 +164,19 @@ static int nfs_attribute_timeout(struct inode *inode) return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } -static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags) +static bool nfs_check_cache_flags_invalid(struct inode *inode, + unsigned long flags) { unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); - /* Special case for the pagecache or access cache */ - if (flags == NFS_INO_REVAL_PAGECACHE && - !(cache_validity & NFS_INO_REVAL_FORCED)) - return false; return (cache_validity & flags) != 0; } -static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags) -{ - unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); - - if ((cache_validity & flags) != 0) - return true; - if (nfs_attribute_timeout(inode)) - return true; - return false; -} - bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) { - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - return nfs_check_cache_invalid_delegated(inode, flags); - - return nfs_check_cache_invalid_not_delegated(inode, flags); + if (nfs_check_cache_flags_invalid(inode, flags)) + return true; + return nfs_attribute_cache_expired(inode); } EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); @@ -809,14 +794,12 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) if (!(cache_validity & NFS_INO_INVALID_ATIME)) reply_mask |= STATX_ATIME; - if (!(cache_validity & NFS_INO_REVAL_PAGECACHE)) { - if (!(cache_validity & NFS_INO_INVALID_CTIME)) - reply_mask |= STATX_CTIME; - if (!(cache_validity & NFS_INO_INVALID_MTIME)) - reply_mask |= STATX_MTIME; - if (!(cache_validity & NFS_INO_INVALID_SIZE)) - reply_mask |= STATX_SIZE; - } + if (!(cache_validity & NFS_INO_INVALID_CTIME)) + reply_mask |= STATX_CTIME; + if (!(cache_validity & NFS_INO_INVALID_MTIME)) + reply_mask |= STATX_MTIME; + if (!(cache_validity & NFS_INO_INVALID_SIZE)) + reply_mask |= STATX_SIZE; if (!(cache_validity & NFS_INO_INVALID_OTHER)) reply_mask |= STATX_UID | STATX_GID | STATX_MODE | STATX_NLINK; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) @@ -1362,7 +1345,7 @@ out: bool nfs_mapping_need_revalidate_inode(struct inode *inode) { - return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || + return nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE) || NFS_STALE(inode); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6b990fe5bc1f..a21311520404 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5429,7 +5429,7 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ); - if (cache_validity & (NFS_INO_INVALID_CHANGE | NFS_INO_REVAL_PAGECACHE)) + if (cache_validity & NFS_INO_INVALID_CHANGE) bitmask[0] |= FATTR4_WORD0_CHANGE; if (cache_validity & NFS_INO_INVALID_ATIME) bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; @@ -5449,8 +5449,7 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, if (nfs4_have_delegation(inode, FMODE_READ) && !(cache_validity & NFS_INO_REVAL_FORCED)) bitmask[0] &= ~FATTR4_WORD0_SIZE; - else if (cache_validity & - (NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE)) + else if (cache_validity & NFS_INO_INVALID_SIZE) bitmask[0] |= FATTR4_WORD0_SIZE; for (i = 0; i < NFS4_BITMASK_SZ; i++) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f05a90338a76..7a39b3d424da 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1293,7 +1293,7 @@ static bool nfs_write_pageuptodate(struct page *page, struct inode *inode, if (nfs_have_delegated_attributes(inode)) goto out; if (nfsi->cache_validity & - (NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_SIZE)) + (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE)) return false; smp_rmb(); if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0) From 36a9346c225270262d9f34e66c91aa1723fa903f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 21:20:32 -0400 Subject: [PATCH 0363/1379] NFS: Don't set NFS_INO_REVAL_PAGECACHE in the inode cache validity It is no longer necessary to preserve the NFS_INO_REVAL_PAGECACHE flag. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 6 ++---- fs/nfs/nfs4proc.c | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index aec402d048eb..3d18e66a4b8f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -202,11 +202,12 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) flags &= ~NFS_INO_INVALID_OTHER; flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE - | NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_XATTR); } else if (flags & NFS_INO_REVAL_PAGECACHE) flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE; + flags &= ~NFS_INO_REVAL_PAGECACHE; + if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; if (flags & NFS_INO_INVALID_DATA) @@ -1917,7 +1918,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED - | NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_BLOCKS); /* Do atomic weak cache consistency updates */ @@ -1956,7 +1956,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } else { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_CHANGE - | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); cache_revalidated = false; } @@ -2008,7 +2007,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } else { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_SIZE - | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); cache_revalidated = false; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a21311520404..a490f1373765 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1191,7 +1191,6 @@ nfs4_update_changeattr_locked(struct inode *inode, cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) { - nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; nfsi->attrtimeo_timestamp = jiffies; } else { if (S_ISDIR(inode->i_mode)) { From 80c253bd7ffbb40ae550c8ae5bed9126b8f99f96 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 26 Mar 2021 19:09:09 +0000 Subject: [PATCH 0364/1379] PCI: endpoint: Remove redundant initialization of pointer dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pointer dev is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Link: https://lore.kernel.org/r/20210326190909.622369-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński --- drivers/pci/endpoint/pci-epf-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/endpoint/pci-epf-core.c b/drivers/pci/endpoint/pci-epf-core.c index 7646c8660d42..e9289d10f822 100644 --- a/drivers/pci/endpoint/pci-epf-core.c +++ b/drivers/pci/endpoint/pci-epf-core.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(pci_epf_bind); void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar, enum pci_epc_interface_type type) { - struct device *dev = epf->epc->dev.parent; + struct device *dev; struct pci_epf_bar *epf_bar; struct pci_epc *epc; From acaef7981a218813e3617edb9c01837808de063c Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 31 Mar 2021 16:40:12 +0800 Subject: [PATCH 0365/1379] PCI: endpoint: Fix missing destroy_workqueue() Add the missing destroy_workqueue() before return from pci_epf_test_init() in the error handling case and add destroy_workqueue() in pci_epf_test_exit(). Link: https://lore.kernel.org/r/20210331084012.2091010-1-yangyingliang@huawei.com Fixes: 349e7a85b25fa ("PCI: endpoint: functions: Add an EP function to test PCI") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Lorenzo Pieralisi --- drivers/pci/endpoint/functions/pci-epf-test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c index bc35b3566be6..f9760e73d568 100644 --- a/drivers/pci/endpoint/functions/pci-epf-test.c +++ b/drivers/pci/endpoint/functions/pci-epf-test.c @@ -925,6 +925,7 @@ static int __init pci_epf_test_init(void) ret = pci_epf_register_driver(&test_driver); if (ret) { + destroy_workqueue(kpcitest_workqueue); pr_err("Failed to register pci epf test driver --> %d\n", ret); return ret; } @@ -935,6 +936,8 @@ module_init(pci_epf_test_init); static void __exit pci_epf_test_exit(void) { + if (kpcitest_workqueue) + destroy_workqueue(kpcitest_workqueue); pci_epf_unregister_driver(&test_driver); } module_exit(pci_epf_test_exit); From 87db343f809d611405c45cbb691657e7df5c047d Mon Sep 17 00:00:00 2001 From: Qiheng Lin Date: Wed, 31 Mar 2021 16:59:38 +0800 Subject: [PATCH 0366/1379] PCI: mediatek: Add missing MODULE_DEVICE_TABLE This patch adds missing MODULE_DEVICE_TABLE definition which generates correct modalias for automatic loading of this driver when it is built as an external module. Link: https://lore.kernel.org/r/20210331085938.3115-1-linqiheng@huawei.com Reported-by: Hulk Robot Signed-off-by: Qiheng Lin Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pcie-mediatek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index 8af04848e0f9..ddbd2f8b3226 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -1210,6 +1210,7 @@ static const struct of_device_id mtk_pcie_ids[] = { { .compatible = "mediatek,mt7629-pcie", .data = &mtk_pcie_soc_mt7629 }, {}, }; +MODULE_DEVICE_TABLE(of, mtk_pcie_ids); static struct platform_driver mtk_pcie_driver = { .probe = mtk_pcie_probe, From a71029b86752e8d40301af235a6bbf4896cc1402 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 10 Apr 2021 00:23:03 -0400 Subject: [PATCH 0367/1379] NFSv4: Fix nfs4_bitmap_copy_adjust() Don't remove flags from the set retrieved from the cache_validity. We do want to retrieve all attributes that are listed as being invalid, whether or not there is a delegation set. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a490f1373765..8dc595ce40ca 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -284,7 +284,7 @@ const u32 nfs4_fs_locations_bitmap[3] = { }; static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, - struct inode *inode) + struct inode *inode, unsigned long flags) { unsigned long cache_validity; @@ -292,22 +292,19 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, if (!inode || !nfs4_have_delegation(inode, FMODE_READ)) return; - cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); - if (!(cache_validity & NFS_INO_REVAL_FORCED)) - cache_validity &= ~(NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_SIZE); + cache_validity = READ_ONCE(NFS_I(inode)->cache_validity) | flags; + /* Remove the attributes over which we have full control */ + dst[1] &= ~FATTR4_WORD1_RAWDEV; if (!(cache_validity & NFS_INO_INVALID_SIZE)) dst[0] &= ~FATTR4_WORD0_SIZE; if (!(cache_validity & NFS_INO_INVALID_CHANGE)) dst[0] &= ~FATTR4_WORD0_CHANGE; -} -static void nfs4_bitmap_copy_adjust_setattr(__u32 *dst, - const __u32 *src, struct inode *inode) -{ - nfs4_bitmap_copy_adjust(dst, src, inode); + if (!(cache_validity & NFS_INO_INVALID_OTHER)) + dst[1] &= ~(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | + FATTR4_WORD1_OWNER_GROUP); } static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, @@ -3344,12 +3341,15 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, .inode = inode, .stateid = &arg.stateid, }; + unsigned long adjust_flags = NFS_INO_INVALID_CHANGE; int err; + if (sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + adjust_flags |= NFS_INO_INVALID_OTHER; + do { - nfs4_bitmap_copy_adjust_setattr(bitmask, - nfs4_bitmask(server, olabel), - inode); + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, olabel), + inode, adjust_flags); err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx); switch (err) { @@ -4157,8 +4157,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, if (inode && (server->flags & NFS_MOUNT_SOFTREVAL)) task_flags |= RPC_TASK_TIMEOUT; - nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode); - + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, label), inode, 0); nfs_fattr_init(fattr); nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); return nfs4_do_call_sync(server->client, server, &msg, @@ -4764,8 +4763,8 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct } nfs4_inode_make_writeable(inode); - nfs4_bitmap_copy_adjust_setattr(bitmask, nfs4_bitmask(server, res.label), inode); - + nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, res.label), inode, + NFS_INO_INVALID_CHANGE); status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start, From fabf2b341502e894001d70f91309dd6f3785e2dc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 13:14:42 -0400 Subject: [PATCH 0368/1379] NFS: Separate tracking of file nlinks cache validity from the mode/uid/gid Rename can cause us to revalidate the access cache, so lets track the nlinks separately from the mode/uid/gid. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/inode.c | 15 ++++++++++----- fs/nfs/nfs4proc.c | 13 +++++++------ fs/nfs/nfstrace.h | 4 +++- include/linux/nfs_fs.h | 2 ++ 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e924d65c125e..f748d2294261 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1711,7 +1711,7 @@ static void nfs_drop_nlink(struct inode *inode) NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); nfs_set_cache_invalid( inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | - NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); + NFS_INO_INVALID_NLINK | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3d18e66a4b8f..7bf9138330f2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -538,7 +538,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) @@ -801,8 +801,10 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) reply_mask |= STATX_MTIME; if (!(cache_validity & NFS_INO_INVALID_SIZE)) reply_mask |= STATX_SIZE; + if (!(cache_validity & NFS_INO_INVALID_NLINK)) + reply_mask |= STATX_NLINK; if (!(cache_validity & NFS_INO_INVALID_OTHER)) - reply_mask |= STATX_UID | STATX_GID | STATX_MODE | STATX_NLINK; + reply_mask |= STATX_UID | STATX_GID | STATX_MODE; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; return reply_mask; @@ -868,7 +870,9 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, do_update |= cache_validity & NFS_INO_INVALID_MTIME; if (request_mask & STATX_SIZE) do_update |= cache_validity & NFS_INO_INVALID_SIZE; - if (request_mask & (STATX_UID | STATX_GID | STATX_MODE | STATX_NLINK)) + if (request_mask & STATX_NLINK) + do_update |= cache_validity & NFS_INO_INVALID_NLINK; + if (request_mask & (STATX_UID | STATX_GID | STATX_MODE)) do_update |= cache_validity & NFS_INO_INVALID_OTHER; if (request_mask & STATX_BLOCKS) do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; @@ -1518,7 +1522,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat /* Has the link count changed? */ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) - invalid |= NFS_INO_INVALID_OTHER; + invalid |= NFS_INO_INVALID_NLINK; ts = inode->i_atime; if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) @@ -1942,6 +1946,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS + | NFS_INO_INVALID_NLINK | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); @@ -2074,7 +2079,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } } else if (server->caps & NFS_CAP_NLINK) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_OTHER + (NFS_INO_INVALID_NLINK | NFS_INO_REVAL_FORCED); cache_revalidated = false; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8dc595ce40ca..a74c1c3c4192 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1167,14 +1167,14 @@ int nfs4_call_sync(struct rpc_clnt *clnt, static void nfs4_inc_nlink_locked(struct inode *inode) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); inc_nlink(inode); } static void nfs4_dec_nlink_locked(struct inode *inode) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); drop_nlink(inode); } @@ -4717,11 +4717,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, /* Note: If we moved a directory, nlink will change */ nfs4_update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start, - NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_NLINK | NFS_INO_INVALID_DATA); nfs4_update_changeattr(new_dir, &res->new_cinfo, res->new_fattr->time_start, - NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_NLINK | NFS_INO_INVALID_DATA); } else nfs4_update_changeattr(old_dir, &res->old_cinfo, @@ -5433,8 +5433,9 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; if (cache_validity & NFS_INO_INVALID_OTHER) bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | - FATTR4_WORD1_OWNER_GROUP | - FATTR4_WORD1_NUMLINKS; + FATTR4_WORD1_OWNER_GROUP; + if (cache_validity & NFS_INO_INVALID_NLINK) + bitmask[1] |= FATTR4_WORD1_NUMLINKS; if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; if (cache_validity & NFS_INO_INVALID_CTIME) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index cdba6eebe3cb..a0ebc53160dd 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -48,6 +48,7 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_OTHER); TRACE_DEFINE_ENUM(NFS_INO_DATA_INVAL_DEFER); TRACE_DEFINE_ENUM(NFS_INO_INVALID_BLOCKS); TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK); #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ @@ -65,7 +66,8 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR); { NFS_INO_INVALID_OTHER, "INVALID_OTHER" }, \ { NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \ { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \ - { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }) + { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \ + { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }) TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS); TRACE_DEFINE_ENUM(NFS_INO_STALE); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 624ffd47a9d4..41165b988dfb 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -246,11 +246,13 @@ struct nfs4_copy_state { BIT(13) /* Deferred cache invalidation */ #define NFS_INO_INVALID_BLOCKS BIT(14) /* cached blocks are invalid */ #define NFS_INO_INVALID_XATTR BIT(15) /* xattrs are invalid */ +#define NFS_INO_INVALID_NLINK BIT(16) /* cached nlinks is invalid */ #define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ | NFS_INO_INVALID_CTIME \ | NFS_INO_INVALID_MTIME \ | NFS_INO_INVALID_SIZE \ + | NFS_INO_INVALID_NLINK \ | NFS_INO_INVALID_OTHER) /* inode metadata is invalid */ /* From 720869eb19f3161980d6d4631d3df7e8c5355993 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 13 Apr 2021 09:41:16 -0400 Subject: [PATCH 0369/1379] NFS: Separate tracking of file mode cache validity from the uid/gid chown()/chgrp() and chmod() are separate operations, and in addition, there are mode operations that are performed automatically by the server. So let's track mode validity separately from the file ownership validity. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 5 +++-- fs/nfs/inode.c | 18 ++++++++++++------ fs/nfs/nfs4proc.c | 14 +++++++++----- fs/nfs/nfstrace.h | 4 +++- fs/nfs/write.c | 2 +- include/linux/nfs_fs.h | 2 ++ 6 files changed, 30 insertions(+), 15 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f748d2294261..d2835d211a73 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2948,7 +2948,7 @@ static int nfs_execute_ok(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) return 0; - if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_OTHER)) { + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) { if (mask & MAY_NOT_BLOCK) return -ECHILD; ret = __nfs_revalidate_inode(server, inode); @@ -3006,7 +3006,8 @@ out_notsup: if (mask & MAY_NOT_BLOCK) return -ECHILD; - res = nfs_revalidate_inode(inode, NFS_INO_INVALID_OTHER); + res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE | + NFS_INO_INVALID_OTHER); if (res == 0) res = generic_permission(&init_user_ns, inode, mask); goto out; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7bf9138330f2..81e3e140e923 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -199,7 +199,8 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) if (have_delegation) { if (!(flags & NFS_INO_REVAL_FORCED)) - flags &= ~NFS_INO_INVALID_OTHER; + flags &= ~(NFS_INO_INVALID_MODE | + NFS_INO_INVALID_OTHER); flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_XATTR); @@ -472,7 +473,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->cache_validity = 0; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -803,8 +804,10 @@ static u32 nfs_get_valid_attrmask(struct inode *inode) reply_mask |= STATX_SIZE; if (!(cache_validity & NFS_INO_INVALID_NLINK)) reply_mask |= STATX_NLINK; + if (!(cache_validity & NFS_INO_INVALID_MODE)) + reply_mask |= STATX_MODE; if (!(cache_validity & NFS_INO_INVALID_OTHER)) - reply_mask |= STATX_UID | STATX_GID | STATX_MODE; + reply_mask |= STATX_UID | STATX_GID; if (!(cache_validity & NFS_INO_INVALID_BLOCKS)) reply_mask |= STATX_BLOCKS; return reply_mask; @@ -872,7 +875,9 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, do_update |= cache_validity & NFS_INO_INVALID_SIZE; if (request_mask & STATX_NLINK) do_update |= cache_validity & NFS_INO_INVALID_NLINK; - if (request_mask & (STATX_UID | STATX_GID | STATX_MODE)) + if (request_mask & STATX_MODE) + do_update |= cache_validity & NFS_INO_INVALID_MODE; + if (request_mask & (STATX_UID | STATX_GID)) do_update |= cache_validity & NFS_INO_INVALID_OTHER; if (request_mask & STATX_BLOCKS) do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; @@ -1510,7 +1515,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_OTHER; + | NFS_INO_INVALID_MODE; if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL @@ -1947,6 +1952,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK + | NFS_INO_INVALID_MODE | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); @@ -2037,7 +2043,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } } else if (server->caps & NFS_CAP_MODE) { nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_OTHER + (NFS_INO_INVALID_MODE | NFS_INO_REVAL_FORCED); cache_revalidated = false; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a74c1c3c4192..bc90f2a12d5d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -302,9 +302,10 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, if (!(cache_validity & NFS_INO_INVALID_CHANGE)) dst[0] &= ~FATTR4_WORD0_CHANGE; + if (!(cache_validity & NFS_INO_INVALID_MODE)) + dst[1] &= ~FATTR4_WORD1_MODE; if (!(cache_validity & NFS_INO_INVALID_OTHER)) - dst[1] &= ~(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | - FATTR4_WORD1_OWNER_GROUP); + dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP); } static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dentry, @@ -3344,7 +3345,9 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, unsigned long adjust_flags = NFS_INO_INVALID_CHANGE; int err; - if (sattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) + if (sattr->ia_valid & (ATTR_MODE | ATTR_KILL_SUID | ATTR_KILL_SGID)) + adjust_flags |= NFS_INO_INVALID_MODE; + if (sattr->ia_valid & (ATTR_UID | ATTR_GID)) adjust_flags |= NFS_INO_INVALID_OTHER; do { @@ -5431,9 +5434,10 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, bitmask[0] |= FATTR4_WORD0_CHANGE; if (cache_validity & NFS_INO_INVALID_ATIME) bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; + if (cache_validity & NFS_INO_INVALID_MODE) + bitmask[1] |= FATTR4_WORD1_MODE; if (cache_validity & NFS_INO_INVALID_OTHER) - bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | - FATTR4_WORD1_OWNER_GROUP; + bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP; if (cache_validity & NFS_INO_INVALID_NLINK) bitmask[1] |= FATTR4_WORD1_NUMLINKS; if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index a0ebc53160dd..41a161cd31f6 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -49,6 +49,7 @@ TRACE_DEFINE_ENUM(NFS_INO_DATA_INVAL_DEFER); TRACE_DEFINE_ENUM(NFS_INO_INVALID_BLOCKS); TRACE_DEFINE_ENUM(NFS_INO_INVALID_XATTR); TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK); +TRACE_DEFINE_ENUM(NFS_INO_INVALID_MODE); #define nfs_show_cache_validity(v) \ __print_flags(v, "|", \ @@ -67,7 +68,8 @@ TRACE_DEFINE_ENUM(NFS_INO_INVALID_NLINK); { NFS_INO_DATA_INVAL_DEFER, "DATA_INVAL_DEFER" }, \ { NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \ { NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \ - { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }) + { NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \ + { NFS_INO_INVALID_MODE, "INVALID_MODE" }) TRACE_DEFINE_ENUM(NFS_INO_ADVISE_RDPLUS); TRACE_DEFINE_ENUM(NFS_INO_STALE); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7a39b3d424da..61d1174935b6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1604,7 +1604,7 @@ static int nfs_writeback_done(struct rpc_task *task, /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); spin_unlock(&inode->i_lock); } return 0; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 41165b988dfb..ffba254d2098 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -247,12 +247,14 @@ struct nfs4_copy_state { #define NFS_INO_INVALID_BLOCKS BIT(14) /* cached blocks are invalid */ #define NFS_INO_INVALID_XATTR BIT(15) /* xattrs are invalid */ #define NFS_INO_INVALID_NLINK BIT(16) /* cached nlinks is invalid */ +#define NFS_INO_INVALID_MODE BIT(17) /* cached mode is invalid */ #define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ | NFS_INO_INVALID_CTIME \ | NFS_INO_INVALID_MTIME \ | NFS_INO_INVALID_SIZE \ | NFS_INO_INVALID_NLINK \ + | NFS_INO_INVALID_MODE \ | NFS_INO_INVALID_OTHER) /* inode metadata is invalid */ /* From 709fa5769914b377af87962bbe4ff81ffb019b2d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 14:51:36 -0400 Subject: [PATCH 0370/1379] NFS: Fix up handling of outstanding layoutcommit in nfs_update_inode() If there is an outstanding layoutcommit, then the list of attributes whose values are expected to change is not the full set. So let's be explicit about the full list. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 81e3e140e923..18c7277d17a8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1933,7 +1933,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_wcc_update_inode(inode, fattr); if (pnfs_layoutcommit_outstanding(inode)) { - nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; + nfsi->cache_validity |= + save_cache_validity & + (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | + NFS_INO_REVAL_FORCED); cache_revalidated = false; } From c88c696c59f5775d8781d632f7d10823563027eb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 14:53:48 -0400 Subject: [PATCH 0371/1379] NFS: Remove a line of code that has no effect in nfs_update_inode() Commit 0b467264d0db ("NFS: Fix attribute revalidation") changed the way we populate the 'invalid' attribute, and made the line that strips away the NFS_INO_INVALID_ATTR bits redundant. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 18c7277d17a8..e4333b6ab2d4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2110,7 +2110,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* Update attrtimeo value if we're out of the unstable period */ if (attr_changed) { - invalid &= ~NFS_INO_INVALID_ATTR; nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; From 04c63498b6d4ea445cd2cb02599467b48687244b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2021 18:13:36 -0400 Subject: [PATCH 0372/1379] NFS: Simplify cache consistency in nfs_check_inode_attributes() We should not be invalidating the access or acl caches in nfs_check_inode_attributes(), since the point is we're unsure about whether the contents of the struct nfs_fattr are fully up to date. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e4333b6ab2d4..fc8e92794636 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1491,8 +1491,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) - invalid |= NFS_INO_INVALID_CHANGE - | NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_CHANGE; ts = inode->i_mtime; if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) @@ -1506,24 +1505,17 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); if (cur_size != new_isize) - invalid |= NFS_INO_INVALID_SIZE - | NFS_INO_REVAL_PAGECACHE; + invalid |= NFS_INO_INVALID_SIZE; } } /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) - invalid |= NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_MODE; + invalid |= NFS_INO_INVALID_MODE; if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) - invalid |= NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_OTHER; + invalid |= NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) - invalid |= NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_INVALID_OTHER; + invalid |= NFS_INO_INVALID_OTHER; /* Has the link count changed? */ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) From 57a789a1dee39e1625956d7dea4f7637f9160f21 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 26 Mar 2021 10:17:25 -0400 Subject: [PATCH 0373/1379] NFSv4: Fix value of decode_fsinfo_maxsz At least two extra fields have been added to fsinfo since this was last updated. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ac6b79ee9355..d8a1911dd39e 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -144,7 +144,16 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, * layout types will be returned. */ #define decode_fsinfo_maxsz (op_decode_hdr_maxsz + \ - nfs4_fattr_bitmap_maxsz + 4 + 8 + 5) + nfs4_fattr_bitmap_maxsz + 1 + \ + 1 /* lease time */ + \ + 2 /* max filesize */ + \ + 2 /* max read */ + \ + 2 /* max write */ + \ + nfstime4_maxsz /* time delta */ + \ + 5 /* fs layout types */ + \ + 1 /* layout blksize */ + \ + 1 /* clone blksize */ + \ + 1 /* xattr support */) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) #define encode_setclientid_maxsz \ From 993e2d4bd9efc41f0943de39e9374ffdfde62e87 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 12 Apr 2021 12:15:50 -0400 Subject: [PATCH 0374/1379] NFSv4: Don't modify the change attribute cached in the inode When the client is caching data and a write delegation is held, then the server may send a CB_GETATTR to query the attributes. When this happens, the client is supposed to bump the change attribute value that it returns if it holds cached data. However that process uses a value that is stored in the delegation. We do not want to bump the change attribute held in the inode. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 61d1174935b6..3bf82178166a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -764,9 +764,6 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) * with invalidate/truncate. */ spin_lock(&mapping->private_lock); - if (!nfs_have_writebacks(inode) && - NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) - inode_inc_iversion_raw(inode); if (likely(!PageSwapCache(req->wb_page))) { set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); From 7f08a3359a3c1e39c2a118fbbe583d8c8db14ace Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 26 Mar 2021 09:50:19 -0400 Subject: [PATCH 0375/1379] NFSv4: Add support for the NFSv4.2 "change_attr_type" attribute The change_attr_type allows the server to provide a description of how the change attribute will behave. This again will allow the client to optimise its behaviour w.r.t. attribute revalidation. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 3 +++ fs/nfs/nfs3xdr.c | 1 + fs/nfs/nfs4proc.c | 1 + fs/nfs/nfs4xdr.c | 32 ++++++++++++++++++++++++++++++++ fs/nfs/proc.c | 1 + include/linux/nfs4.h | 9 +++++++++ include/linux/nfs_fs_sb.h | 3 +++ include/linux/nfs_xdr.h | 2 ++ 8 files changed, 52 insertions(+) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 399a8eb15397..2aeb4e52a4f1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -792,6 +792,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->maxfilesize = fsinfo->maxfilesize; server->time_delta = fsinfo->time_delta; + server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ @@ -933,6 +934,8 @@ struct nfs_server *nfs_alloc_server(void) return NULL; } + server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + ida_init(&server->openowner_id); ida_init(&server->lockowner_id); pnfs_init_server(server); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index ed1c83738c30..83ad62c81fc7 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -2227,6 +2227,7 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, /* ignore properties */ result->lease_time = 0; + result->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA; return 0; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bc90f2a12d5d..6992c88a25e7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -264,6 +264,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD1_FS_LAYOUT_TYPES, FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_CLONE_BLKSIZE + | FATTR4_WORD2_CHANGE_ATTR_TYPE | FATTR4_WORD2_XATTR_SUPPORT }; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d8a1911dd39e..edac4718dec1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -153,6 +153,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, 5 /* fs layout types */ + \ 1 /* layout blksize */ + \ 1 /* clone blksize */ + \ + 1 /* change attr type */ + \ 1 /* xattr support */) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) @@ -4846,6 +4847,32 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap, return 0; } +static int decode_attr_change_attr_type(struct xdr_stream *xdr, + uint32_t *bitmap, + enum nfs4_change_attr_type *res) +{ + u32 tmp = NFS4_CHANGE_TYPE_IS_UNDEFINED; + + dprintk("%s: bitmap is %x\n", __func__, bitmap[2]); + if (bitmap[2] & FATTR4_WORD2_CHANGE_ATTR_TYPE) { + if (xdr_stream_decode_u32(xdr, &tmp)) + return -EIO; + bitmap[2] &= ~FATTR4_WORD2_CHANGE_ATTR_TYPE; + } + + switch(tmp) { + case NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR: + case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER: + case NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS: + case NFS4_CHANGE_TYPE_IS_TIME_METADATA: + *res = tmp; + break; + default: + *res = NFS4_CHANGE_TYPE_IS_UNDEFINED; + } + return 0; +} + static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) { unsigned int savep; @@ -4894,6 +4921,11 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) if (status) goto xdr_error; + status = decode_attr_change_attr_type(xdr, bitmap, + &fsinfo->change_attr_type); + if (status) + goto xdr_error; + status = decode_attr_xattrsupport(xdr, bitmap, &fsinfo->xattr_support); if (status) diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 73ab7c59d3a7..ea19dbf12301 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -91,6 +91,7 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, info->dtpref = fsinfo.tsize; info->maxfilesize = 0x7FFFFFFF; info->lease_time = 0; + info->change_attr_type = NFS4_CHANGE_TYPE_IS_TIME_METADATA; return 0; } diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 5b4c67c91f56..15004c469807 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -452,6 +452,7 @@ enum lock_type4 { #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13) +#define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) #define FATTR4_WORD2_MODE_UMASK (1UL << 17) #define FATTR4_WORD2_XATTR_SUPPORT (1UL << 18) @@ -709,6 +710,14 @@ struct nl4_server { } u; }; +enum nfs4_change_attr_type { + NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0, + NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1, + NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2, + NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3, + NFS4_CHANGE_TYPE_IS_UNDEFINED = 4, +}; + /* * Options for setxattr. These match the flags for setxattr(2). */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 6f76b32a0238..fbcdfd9f7a7f 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -180,6 +180,9 @@ struct nfs_server { #define NFS_OPTION_FSCACHE 0x00000001 /* - local caching enabled */ #define NFS_OPTION_MIGRATION 0x00000002 /* - NFSv4 migration enabled */ + enum nfs4_change_attr_type + change_attr_type;/* Description of change attribute */ + struct nfs_fsid fsid; __u64 maxfilesize; /* maximum file size */ struct timespec64 time_delta; /* smallest time granularity */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index cc29dee508f7..717ecc87c9e7 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -152,6 +152,8 @@ struct nfs_fsinfo { __u32 layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */ __u32 blksize; /* preferred pnfs io block size */ __u32 clone_blksize; /* granularity of a CLONE operation */ + enum nfs4_change_attr_type + change_attr_type; /* Info about change attr */ __u32 xattr_support; /* User xattrs supported */ }; From 6f9be83d07615e6af8838a1d489080b399f42a08 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 26 Mar 2021 11:01:19 -0400 Subject: [PATCH 0376/1379] NFS: Use information about the change attribute to optimise updates If the NFSv4.2 server supports the 'change_attr_type' attribute, then allow the client to optimise its attribute cache update strategy. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 111 ++++++++++++++++++++++++++++++++++++++-------- fs/nfs/nfs4proc.c | 20 +++++++-- 2 files changed, 110 insertions(+), 21 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index fc8e92794636..d218d164414f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1657,25 +1657,20 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle); #endif /** - * nfs_inode_attrs_need_update - check if the inode attributes need updating - * @inode: pointer to inode + * nfs_inode_attrs_cmp_generic - compare attributes * @fattr: attributes + * @inode: pointer to inode * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. - * - * To do so, the function first assumes that a more recent ctime means - * that the attributes in fattr are newer, however it also attempt to - * catch the case where ctime either didn't change, or went backwards - * (if someone reset the clock on the server) by looking at whether - * or not this RPC call was started after the inode was last updated. * Note also the check for wraparound of 'attr_gencount' * - * The function returns 'true' if it thinks the attributes in 'fattr' are - * more recent than the ones cached in the inode. - * + * The function returns '1' if it thinks the attributes in @fattr are + * more recent than the ones cached in @inode. Otherwise it returns + * the value '0'. */ -static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +static int nfs_inode_attrs_cmp_generic(const struct nfs_fattr *fattr, + const struct inode *inode) { unsigned long attr_gencount = NFS_I(inode)->attr_gencount; @@ -1683,15 +1678,93 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0; } -static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +/** + * nfs_inode_attrs_cmp_monotonic - compare attributes + * @fattr: attributes + * @inode: pointer to inode + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * We assume that the server observes monotonic semantics for + * the change attribute, so a larger value means that the attributes in + * @fattr are more recent, in which case the function returns the + * value '1'. + * A return value of '0' indicates no measurable change + * A return value of '-1' means that the attributes in @inode are + * more recent. + */ +static int nfs_inode_attrs_cmp_monotonic(const struct nfs_fattr *fattr, + const struct inode *inode) { - int ret; + s64 diff = fattr->change_attr - inode_peek_iversion_raw(inode); + if (diff > 0) + return 1; + return diff == 0 ? 0 : -1; +} + +/** + * nfs_inode_attrs_cmp_strict_monotonic - compare attributes + * @fattr: attributes + * @inode: pointer to inode + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * We assume that the server observes strictly monotonic semantics for + * the change attribute, so a larger value means that the attributes in + * @fattr are more recent, in which case the function returns the + * value '1'. + * A return value of '-1' means that the attributes in @inode are + * more recent or unchanged. + */ +static int nfs_inode_attrs_cmp_strict_monotonic(const struct nfs_fattr *fattr, + const struct inode *inode) +{ + return nfs_inode_attrs_cmp_monotonic(fattr, inode) > 0 ? 1 : -1; +} + +/** + * nfs_inode_attrs_cmp - compare attributes + * @fattr: attributes + * @inode: pointer to inode + * + * This function returns '1' if it thinks the attributes in @fattr are + * more recent than the ones cached in @inode. It returns '-1' if + * the attributes in @inode are more recent than the ones in @fattr, + * and it returns 0 if not sure. + */ +static int nfs_inode_attrs_cmp(const struct nfs_fattr *fattr, + const struct inode *inode) +{ + if (nfs_inode_attrs_cmp_generic(fattr, inode) > 0) + return 1; + switch (NFS_SERVER(inode)->change_attr_type) { + case NFS4_CHANGE_TYPE_IS_UNDEFINED: + break; + case NFS4_CHANGE_TYPE_IS_TIME_METADATA: + if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE)) + break; + return nfs_inode_attrs_cmp_monotonic(fattr, inode); + default: + if (!(fattr->valid & NFS_ATTR_FATTR_CHANGE)) + break; + return nfs_inode_attrs_cmp_strict_monotonic(fattr, inode); + } + return 0; +} + +static int nfs_refresh_inode_locked(struct inode *inode, + struct nfs_fattr *fattr) +{ + int attr_cmp = nfs_inode_attrs_cmp(fattr, inode); + int ret = 0; trace_nfs_refresh_inode_enter(inode); - if (nfs_inode_attrs_need_update(inode, fattr)) + if (attr_cmp > 0) ret = nfs_update_inode(inode, fattr); - else + else if (attr_cmp == 0) ret = nfs_check_inode_attributes(inode, fattr); trace_nfs_refresh_inode_exit(inode, ret); @@ -1776,11 +1849,13 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); */ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { + int attr_cmp = nfs_inode_attrs_cmp(fattr, inode); int status; /* Don't do a WCC update if these attributes are already stale */ - if ((fattr->valid & NFS_ATTR_FATTR) == 0 || - !nfs_inode_attrs_need_update(inode, fattr)) { + if (attr_cmp < 0) + return 0; + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !attr_cmp) { fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE | NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PREMTIME diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6992c88a25e7..0b0a48d78299 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1186,10 +1186,23 @@ nfs4_update_changeattr_locked(struct inode *inode, unsigned long timestamp, unsigned long cache_validity) { struct nfs_inode *nfsi = NFS_I(inode); + u64 change_attr = inode_peek_iversion_raw(inode); cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; - if (cinfo->atomic && cinfo->before == inode_peek_iversion_raw(inode)) { + switch (NFS_SERVER(inode)->change_attr_type) { + case NFS4_CHANGE_TYPE_IS_UNDEFINED: + break; + case NFS4_CHANGE_TYPE_IS_TIME_METADATA: + if ((s64)(change_attr - cinfo->after) > 0) + goto out; + break; + default: + if ((s64)(change_attr - cinfo->after) >= 0) + goto out; + } + + if (cinfo->atomic && cinfo->before == change_attr) { nfsi->attrtimeo_timestamp = jiffies; } else { if (S_ISDIR(inode->i_mode)) { @@ -1201,7 +1214,7 @@ nfs4_update_changeattr_locked(struct inode *inode, cache_validity |= NFS_INO_REVAL_PAGECACHE; } - if (cinfo->before != inode_peek_iversion_raw(inode)) + if (cinfo->before != change_attr) cache_validity |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR; @@ -1209,8 +1222,9 @@ nfs4_update_changeattr_locked(struct inode *inode, inode_set_iversion_raw(inode, cinfo->after); nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); - nfs_set_cache_invalid(inode, cache_validity); nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE; +out: + nfs_set_cache_invalid(inode, cache_validity); } void From 7b24dacf0840056f9c51107eddc5bd7efbd20133 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Apr 2021 09:30:21 -0400 Subject: [PATCH 0377/1379] NFS: Another inode revalidation improvement If we're trying to update the inode because a previous update left the cache in a partially unrevalidated state, then allow the update if the change attrs match. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d218d164414f..b88e9dc72eec 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1754,6 +1754,34 @@ static int nfs_inode_attrs_cmp(const struct nfs_fattr *fattr, return 0; } +/** + * nfs_inode_finish_partial_attr_update - complete a previous inode update + * @fattr: attributes + * @inode: pointer to inode + * + * Returns '1' if the last attribute update left the inode cached + * attributes in a partially unrevalidated state, and @fattr + * matches the change attribute of that partial update. + * Otherwise returns '0'. + */ +static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr, + const struct inode *inode) +{ + const unsigned long check_valid = + NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | + NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_NLINK; + unsigned long cache_validity = NFS_I(inode)->cache_validity; + + if (!(cache_validity & NFS_INO_INVALID_CHANGE) && + (cache_validity & check_valid) != 0 && + (fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && + nfs_inode_attrs_cmp_monotonic(fattr, inode) == 0) + return 1; + return 0; +} + static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { @@ -1762,7 +1790,7 @@ static int nfs_refresh_inode_locked(struct inode *inode, trace_nfs_refresh_inode_enter(inode); - if (attr_cmp > 0) + if (attr_cmp > 0 || nfs_inode_finish_partial_attr_update(fattr, inode)) ret = nfs_update_inode(inode, fattr); else if (attr_cmp == 0) ret = nfs_check_inode_attributes(inode, fattr); From 38740707c5bc1253069eb932bc6d244f80ec21f0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Apr 2021 17:56:53 +0800 Subject: [PATCH 0378/1379] f2fs: document: add description about compressed space handling User or developer may still be confused about why f2fs doesn't expose compressed space to userspace, add description about compressed space handling policy into f2fs documentation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 63c0c49b726d..992bf91eeec8 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -819,6 +819,14 @@ Compression implementation * chattr +c file * chattr +c dir; touch dir/file * mount w/ -o compress_extension=ext; touch file.ext + * mount w/ -o compress_extension=*; touch any_file + +- At this point, compression feature doesn't expose compressed space to user + directly in order to guarantee potential data updates later to the space. + Instead, the main goal is to reduce data writes to flash disk as much as + possible, resulting in extending disk life time as well as relaxing IO + congestion. Alternatively, we've added ioctl interface to reclaim compressed + space and show it to user after putting the immutable bit. Compress metadata layout:: From 453e2ff8e4ff2747acee1799e7ef959970c5cc78 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Apr 2021 17:56:18 +0800 Subject: [PATCH 0379/1379] f2fs: avoid duplicated codes for cleanup f2fs_segment_has_free_slot() was copied and modified from __next_free_blkoff(), they are almost the same, clean up to reuse common code as much as possible. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0cb1ca88d4aa..41df26292d7d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2635,22 +2635,20 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; } -static void __next_free_blkoff(struct f2fs_sb_info *sbi, - struct curseg_info *seg, block_t start) +static int __next_free_blkoff(struct f2fs_sb_info *sbi, + int segno, block_t start) { - struct seg_entry *se = get_seg_entry(sbi, seg->segno); + struct seg_entry *se = get_seg_entry(sbi, segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); unsigned long *target_map = SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; - int i, pos; + int i; for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); - - seg->next_blkoff = pos; + return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); } /* @@ -2662,26 +2660,16 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg) { if (seg->alloc_type == SSR) - __next_free_blkoff(sbi, seg, seg->next_blkoff + 1); + seg->next_blkoff = + __next_free_blkoff(sbi, seg->segno, + seg->next_blkoff + 1); else seg->next_blkoff++; } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) { - struct seg_entry *se = get_seg_entry(sbi, segno); - int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); - unsigned long *target_map = SIT_I(sbi)->tmp_map; - unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; - unsigned long *cur_map = (unsigned long *)se->cur_valid_map; - int i, pos; - - for (i = 0; i < entries; i++) - target_map[i] = ckpt_map[i] | cur_map[i]; - - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, 0); - - return pos < sbi->blocks_per_seg; + return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg; } /* @@ -2709,7 +2697,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; - __next_free_blkoff(sbi, curseg, 0); + curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); if (IS_ERR(sum_page)) { From 594b6d0428ae304e0b44457398beb458b938f005 Mon Sep 17 00:00:00 2001 From: Yi Chen Date: Tue, 13 Apr 2021 17:30:50 +0800 Subject: [PATCH 0380/1379] f2fs: fix to avoid NULL pointer dereference Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 pc : f2fs_put_page+0x1c/0x26c lr : __revoke_inmem_pages+0x544/0x75c f2fs_put_page+0x1c/0x26c __revoke_inmem_pages+0x544/0x75c __f2fs_commit_inmem_pages+0x364/0x3c0 f2fs_commit_inmem_pages+0xc8/0x1a0 f2fs_ioc_commit_atomic_write+0xa4/0x15c f2fs_ioctl+0x5b0/0x1574 file_ioctl+0x154/0x320 do_vfs_ioctl+0x164/0x740 __arm64_sys_ioctl+0x78/0xa4 el0_svc_common+0xbc/0x1d0 el0_svc_handler+0x74/0x98 el0_svc+0x8/0xc In f2fs_put_page, we access page->mapping is NULL. The root cause is: In some cases, the page refcount and ATOMIC_WRITTEN_PAGE flag miss set for page-priavte flag has been set. We add f2fs_bug_on like this: f2fs_register_inmem_page() { ... f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); f2fs_bug_on(F2FS_I_SB(inode), !IS_ATOMIC_WRITTEN_PAGE(page)); ... } The bug on stack follow link this: PC is at f2fs_register_inmem_page+0x238/0x2b4 LR is at f2fs_register_inmem_page+0x2a8/0x2b4 f2fs_register_inmem_page+0x238/0x2b4 f2fs_set_data_page_dirty+0x104/0x164 set_page_dirty+0x78/0xc8 f2fs_write_end+0x1b4/0x444 generic_perform_write+0x144/0x1cc __generic_file_write_iter+0xc4/0x174 f2fs_file_write_iter+0x2c0/0x350 __vfs_write+0x104/0x134 vfs_write+0xe8/0x19c SyS_pwrite64+0x78/0xb8 To fix this issue, let's add page refcount add page-priavte flag. The page-private flag is not cleared and needs further analysis. Signed-off-by: Chao Yu Signed-off-by: Ge Qiu Signed-off-by: Dehe Gu Signed-off-by: Yi Chen Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 41df26292d7d..6e740ecf0814 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,7 +186,10 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); + if (PagePrivate(page)) + set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + else + f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); From 6d1f2803cb6b414c2e45fa64d1fdad6b581e1e88 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 7 Apr 2021 10:56:39 -0500 Subject: [PATCH 0381/1379] remoteproc: pru: Fixup interrupt-parent logic for fw events The PRU firmware interrupt mapping logic in pru_handle_intrmap() uses of_irq_find_parent() with PRU device node to get a handle to the PRUSS Interrupt Controller at present. This logic however requires that the PRU nodes always define a interrupt-parent property. This property is neither a required/defined property as per the PRU remoteproc binding, nor is relevant from a DT node point of view without any associated interrupts. The current logic finds a wrong interrupt controller and fails to perform proper mapping without any interrupt-parent property in the PRU nodes. Fix this logic to always find and use the sibling interrupt controller. Also, while at this, fix the acquired interrupt controller device node reference properly. Fixes: c75c9fdac66e ("remoteproc: pru: Add support for PRU specific interrupt configuration") Signed-off-by: Suman Anna Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210407155641.5501-2-s-anna@ti.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/pru_rproc.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/remoteproc/pru_rproc.c b/drivers/remoteproc/pru_rproc.c index d6086f90e809..b8b3c1921fe6 100644 --- a/drivers/remoteproc/pru_rproc.c +++ b/drivers/remoteproc/pru_rproc.c @@ -284,7 +284,7 @@ static int pru_handle_intrmap(struct rproc *rproc) struct pru_rproc *pru = rproc->priv; struct pru_irq_rsc *rsc = pru->pru_interrupt_map; struct irq_fwspec fwspec; - struct device_node *irq_parent; + struct device_node *parent, *irq_parent; int i, ret = 0; /* not having pru_interrupt_map is not an error */ @@ -312,9 +312,16 @@ static int pru_handle_intrmap(struct rproc *rproc) /* * parse and fill in system event to interrupt channel and - * channel-to-host mapping + * channel-to-host mapping. The interrupt controller to be used + * for these mappings for a given PRU remoteproc is always its + * corresponding sibling PRUSS INTC node. */ - irq_parent = of_irq_find_parent(pru->dev->of_node); + parent = of_get_parent(dev_of_node(pru->dev)); + if (!parent) + return -ENODEV; + + irq_parent = of_get_child_by_name(parent, "interrupt-controller"); + of_node_put(parent); if (!irq_parent) { kfree(pru->mapped_irq); return -ENODEV; @@ -337,11 +344,13 @@ static int pru_handle_intrmap(struct rproc *rproc) goto map_fail; } } + of_node_put(irq_parent); return ret; map_fail: pru_dispose_irq_mapping(pru); + of_node_put(irq_parent); return ret; } From 1fe72bcfac087dba5ab52778e0646ed9e145cd32 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 7 Apr 2021 10:56:40 -0500 Subject: [PATCH 0382/1379] remoteproc: pru: Fix wrong success return value for fw events The irq_create_fwspec_mapping() returns a proper virq value on success and 0 upon any failure. The pru_handle_intrmap() treats this as an error and disposes all firmware event mappings correctly, but is returning this incorrect value as is, letting the pru_rproc_start() interpret it as a success and boot the PRU. Fix this by returning an error value back upon any such failure. While at this, revise the error trace to print some meaningful info about the failed event. Fixes: c75c9fdac66e ("remoteproc: pru: Add support for PRU specific interrupt configuration") Signed-off-by: Suman Anna Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210407155641.5501-3-s-anna@ti.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/pru_rproc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/pru_rproc.c b/drivers/remoteproc/pru_rproc.c index b8b3c1921fe6..ac6348cc094f 100644 --- a/drivers/remoteproc/pru_rproc.c +++ b/drivers/remoteproc/pru_rproc.c @@ -339,8 +339,10 @@ static int pru_handle_intrmap(struct rproc *rproc) pru->mapped_irq[i] = irq_create_fwspec_mapping(&fwspec); if (!pru->mapped_irq[i]) { - dev_err(dev, "failed to get virq\n"); - ret = pru->mapped_irq[i]; + dev_err(dev, "failed to get virq for fw mapping %d: event %d chnl %d host %d\n", + i, fwspec.param[0], fwspec.param[1], + fwspec.param[2]); + ret = -EINVAL; goto map_fail; } } From 880a66e026fbe6a17cd59fe0ee942bbad62a6c26 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 7 Apr 2021 10:56:41 -0500 Subject: [PATCH 0383/1379] remoteproc: pru: Fix and cleanup firmware interrupt mapping logic The PRU firmware interrupt mappings are configured and unconfigured in .start() and .stop() callbacks respectively using the variables 'evt_count' and a 'mapped_irq' pointer. These variables are modified only during these callbacks but are not re-initialized/reset properly during unwind or failure paths. These stale values caused a kernel crash while stopping a PRU remoteproc running a different firmware with no events on a subsequent run after a previous run that was running a firmware with events. Fix this crash by ensuring that the evt_count is 0 and the mapped_irq pointer is set to NULL in pru_dispose_irq_mapping(). Also, reset these variables properly during any failures in the .start() callback. While at this, the pru_dispose_irq_mapping() callsites are all made to look the same, moving any conditional logic to inside the function. Reviewed-by: Mathieu Poirier Fixes: c75c9fdac66e ("remoteproc: pru: Add support for PRU specific interrupt configuration") Reported-by: Vignesh Raghavendra Signed-off-by: Suman Anna Link: https://lore.kernel.org/r/20210407155641.5501-4-s-anna@ti.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/pru_rproc.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/remoteproc/pru_rproc.c b/drivers/remoteproc/pru_rproc.c index ac6348cc094f..b13ff4283232 100644 --- a/drivers/remoteproc/pru_rproc.c +++ b/drivers/remoteproc/pru_rproc.c @@ -266,12 +266,17 @@ static void pru_rproc_create_debug_entries(struct rproc *rproc) static void pru_dispose_irq_mapping(struct pru_rproc *pru) { - while (pru->evt_count--) { + if (!pru->mapped_irq) + return; + + while (pru->evt_count) { + pru->evt_count--; if (pru->mapped_irq[pru->evt_count] > 0) irq_dispose_mapping(pru->mapped_irq[pru->evt_count]); } kfree(pru->mapped_irq); + pru->mapped_irq = NULL; } /* @@ -307,8 +312,10 @@ static int pru_handle_intrmap(struct rproc *rproc) pru->evt_count = rsc->num_evts; pru->mapped_irq = kcalloc(pru->evt_count, sizeof(unsigned int), GFP_KERNEL); - if (!pru->mapped_irq) + if (!pru->mapped_irq) { + pru->evt_count = 0; return -ENOMEM; + } /* * parse and fill in system event to interrupt channel and @@ -317,13 +324,19 @@ static int pru_handle_intrmap(struct rproc *rproc) * corresponding sibling PRUSS INTC node. */ parent = of_get_parent(dev_of_node(pru->dev)); - if (!parent) + if (!parent) { + kfree(pru->mapped_irq); + pru->mapped_irq = NULL; + pru->evt_count = 0; return -ENODEV; + } irq_parent = of_get_child_by_name(parent, "interrupt-controller"); of_node_put(parent); if (!irq_parent) { kfree(pru->mapped_irq); + pru->mapped_irq = NULL; + pru->evt_count = 0; return -ENODEV; } @@ -398,8 +411,7 @@ static int pru_rproc_stop(struct rproc *rproc) pru_control_write_reg(pru, PRU_CTRL_CTRL, val); /* dispose irq mapping - new firmware can provide new mapping */ - if (pru->mapped_irq) - pru_dispose_irq_mapping(pru); + pru_dispose_irq_mapping(pru); return 0; } From 859fd2418b4ba00bba56263b8abc7f54f4a7260f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 19 Mar 2021 09:41:00 +0000 Subject: [PATCH 0384/1379] remoteproc: qcom: wcss: Fix return value check in q6v5_wcss_init_mmio() In case of error, the function devm_ioremap() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Fixes: 0af65b9b915e ("remoteproc: qcom: wcss: Add non pas wcss Q6 support for QCS404") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Link: https://lore.kernel.org/r/20210319094100.4185044-1-weiyongjun1@huawei.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_wcss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index 71ec1a451e35..6f7d940d4431 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -829,8 +829,8 @@ static int q6v5_wcss_init_mmio(struct q6v5_wcss *wcss, res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qdsp6"); wcss->reg_base = devm_ioremap(&pdev->dev, res->start, resource_size(res)); - if (IS_ERR(wcss->reg_base)) - return PTR_ERR(wcss->reg_base); + if (!wcss->reg_base) + return -ENOMEM; if (wcss->version == WCSS_IPQ8074) { res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rmb"); From 2254f405d199385e133fe730a4f3bab3f06a050f Mon Sep 17 00:00:00 2001 From: Giulio Benetti Date: Tue, 13 Apr 2021 18:46:30 -0700 Subject: [PATCH 0385/1379] dt-bindings: Add Hycon Technology vendor prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update Documentation/devicetree/bindings/vendor-prefixes.yaml to include "hycon" as a vendor prefix for "Hycon Technology". Company website: https://www.hycontek.com/ Signed-off-by: Giulio Benetti Reviewed-by: Jonathan Neuschäfer Acked-by: Rob Herring Link: https://lore.kernel.org/r/20210413144446.2277817-2-giulio.benetti@benettiengineering.com Signed-off-by: Dmitry Torokhov --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index f6064d84a424..944b02bb96d7 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -489,6 +489,8 @@ patternProperties: description: Shenzhen Hugsun Technology Co. Ltd. "^hwacom,.*": description: HwaCom Systems Inc. + "^hycon,.*": + description: Hycon Technology Corp. "^hydis,.*": description: Hydis Technologies "^hyundai,.*": From d19989a2d9d79c226bb5c094d1cc9f0c8959e9f4 Mon Sep 17 00:00:00 2001 From: Giulio Benetti Date: Tue, 13 Apr 2021 18:46:41 -0700 Subject: [PATCH 0386/1379] dt-bindings: touchscreen: Add HY46XX bindings This adds device tree bindings for the Hycon HY46XX touchscreen series. Signed-off-by: Giulio Benetti Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20210413144446.2277817-3-giulio.benetti@benettiengineering.com Signed-off-by: Dmitry Torokhov --- .../input/touchscreen/hycon,hy46xx.yaml | 119 ++++++++++++++++++ MAINTAINERS | 6 + 2 files changed, 125 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml diff --git a/Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml b/Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml new file mode 100644 index 000000000000..942562f1e45b --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml @@ -0,0 +1,119 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/input/touchscreen/hycon,hy46xx.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Hycon HY46XX series touchscreen controller bindings + +description: | + There are 6 variants of the chip for various touch panel sizes and cover lens material + Glass: 0.3mm--4.0mm + PET/PMMA: 0.2mm--2.0mm + HY4613(B)-N048 < 6" + HY4614(B)-N068 7" .. 10.1" + HY4621-NS32 < 5" + HY4623-NS48 5.1" .. 7" + Glass: 0.3mm--8.0mm + PET/PMMA: 0.2mm--4.0mm + HY4633(B)-N048 < 6" + HY4635(B)-N048 < 7" .. 10.1" + +maintainers: + - Giulio Benetti + +allOf: + - $ref: touchscreen.yaml# + +properties: + compatible: + enum: + - hycon,hy4613 + - hycon,hy4614 + - hycon,hy4621 + - hycon,hy4623 + - hycon,hy4633 + - hycon,hy4635 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + reset-gpios: + maxItems: 1 + + vcc-supply: true + + hycon,threshold: + description: Allows setting the sensitivity in the range from 0 to 255. + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 255 + + hycon,glove-enable: + type: boolean + description: Allows enabling glove setting. + + hycon,report-speed-hz: + description: Allows setting the report speed in Hertz. + minimum: 1 + maximum: 255 + + hycon,noise-filter-enable: + type: boolean + description: Allows enabling power noise filter. + + hycon,filter-data: + description: Allows setting how many samples throw before reporting touch + in the range from 0 to 5. + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 5 + + hycon,gain: + description: Allows setting the sensitivity distance in the range from 0 to 5. + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 5 + + hycon,edge-offset: + description: Allows setting the edge compensation in the range from 0 to 16. + $ref: /schemas/types.yaml#/definitions/uint32 + minimum: 0 + maximum: 16 + + touchscreen-size-x: true + touchscreen-size-y: true + touchscreen-fuzz-x: true + touchscreen-fuzz-y: true + touchscreen-inverted-x: true + touchscreen-inverted-y: true + touchscreen-swapped-x-y: true + interrupt-controller: true + +additionalProperties: false + +required: + - compatible + - reg + - interrupts + +examples: + - | + #include + #include + i2c { + #address-cells = <1>; + #size-cells = <0>; + touchscreen@1c { + compatible = "hycon,hy4633"; + reg = <0x1c>; + interrupt-parent = <&gpio2>; + interrupts = <5 IRQ_TYPE_EDGE_FALLING>; + reset-gpios = <&gpio2 6 GPIO_ACTIVE_LOW>; + }; + }; + +... diff --git a/MAINTAINERS b/MAINTAINERS index 9e876927c60d..a27c8efaa681 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8244,6 +8244,12 @@ S: Maintained F: mm/hwpoison-inject.c F: mm/memory-failure.c +HYCON HY46XX TOUCHSCREEN SUPPORT +M: Giulio Benetti +L: linux-input@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml + HYGON PROCESSOR SUPPORT M: Pu Wen L: linux-kernel@vger.kernel.org From aa2f62cf211a0985c14fd78a17d55296769698d6 Mon Sep 17 00:00:00 2001 From: Giulio Benetti Date: Tue, 13 Apr 2021 18:46:53 -0700 Subject: [PATCH 0387/1379] Input: add driver for the Hycon HY46XX touchpanel series This patch adds support for Hycon HY46XX. Signed-off-by: Giulio Benetti Link: https://lore.kernel.org/r/20210413144446.2277817-4-giulio.benetti@benettiengineering.com Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 1 + drivers/input/touchscreen/Kconfig | 11 + drivers/input/touchscreen/Makefile | 1 + drivers/input/touchscreen/hycon-hy46xx.c | 591 +++++++++++++++++++++++ 4 files changed, 604 insertions(+) create mode 100644 drivers/input/touchscreen/hycon-hy46xx.c diff --git a/MAINTAINERS b/MAINTAINERS index a27c8efaa681..e2a4d7cb38a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8249,6 +8249,7 @@ M: Giulio Benetti L: linux-input@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml +F: drivers/input/touchscreen/hy46xx.c HYGON PROCESSOR SUPPORT M: Pu Wen diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index 5f76192164c1..ad454cd2855a 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -411,6 +411,17 @@ config TOUCHSCREEN_HIDEEP To compile this driver as a module, choose M here : the module will be called hideep_ts. +config TOUCHSCREEN_HYCON_HY46XX + tristate "Hycon hy46xx touchscreen support" + depends on I2C + help + Say Y here if you have a touchscreen using Hycon hy46xx + + If unsure, say N. + + To compile this driver as a module, choose M here: the + module will be called hycon-hy46xx. + config TOUCHSCREEN_ILI210X tristate "Ilitek ILI210X based touchscreen" depends on I2C diff --git a/drivers/input/touchscreen/Makefile b/drivers/input/touchscreen/Makefile index 92267fe53cdb..7d34100f7f22 100644 --- a/drivers/input/touchscreen/Makefile +++ b/drivers/input/touchscreen/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_TOUCHSCREEN_DA9052) += da9052_tsi.o obj-$(CONFIG_TOUCHSCREEN_DYNAPRO) += dynapro.o obj-$(CONFIG_TOUCHSCREEN_EDT_FT5X06) += edt-ft5x06.o obj-$(CONFIG_TOUCHSCREEN_HAMPSHIRE) += hampshire.o +obj-$(CONFIG_TOUCHSCREEN_HYCON_HY46XX) += hycon-hy46xx.o obj-$(CONFIG_TOUCHSCREEN_GUNZE) += gunze.o obj-$(CONFIG_TOUCHSCREEN_EETI) += eeti_ts.o obj-$(CONFIG_TOUCHSCREEN_EKTF2127) += ektf2127.o diff --git a/drivers/input/touchscreen/hycon-hy46xx.c b/drivers/input/touchscreen/hycon-hy46xx.c new file mode 100644 index 000000000000..891d0430083e --- /dev/null +++ b/drivers/input/touchscreen/hycon-hy46xx.c @@ -0,0 +1,591 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 + * Author(s): Giulio Benetti + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define HY46XX_CHKSUM_CODE 0x1 +#define HY46XX_FINGER_NUM 0x2 +#define HY46XX_CHKSUM_LEN 0x7 +#define HY46XX_THRESHOLD 0x80 +#define HY46XX_GLOVE_EN 0x84 +#define HY46XX_REPORT_SPEED 0x88 +#define HY46XX_PWR_NOISE_EN 0x89 +#define HY46XX_FILTER_DATA 0x8A +#define HY46XX_GAIN 0x92 +#define HY46XX_EDGE_OFFSET 0x93 +#define HY46XX_RX_NR_USED 0x94 +#define HY46XX_TX_NR_USED 0x95 +#define HY46XX_PWR_MODE 0xA5 +#define HY46XX_FW_VERSION 0xA6 +#define HY46XX_LIB_VERSION 0xA7 +#define HY46XX_TP_INFO 0xA8 +#define HY46XX_TP_CHIP_ID 0xA9 +#define HY46XX_BOOT_VER 0xB0 + +#define HY46XX_TPLEN 0x6 +#define HY46XX_REPORT_PKT_LEN 0x44 + +#define HY46XX_MAX_SUPPORTED_POINTS 11 + +#define TOUCH_EVENT_DOWN 0x00 +#define TOUCH_EVENT_UP 0x01 +#define TOUCH_EVENT_CONTACT 0x02 +#define TOUCH_EVENT_RESERVED 0x03 + +struct hycon_hy46xx_data { + struct i2c_client *client; + struct input_dev *input; + struct touchscreen_properties prop; + struct regulator *vcc; + + struct gpio_desc *reset_gpio; + + struct mutex mutex; + struct regmap *regmap; + + int threshold; + bool glove_enable; + int report_speed; + bool noise_filter_enable; + int filter_data; + int gain; + int edge_offset; + int rx_number_used; + int tx_number_used; + int power_mode; + int fw_version; + int lib_version; + int tp_information; + int tp_chip_id; + int bootloader_version; +}; + +static const struct regmap_config hycon_hy46xx_i2c_regmap_config = { + .reg_bits = 8, + .val_bits = 8, +}; + +static bool hycon_hy46xx_check_checksum(struct hycon_hy46xx_data *tsdata, u8 *buf) +{ + u8 chksum = 0; + int i; + + for (i = 2; i < buf[HY46XX_CHKSUM_LEN]; i++) + chksum += buf[i]; + + if (chksum == buf[HY46XX_CHKSUM_CODE]) + return true; + + dev_err_ratelimited(&tsdata->client->dev, + "checksum error: 0x%02x expected, got 0x%02x\n", + chksum, buf[HY46XX_CHKSUM_CODE]); + + return false; +} + +static irqreturn_t hycon_hy46xx_isr(int irq, void *dev_id) +{ + struct hycon_hy46xx_data *tsdata = dev_id; + struct device *dev = &tsdata->client->dev; + u8 rdbuf[HY46XX_REPORT_PKT_LEN]; + int i, x, y, id; + int error; + + memset(rdbuf, 0, sizeof(rdbuf)); + + error = regmap_bulk_read(tsdata->regmap, 0, rdbuf, sizeof(rdbuf)); + if (error) { + dev_err_ratelimited(dev, "Unable to fetch data, error: %d\n", + error); + goto out; + } + + if (!hycon_hy46xx_check_checksum(tsdata, rdbuf)) + goto out; + + for (i = 0; i < HY46XX_MAX_SUPPORTED_POINTS; i++) { + u8 *buf = &rdbuf[3 + (HY46XX_TPLEN * i)]; + int type = buf[0] >> 6; + + if (type == TOUCH_EVENT_RESERVED) + continue; + + x = get_unaligned_be16(buf) & 0x0fff; + y = get_unaligned_be16(buf + 2) & 0x0fff; + + id = buf[2] >> 4; + + input_mt_slot(tsdata->input, id); + if (input_mt_report_slot_state(tsdata->input, MT_TOOL_FINGER, + type != TOUCH_EVENT_UP)) + touchscreen_report_pos(tsdata->input, &tsdata->prop, + x, y, true); + } + + input_mt_report_pointer_emulation(tsdata->input, false); + input_sync(tsdata->input); + +out: + return IRQ_HANDLED; +} + +struct hycon_hy46xx_attribute { + struct device_attribute dattr; + size_t field_offset; + u8 address; + u8 limit_low; + u8 limit_high; +}; + +#define HYCON_ATTR_U8(_field, _mode, _address, _limit_low, _limit_high) \ + struct hycon_hy46xx_attribute hycon_hy46xx_attr_##_field = { \ + .dattr = __ATTR(_field, _mode, \ + hycon_hy46xx_setting_show, \ + hycon_hy46xx_setting_store), \ + .field_offset = offsetof(struct hycon_hy46xx_data, _field), \ + .address = _address, \ + .limit_low = _limit_low, \ + .limit_high = _limit_high, \ + } + +#define HYCON_ATTR_BOOL(_field, _mode, _address) \ + struct hycon_hy46xx_attribute hycon_hy46xx_attr_##_field = { \ + .dattr = __ATTR(_field, _mode, \ + hycon_hy46xx_setting_show, \ + hycon_hy46xx_setting_store), \ + .field_offset = offsetof(struct hycon_hy46xx_data, _field), \ + .address = _address, \ + .limit_low = false, \ + .limit_high = true, \ + } + +static ssize_t hycon_hy46xx_setting_show(struct device *dev, + struct device_attribute *dattr, char *buf) +{ + struct i2c_client *client = to_i2c_client(dev); + struct hycon_hy46xx_data *tsdata = i2c_get_clientdata(client); + struct hycon_hy46xx_attribute *attr = + container_of(dattr, struct hycon_hy46xx_attribute, dattr); + u8 *field = (u8 *)tsdata + attr->field_offset; + size_t count = 0; + int error = 0; + int val; + + mutex_lock(&tsdata->mutex); + + error = regmap_read(tsdata->regmap, attr->address, &val); + if (error < 0) { + dev_err(&tsdata->client->dev, + "Failed to fetch attribute %s, error %d\n", + dattr->attr.name, error); + goto out; + } + + if (val != *field) { + dev_warn(&tsdata->client->dev, + "%s: read (%d) and stored value (%d) differ\n", + dattr->attr.name, val, *field); + *field = val; + } + + count = scnprintf(buf, PAGE_SIZE, "%d\n", val); + +out: + mutex_unlock(&tsdata->mutex); + return error ?: count; +} + +static ssize_t hycon_hy46xx_setting_store(struct device *dev, + struct device_attribute *dattr, + const char *buf, size_t count) +{ + struct i2c_client *client = to_i2c_client(dev); + struct hycon_hy46xx_data *tsdata = i2c_get_clientdata(client); + struct hycon_hy46xx_attribute *attr = + container_of(dattr, struct hycon_hy46xx_attribute, dattr); + u8 *field = (u8 *)tsdata + attr->field_offset; + unsigned int val; + int error; + + mutex_lock(&tsdata->mutex); + + error = kstrtouint(buf, 0, &val); + if (error) + goto out; + + if (val < attr->limit_low || val > attr->limit_high) { + error = -ERANGE; + goto out; + } + + error = regmap_write(tsdata->regmap, attr->address, val); + if (error < 0) { + dev_err(&tsdata->client->dev, + "Failed to update attribute %s, error: %d\n", + dattr->attr.name, error); + goto out; + } + *field = val; + +out: + mutex_unlock(&tsdata->mutex); + return error ?: count; +} + +static HYCON_ATTR_U8(threshold, 0644, HY46XX_THRESHOLD, 0, 255); +static HYCON_ATTR_BOOL(glove_enable, 0644, HY46XX_GLOVE_EN); +static HYCON_ATTR_U8(report_speed, 0644, HY46XX_REPORT_SPEED, 0, 255); +static HYCON_ATTR_BOOL(noise_filter_enable, 0644, HY46XX_PWR_NOISE_EN); +static HYCON_ATTR_U8(filter_data, 0644, HY46XX_FILTER_DATA, 0, 5); +static HYCON_ATTR_U8(gain, 0644, HY46XX_GAIN, 0, 5); +static HYCON_ATTR_U8(edge_offset, 0644, HY46XX_EDGE_OFFSET, 0, 5); +static HYCON_ATTR_U8(fw_version, 0444, HY46XX_FW_VERSION, 0, 255); +static HYCON_ATTR_U8(lib_version, 0444, HY46XX_LIB_VERSION, 0, 255); +static HYCON_ATTR_U8(tp_information, 0444, HY46XX_TP_INFO, 0, 255); +static HYCON_ATTR_U8(tp_chip_id, 0444, HY46XX_TP_CHIP_ID, 0, 255); +static HYCON_ATTR_U8(bootloader_version, 0444, HY46XX_BOOT_VER, 0, 255); + +static struct attribute *hycon_hy46xx_attrs[] = { + &hycon_hy46xx_attr_threshold.dattr.attr, + &hycon_hy46xx_attr_glove_enable.dattr.attr, + &hycon_hy46xx_attr_report_speed.dattr.attr, + &hycon_hy46xx_attr_noise_filter_enable.dattr.attr, + &hycon_hy46xx_attr_filter_data.dattr.attr, + &hycon_hy46xx_attr_gain.dattr.attr, + &hycon_hy46xx_attr_edge_offset.dattr.attr, + &hycon_hy46xx_attr_fw_version.dattr.attr, + &hycon_hy46xx_attr_lib_version.dattr.attr, + &hycon_hy46xx_attr_tp_information.dattr.attr, + &hycon_hy46xx_attr_tp_chip_id.dattr.attr, + &hycon_hy46xx_attr_bootloader_version.dattr.attr, + NULL +}; + +static const struct attribute_group hycon_hy46xx_attr_group = { + .attrs = hycon_hy46xx_attrs, +}; + +static void hycon_hy46xx_get_defaults(struct device *dev, struct hycon_hy46xx_data *tsdata) +{ + bool val_bool; + int error; + u32 val; + + error = device_property_read_u32(dev, "hycon,threshold", &val); + if (!error) { + error = regmap_write(tsdata->regmap, HY46XX_THRESHOLD, val); + if (error < 0) + goto out; + + tsdata->threshold = val; + } + + val_bool = device_property_read_bool(dev, "hycon,glove-enable"); + error = regmap_write(tsdata->regmap, HY46XX_GLOVE_EN, val_bool); + if (error < 0) + goto out; + tsdata->glove_enable = val_bool; + + error = device_property_read_u32(dev, "hycon,report-speed-hz", &val); + if (!error) { + error = regmap_write(tsdata->regmap, HY46XX_REPORT_SPEED, val); + if (error < 0) + goto out; + + tsdata->report_speed = val; + } + + val_bool = device_property_read_bool(dev, "hycon,noise-filter-enable"); + error = regmap_write(tsdata->regmap, HY46XX_PWR_NOISE_EN, val_bool); + if (error < 0) + goto out; + tsdata->noise_filter_enable = val_bool; + + error = device_property_read_u32(dev, "hycon,filter-data", &val); + if (!error) { + error = regmap_write(tsdata->regmap, HY46XX_FILTER_DATA, val); + if (error < 0) + goto out; + + tsdata->filter_data = val; + } + + error = device_property_read_u32(dev, "hycon,gain", &val); + if (!error) { + error = regmap_write(tsdata->regmap, HY46XX_GAIN, val); + if (error < 0) + goto out; + + tsdata->gain = val; + } + + error = device_property_read_u32(dev, "hycon,edge-offset", &val); + if (!error) { + error = regmap_write(tsdata->regmap, HY46XX_EDGE_OFFSET, val); + if (error < 0) + goto out; + + tsdata->edge_offset = val; + } + + return; +out: + dev_err(&tsdata->client->dev, "Failed to set default settings"); +} + +static void hycon_hy46xx_get_parameters(struct hycon_hy46xx_data *tsdata) +{ + int error; + u32 val; + + error = regmap_read(tsdata->regmap, HY46XX_THRESHOLD, &val); + if (error < 0) + goto out; + tsdata->threshold = val; + + error = regmap_read(tsdata->regmap, HY46XX_GLOVE_EN, &val); + if (error < 0) + goto out; + tsdata->glove_enable = val; + + error = regmap_read(tsdata->regmap, HY46XX_REPORT_SPEED, &val); + if (error < 0) + goto out; + tsdata->report_speed = val; + + error = regmap_read(tsdata->regmap, HY46XX_PWR_NOISE_EN, &val); + if (error < 0) + goto out; + tsdata->noise_filter_enable = val; + + error = regmap_read(tsdata->regmap, HY46XX_FILTER_DATA, &val); + if (error < 0) + goto out; + tsdata->filter_data = val; + + error = regmap_read(tsdata->regmap, HY46XX_GAIN, &val); + if (error < 0) + goto out; + tsdata->gain = val; + + error = regmap_read(tsdata->regmap, HY46XX_EDGE_OFFSET, &val); + if (error < 0) + goto out; + tsdata->edge_offset = val; + + error = regmap_read(tsdata->regmap, HY46XX_RX_NR_USED, &val); + if (error < 0) + goto out; + tsdata->rx_number_used = val; + + error = regmap_read(tsdata->regmap, HY46XX_TX_NR_USED, &val); + if (error < 0) + goto out; + tsdata->tx_number_used = val; + + error = regmap_read(tsdata->regmap, HY46XX_PWR_MODE, &val); + if (error < 0) + goto out; + tsdata->power_mode = val; + + error = regmap_read(tsdata->regmap, HY46XX_FW_VERSION, &val); + if (error < 0) + goto out; + tsdata->fw_version = val; + + error = regmap_read(tsdata->regmap, HY46XX_LIB_VERSION, &val); + if (error < 0) + goto out; + tsdata->lib_version = val; + + error = regmap_read(tsdata->regmap, HY46XX_TP_INFO, &val); + if (error < 0) + goto out; + tsdata->tp_information = val; + + error = regmap_read(tsdata->regmap, HY46XX_TP_CHIP_ID, &val); + if (error < 0) + goto out; + tsdata->tp_chip_id = val; + + error = regmap_read(tsdata->regmap, HY46XX_BOOT_VER, &val); + if (error < 0) + goto out; + tsdata->bootloader_version = val; + + return; +out: + dev_err(&tsdata->client->dev, "Failed to read default settings"); +} + +static void hycon_hy46xx_disable_regulator(void *arg) +{ + struct hycon_hy46xx_data *data = arg; + + regulator_disable(data->vcc); +} + +static int hycon_hy46xx_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct hycon_hy46xx_data *tsdata; + struct input_dev *input; + int error; + + dev_dbg(&client->dev, "probing for HYCON HY46XX I2C\n"); + + tsdata = devm_kzalloc(&client->dev, sizeof(*tsdata), GFP_KERNEL); + if (!tsdata) + return -ENOMEM; + + tsdata->vcc = devm_regulator_get(&client->dev, "vcc"); + if (IS_ERR(tsdata->vcc)) { + error = PTR_ERR(tsdata->vcc); + if (error != -EPROBE_DEFER) + dev_err(&client->dev, + "failed to request regulator: %d\n", error); + return error; + } + + error = regulator_enable(tsdata->vcc); + if (error < 0) { + dev_err(&client->dev, "failed to enable vcc: %d\n", error); + return error; + } + + error = devm_add_action_or_reset(&client->dev, + hycon_hy46xx_disable_regulator, + tsdata); + if (error) + return error; + + tsdata->reset_gpio = devm_gpiod_get_optional(&client->dev, + "reset", GPIOD_OUT_LOW); + if (IS_ERR(tsdata->reset_gpio)) { + error = PTR_ERR(tsdata->reset_gpio); + dev_err(&client->dev, + "Failed to request GPIO reset pin, error %d\n", error); + return error; + } + + if (tsdata->reset_gpio) { + usleep_range(5000, 6000); + gpiod_set_value_cansleep(tsdata->reset_gpio, 1); + usleep_range(5000, 6000); + gpiod_set_value_cansleep(tsdata->reset_gpio, 0); + msleep(1000); + } + + input = devm_input_allocate_device(&client->dev); + if (!input) { + dev_err(&client->dev, "failed to allocate input device.\n"); + return -ENOMEM; + } + + mutex_init(&tsdata->mutex); + tsdata->client = client; + tsdata->input = input; + + tsdata->regmap = devm_regmap_init_i2c(client, + &hycon_hy46xx_i2c_regmap_config); + if (IS_ERR(tsdata->regmap)) { + dev_err(&client->dev, "regmap allocation failed\n"); + return PTR_ERR(tsdata->regmap); + } + + hycon_hy46xx_get_defaults(&client->dev, tsdata); + hycon_hy46xx_get_parameters(tsdata); + + input->name = "Hycon Capacitive Touch"; + input->id.bustype = BUS_I2C; + input->dev.parent = &client->dev; + + input_set_abs_params(input, ABS_MT_POSITION_X, 0, -1, 0, 0); + input_set_abs_params(input, ABS_MT_POSITION_Y, 0, -1, 0, 0); + + touchscreen_parse_properties(input, true, &tsdata->prop); + + error = input_mt_init_slots(input, HY46XX_MAX_SUPPORTED_POINTS, + INPUT_MT_DIRECT); + if (error) { + dev_err(&client->dev, "Unable to init MT slots.\n"); + return error; + } + + i2c_set_clientdata(client, tsdata); + + error = devm_request_threaded_irq(&client->dev, client->irq, + NULL, hycon_hy46xx_isr, IRQF_ONESHOT, + client->name, tsdata); + if (error) { + dev_err(&client->dev, "Unable to request touchscreen IRQ.\n"); + return error; + } + + error = devm_device_add_group(&client->dev, &hycon_hy46xx_attr_group); + if (error) + return error; + + error = input_register_device(input); + if (error) + return error; + + dev_dbg(&client->dev, + "HYCON HY46XX initialized: IRQ %d, Reset pin %d.\n", + client->irq, + tsdata->reset_gpio ? desc_to_gpio(tsdata->reset_gpio) : -1); + + return 0; +} + +static const struct i2c_device_id hycon_hy46xx_id[] = { + { .name = "hy4613" }, + { .name = "hy4614" }, + { .name = "hy4621" }, + { .name = "hy4623" }, + { .name = "hy4633" }, + { .name = "hy4635" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(i2c, hycon_hy46xx_id); + +static const struct of_device_id hycon_hy46xx_of_match[] = { + { .compatible = "hycon,hy4613" }, + { .compatible = "hycon,hy4614" }, + { .compatible = "hycon,hy4621" }, + { .compatible = "hycon,hy4623" }, + { .compatible = "hycon,hy4633" }, + { .compatible = "hycon,hy4635" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, hycon_hy46xx_of_match); + +static struct i2c_driver hycon_hy46xx_driver = { + .driver = { + .name = "hycon_hy46xx", + .of_match_table = hycon_hy46xx_of_match, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + }, + .id_table = hycon_hy46xx_id, + .probe = hycon_hy46xx_probe, +}; + +module_i2c_driver(hycon_hy46xx_driver); + +MODULE_AUTHOR("Giulio Benetti "); +MODULE_DESCRIPTION("HYCON HY46XX I2C Touchscreen Driver"); +MODULE_LICENSE("GPL v2"); From 7a64a5c2e04a9613425b76b0294fa75607cdd513 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 8 Apr 2021 22:34:56 +0530 Subject: [PATCH 0388/1379] dt-bindings: remoteproc: qcom: pas: Add binding for SDX55 Add devicetree binding for SDX55 remoteproc. Acked-by: Rob Herring Cc: Rob Herring Cc: devicetree@vger.kernel.org Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20210408170457.91409-15-manivannan.sadhasivam@linaro.org Signed-off-by: Bjorn Andersson --- Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt index 1c330a8941f9..229f908fd831 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt @@ -18,6 +18,7 @@ on the Qualcomm ADSP Hexagon core. "qcom,sc7180-mpss-pas" "qcom,sdm845-adsp-pas" "qcom,sdm845-cdsp-pas" + "qcom,sdx55-mpss-pas" "qcom,sm8150-adsp-pas" "qcom,sm8150-cdsp-pas" "qcom,sm8150-mpss-pas" @@ -61,6 +62,7 @@ on the Qualcomm ADSP Hexagon core. must be "wdog", "fatal", "ready", "handover", "stop-ack" qcom,qcs404-wcss-pas: qcom,sc7180-mpss-pas: + qcom,sdx55-mpss-pas: qcom,sm8150-mpss-pas: qcom,sm8350-mpss-pas: must be "wdog", "fatal", "ready", "handover", "stop-ack", @@ -128,6 +130,8 @@ on the Qualcomm ADSP Hexagon core. qcom,sm8150-mpss-pas: qcom,sm8350-mpss-pas: must be "cx", "load_state", "mss" + qcom,sdx55-mpss-pas: + must be "cx", "mss" qcom,sm8250-adsp-pas: qcom,sm8350-adsp-pas: qcom,sm8150-slpi-pas: From 3fdba9d27cc68a4d561da80bfecd4f73239b9a86 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Thu, 8 Apr 2021 22:42:11 +0530 Subject: [PATCH 0389/1379] remoteproc: qcom: pas: Add modem support for SDX55 Add remoteproc support for Hexagon modem found on the Qualcomm SDX55 platform. Signed-off-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20210408171211.92141-1-manivannan.sadhasivam@linaro.org Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_pas.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index ef85b5511dc9..b921fc26cd04 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -785,6 +785,22 @@ static const struct adsp_data wcss_resource_init = { .ssctl_id = 0x12, }; +static const struct adsp_data sdx55_mpss_resource = { + .crash_reason_smem = 421, + .firmware_name = "modem.mdt", + .pas_id = 4, + .has_aggre2_clk = false, + .auto_boot = true, + .proxy_pd_names = (char*[]){ + "cx", + "mss", + NULL + }, + .ssr_name = "mpss", + .sysmon_name = "modem", + .ssctl_id = 0x22, +}; + static const struct of_device_id adsp_of_match[] = { { .compatible = "qcom,msm8974-adsp-pil", .data = &adsp_resource_init}, { .compatible = "qcom,msm8996-adsp-pil", .data = &adsp_resource_init}, @@ -797,6 +813,7 @@ static const struct of_device_id adsp_of_match[] = { { .compatible = "qcom,sc7180-mpss-pas", .data = &mpss_resource_init}, { .compatible = "qcom,sdm845-adsp-pas", .data = &adsp_resource_init}, { .compatible = "qcom,sdm845-cdsp-pas", .data = &cdsp_resource_init}, + { .compatible = "qcom,sdx55-mpss-pas", .data = &sdx55_mpss_resource}, { .compatible = "qcom,sm8150-adsp-pas", .data = &sm8150_adsp_resource}, { .compatible = "qcom,sm8150-cdsp-pas", .data = &sm8150_cdsp_resource}, { .compatible = "qcom,sm8150-mpss-pas", .data = &mpss_resource_init}, From ca0e89406ba18e9ba98d28413b50469f631bb583 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 26 Mar 2021 02:47:41 +0000 Subject: [PATCH 0390/1379] remoteproc: qcom: wcss: Fix wrong pointer passed to PTR_ERR() PTR_ERR should access the value just tested by IS_ERR, otherwise the wrong error code will be returned. This commit fix it by return 'ret' directly. Reviewed-by: Dan Carpenter Fixes: 0af65b9b915e ("remoteproc: qcom: wcss: Add non pas wcss Q6 support for QCS404") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Link: https://lore.kernel.org/r/20210326024741.841267-1-weiyongjun1@huawei.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_wcss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index 6f7d940d4431..eab911dabb0b 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -972,7 +972,7 @@ static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) ret = PTR_ERR(wcss->qdsp6ss_axim_cbcr); if (ret != -EPROBE_DEFER) dev_err(wcss->dev, "failed to get axim cbcr clk\n"); - return PTR_ERR(wcss->qdsp6ss_abhm_cbcr); + return ret; } wcss->lcc_bcr_sleep = devm_clk_get(wcss->dev, "lcc_bcr_sleep"); From 6353da47ee0082286e4d4a7dc4b38d4ee5ea4cb4 Mon Sep 17 00:00:00 2001 From: Junlin Yang Date: Thu, 8 Apr 2021 22:33:22 +0800 Subject: [PATCH 0391/1379] remoteproc: qcom: wcss: Remove unnecessary PTR_ERR() Remove unnecessary PTR_ERR(), it has been assigned to ret before, so return ret directly. Signed-off-by: Junlin Yang Link: https://lore.kernel.org/r/20210408143322.1647-1-angkery@163.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/qcom_q6v5_wcss.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/remoteproc/qcom_q6v5_wcss.c b/drivers/remoteproc/qcom_q6v5_wcss.c index eab911dabb0b..20d50ec7eff1 100644 --- a/drivers/remoteproc/qcom_q6v5_wcss.c +++ b/drivers/remoteproc/qcom_q6v5_wcss.c @@ -913,7 +913,7 @@ static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) ret = PTR_ERR(wcss->gcc_abhs_cbcr); if (ret != -EPROBE_DEFER) dev_err(wcss->dev, "failed to get gcc abhs clock"); - return PTR_ERR(wcss->gcc_abhs_cbcr); + return ret; } wcss->gcc_axim_cbcr = devm_clk_get(wcss->dev, "gcc_axim_cbcr"); @@ -921,7 +921,7 @@ static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) ret = PTR_ERR(wcss->gcc_axim_cbcr); if (ret != -EPROBE_DEFER) dev_err(wcss->dev, "failed to get gcc axim clock\n"); - return PTR_ERR(wcss->gcc_axim_cbcr); + return ret; } wcss->ahbfabric_cbcr_clk = devm_clk_get(wcss->dev, @@ -930,7 +930,7 @@ static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) ret = PTR_ERR(wcss->ahbfabric_cbcr_clk); if (ret != -EPROBE_DEFER) dev_err(wcss->dev, "failed to get ahbfabric clock\n"); - return PTR_ERR(wcss->ahbfabric_cbcr_clk); + return ret; } wcss->lcc_csr_cbcr = devm_clk_get(wcss->dev, "tcsr_lcc_cbc"); @@ -938,7 +938,7 @@ static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) ret = PTR_ERR(wcss->lcc_csr_cbcr); if (ret != -EPROBE_DEFER) dev_err(wcss->dev, "failed to get csr cbcr clk\n"); - return PTR_ERR(wcss->lcc_csr_cbcr); + return ret; } wcss->ahbs_cbcr = devm_clk_get(wcss->dev, @@ -947,7 +947,7 @@ static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) ret = PTR_ERR(wcss->ahbs_cbcr); if (ret != -EPROBE_DEFER) dev_err(wcss->dev, "failed to get ahbs_cbcr clk\n"); - return PTR_ERR(wcss->ahbs_cbcr); + return ret; } wcss->tcm_slave_cbcr = devm_clk_get(wcss->dev, @@ -956,7 +956,7 @@ static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) ret = PTR_ERR(wcss->tcm_slave_cbcr); if (ret != -EPROBE_DEFER) dev_err(wcss->dev, "failed to get tcm cbcr clk\n"); - return PTR_ERR(wcss->tcm_slave_cbcr); + return ret; } wcss->qdsp6ss_abhm_cbcr = devm_clk_get(wcss->dev, "lcc_abhm_cbc"); @@ -964,7 +964,7 @@ static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) ret = PTR_ERR(wcss->qdsp6ss_abhm_cbcr); if (ret != -EPROBE_DEFER) dev_err(wcss->dev, "failed to get abhm cbcr clk\n"); - return PTR_ERR(wcss->qdsp6ss_abhm_cbcr); + return ret; } wcss->qdsp6ss_axim_cbcr = devm_clk_get(wcss->dev, "lcc_axim_cbc"); @@ -980,7 +980,7 @@ static int q6v5_wcss_init_clock(struct q6v5_wcss *wcss) ret = PTR_ERR(wcss->lcc_bcr_sleep); if (ret != -EPROBE_DEFER) dev_err(wcss->dev, "failed to get bcr cbcr clk\n"); - return PTR_ERR(wcss->lcc_bcr_sleep); + return ret; } return 0; From 889cb0d43d18ed348971e49378ed6a63e3a4745e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 31 Mar 2021 12:27:09 +0000 Subject: [PATCH 0392/1379] remoteproc: imx_rproc: fix build error without CONFIG_MAILBOX Fix build error when CONFIG_MAILBOX is not set: arm-linux-gnueabi-ld: drivers/remoteproc/imx_rproc.o: in function `imx_rproc_kick': imx_rproc.c:(.text+0x328): undefined reference to `mbox_send_message' arm-linux-gnueabi-ld: drivers/remoteproc/imx_rproc.o: in function `imx_rproc_probe': imx_rproc.c:(.text+0x52c): undefined reference to `mbox_request_channel_byname' arm-linux-gnueabi-ld: imx_rproc.c:(.text+0x548): undefined reference to `mbox_request_channel_byname' arm-linux-gnueabi-ld: imx_rproc.c:(.text+0x76c): undefined reference to `mbox_free_channel' arm-linux-gnueabi-ld: imx_rproc.c:(.text+0x774): undefined reference to `mbox_free_channel' arm-linux-gnueabi-ld: imx_rproc.c:(.text+0x7c4): undefined reference to `mbox_free_channel' arm-linux-gnueabi-ld: drivers/remoteproc/imx_rproc.o: in function `imx_rproc_remove': imx_rproc.c:(.text+0x86c): undefined reference to `mbox_free_channel' arm-linux-gnueabi-ld: imx_rproc.c:(.text+0x874): undefined reference to `mbox_free_channel' make: *** [Makefile:1292: vmlinux] Error 1 Reviewed-by: Mathieu Poirier Fixes: 2df7062002d0 ("remoteproc: imx_proc: enable virtio/mailbox") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Link: https://lore.kernel.org/r/20210331122709.3935521-1-weiyongjun1@huawei.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index 7cf3d1b40c55..e68fcedc999c 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -26,6 +26,7 @@ config REMOTEPROC_CDEV config IMX_REMOTEPROC tristate "i.MX remoteproc support" depends on ARCH_MXC + select MAILBOX help Say y here to support iMX's remote processors via the remote processor framework. From 6e962bfe56b99843ab716dc8a9438039476c99e2 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Thu, 8 Apr 2021 09:44:46 +0800 Subject: [PATCH 0393/1379] remoteproc: imx_rproc: add missing of_node_put After of_parse_phandle, we need of_node_put to decrease the refcount of the device_node. Reported-by: Mathieu Poirier Signed-off-by: Peng Fan Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/1617846289-13496-2-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 6d3207ccbaef..077413319f58 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -459,6 +459,8 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, return err; } + of_node_put(node); + if (b >= IMX7D_RPROC_MEM_MAX) break; From f638a19775ae60ae919ff604fdc04362ff4f817f Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Thu, 8 Apr 2021 09:44:47 +0800 Subject: [PATCH 0394/1379] remoteproc: imx_rproc: enlarge IMX7D_RPROC_MEM_MAX 8 is not enough when we need more, such as resource table for remote cores that booted before Linux Kernel, so enlarge IMX7D_RPROC_MEM_MAX to 32. And also rename it to IMX_RPROC_MEM_MAX which make more sense. Signed-off-by: Peng Fan Reviewed-by: Mathieu Poirier Link: https://lore.kernel.org/r/1617846289-13496-3-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 077413319f58..b05aae0ad7a2 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -48,7 +48,7 @@ | IMX6SX_SW_M4C_NON_SCLR_RST \ | IMX6SX_SW_M4C_RST) -#define IMX7D_RPROC_MEM_MAX 8 +#define IMX_RPROC_MEM_MAX 32 /** * struct imx_rproc_mem - slim internal memory structure @@ -88,7 +88,7 @@ struct imx_rproc { struct regmap *regmap; struct rproc *rproc; const struct imx_rproc_dcfg *dcfg; - struct imx_rproc_mem mem[IMX7D_RPROC_MEM_MAX]; + struct imx_rproc_mem mem[IMX_RPROC_MEM_MAX]; struct clk *clk; struct mbox_client cl; struct mbox_chan *tx_ch; @@ -272,7 +272,7 @@ static void *imx_rproc_da_to_va(struct rproc *rproc, u64 da, size_t len, bool *i if (imx_rproc_da_to_sys(priv, da, len, &sys)) return NULL; - for (i = 0; i < IMX7D_RPROC_MEM_MAX; i++) { + for (i = 0; i < IMX_RPROC_MEM_MAX; i++) { if (sys >= priv->mem[i].sys_addr && sys + len < priv->mem[i].sys_addr + priv->mem[i].size) { unsigned int offset = sys - priv->mem[i].sys_addr; @@ -425,7 +425,7 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, if (!(att->flags & ATT_OWN)) continue; - if (b >= IMX7D_RPROC_MEM_MAX) + if (b >= IMX_RPROC_MEM_MAX) break; priv->mem[b].cpu_addr = devm_ioremap(&pdev->dev, @@ -461,7 +461,7 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, of_node_put(node); - if (b >= IMX7D_RPROC_MEM_MAX) + if (b >= IMX_RPROC_MEM_MAX) break; /* Not use resource version, because we might share region */ From 10a3d4079eaea06472f1981152e2840e7232ffa9 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Thu, 8 Apr 2021 09:44:48 +0800 Subject: [PATCH 0395/1379] remoteproc: imx_rproc: move memory parsing to rproc_ops Use the rproc_ops::prepare() hook for doing memory resources reallocation when reattaching a remote procesor. Suggested-by: Mathieu Poirier Reviewed-by: Mathieu Poirier Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1617846289-13496-4-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index b05aae0ad7a2..7cd09971d1a4 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -317,7 +317,7 @@ static int imx_rproc_mem_release(struct rproc *rproc, return 0; } -static int imx_rproc_parse_memory_regions(struct rproc *rproc) +static int imx_rproc_prepare(struct rproc *rproc) { struct imx_rproc *priv = rproc->priv; struct device_node *np = priv->dev->of_node; @@ -363,10 +363,7 @@ static int imx_rproc_parse_memory_regions(struct rproc *rproc) static int imx_rproc_parse_fw(struct rproc *rproc, const struct firmware *fw) { - int ret = imx_rproc_parse_memory_regions(rproc); - - if (ret) - return ret; + int ret; ret = rproc_elf_load_rsc_table(rproc, fw); if (ret) @@ -399,6 +396,7 @@ static void imx_rproc_kick(struct rproc *rproc, int vqid) } static const struct rproc_ops imx_rproc_ops = { + .prepare = imx_rproc_prepare, .start = imx_rproc_start, .stop = imx_rproc_stop, .kick = imx_rproc_kick, From 5e4c1243071d29ed5511121d044116b942ba6a7b Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Thu, 8 Apr 2021 09:44:49 +0800 Subject: [PATCH 0396/1379] remoteproc: imx_rproc: support remote cores booted before Linux Kernel - When remote cores are kicked before Linux Kernel, we are not able to get resource table from the firmware elf file, so we need to add rsc_table to hold the resource table published by remote cores and imx_rproc_get_loaded_rsc_table is to get the resource table. - Per remoteproc framework, add attach hook for processor in a detached state. - Add imx_rproc_detect_mode to detect remote cores' working mode to set the state which is required by remoteproc framework. Reviewed-by: Mathieu Poirier Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/1617846289-13496-5-git-send-email-peng.fan@oss.nxp.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/imx_rproc.c | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/drivers/remoteproc/imx_rproc.c b/drivers/remoteproc/imx_rproc.c index 7cd09971d1a4..d6338872c6db 100644 --- a/drivers/remoteproc/imx_rproc.c +++ b/drivers/remoteproc/imx_rproc.c @@ -95,6 +95,7 @@ struct imx_rproc { struct mbox_chan *rx_ch; struct work_struct rproc_work; struct workqueue_struct *workqueue; + void __iomem *rsc_table; }; static const struct imx_rproc_att imx_rproc_att_imx8mq[] = { @@ -395,8 +396,26 @@ static void imx_rproc_kick(struct rproc *rproc, int vqid) __func__, vqid, err); } +static int imx_rproc_attach(struct rproc *rproc) +{ + return 0; +} + +static struct resource_table *imx_rproc_get_loaded_rsc_table(struct rproc *rproc, size_t *table_sz) +{ + struct imx_rproc *priv = rproc->priv; + + /* The resource table has already been mapped in imx_rproc_addr_init */ + if (!priv->rsc_table) + return NULL; + + *table_sz = SZ_1K; + return (struct resource_table *)priv->rsc_table; +} + static const struct rproc_ops imx_rproc_ops = { .prepare = imx_rproc_prepare, + .attach = imx_rproc_attach, .start = imx_rproc_start, .stop = imx_rproc_stop, .kick = imx_rproc_kick, @@ -404,6 +423,7 @@ static const struct rproc_ops imx_rproc_ops = { .load = rproc_elf_load_segments, .parse_fw = imx_rproc_parse_fw, .find_loaded_rsc_table = rproc_elf_find_loaded_rsc_table, + .get_loaded_rsc_table = imx_rproc_get_loaded_rsc_table, .sanity_check = rproc_elf_sanity_check, .get_boot_addr = rproc_elf_get_boot_addr, }; @@ -470,6 +490,8 @@ static int imx_rproc_addr_init(struct imx_rproc *priv, } priv->mem[b].sys_addr = res.start; priv->mem[b].size = resource_size(&res); + if (!strcmp(node->name, "rsc_table")) + priv->rsc_table = priv->mem[b].cpu_addr; b++; } @@ -536,6 +558,25 @@ static void imx_rproc_free_mbox(struct rproc *rproc) mbox_free_channel(priv->rx_ch); } +static int imx_rproc_detect_mode(struct imx_rproc *priv) +{ + const struct imx_rproc_dcfg *dcfg = priv->dcfg; + struct device *dev = priv->dev; + int ret; + u32 val; + + ret = regmap_read(priv->regmap, dcfg->src_reg, &val); + if (ret) { + dev_err(dev, "Failed to read src\n"); + return ret; + } + + if (!(val & dcfg->src_stop)) + priv->rproc->state = RPROC_DETACHED; + + return 0; +} + static int imx_rproc_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -590,6 +631,10 @@ static int imx_rproc_probe(struct platform_device *pdev) goto err_put_mbox; } + ret = imx_rproc_detect_mode(priv); + if (ret) + goto err_put_mbox; + priv->clk = devm_clk_get(dev, NULL); if (IS_ERR(priv->clk)) { dev_err(dev, "Failed to get clock\n"); From 82eae5a432cb2d50d7ea51428dcad7894e4165fe Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Apr 2021 14:59:59 -0400 Subject: [PATCH 0397/1379] NFSv4: nfs4_inc/dec_nlink_locked should also invalidate ctime If the nlink changes, then so will the ctime. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0b0a48d78299..d05f4ca5d9c0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1169,14 +1169,18 @@ int nfs4_call_sync(struct rpc_clnt *clnt, static void nfs4_inc_nlink_locked(struct inode *inode) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_NLINK); inc_nlink(inode); } static void nfs4_dec_nlink_locked(struct inode *inode) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_NLINK); drop_nlink(inode); } @@ -4602,11 +4606,11 @@ _nfs4_proc_remove(struct inode *dir, const struct qstr *name, u32 ftype) status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); if (status == 0) { spin_lock(&dir->i_lock); - nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp, - NFS_INO_INVALID_DATA); /* Removing a directory decrements nlink in the parent */ if (ftype == NF4DIR && dir->i_nlink > 2) nfs4_dec_nlink_locked(dir); + nfs4_update_changeattr_locked(dir, &res.cinfo, timestamp, + NFS_INO_INVALID_DATA); spin_unlock(&dir->i_lock); } return status; @@ -4864,12 +4868,12 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { spin_lock(&dir->i_lock); - nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, - data->res.fattr->time_start, - NFS_INO_INVALID_DATA); /* Creating a directory bumps nlink in the parent */ if (data->arg.ftype == NF4DIR) nfs4_inc_nlink_locked(dir); + nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo, + data->res.fattr->time_start, + NFS_INO_INVALID_DATA); spin_unlock(&dir->i_lock); status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); } From 1301e421b75b90b1a6101961b3aca2d91a9a0599 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Apr 2021 14:57:56 -0400 Subject: [PATCH 0398/1379] NFSv4: link must update the inode nlink. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/nfs4proc.c | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d2835d211a73..5c25c8cc037a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1711,7 +1711,7 @@ static void nfs_drop_nlink(struct inode *inode) NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); nfs_set_cache_invalid( inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | - NFS_INO_INVALID_NLINK | NFS_INO_REVAL_FORCED); + NFS_INO_INVALID_NLINK); spin_unlock(&inode->i_lock); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d05f4ca5d9c0..2215f20e0e78 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1175,6 +1175,14 @@ nfs4_inc_nlink_locked(struct inode *inode) inc_nlink(inode); } +static void +nfs4_inc_nlink(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs4_inc_nlink_locked(inode); + spin_unlock(&inode->i_lock); +} + static void nfs4_dec_nlink_locked(struct inode *inode) { @@ -4791,6 +4799,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct if (!status) { nfs4_update_changeattr(dir, &res.cinfo, res.fattr->time_start, NFS_INO_INVALID_DATA); + nfs4_inc_nlink(inode); status = nfs_post_op_update_inode(inode, res.fattr); if (!status) nfs_setsecurity(inode, res.fattr, res.label); From cc7f2dae63bca9579d65a46f3bf2c48a15961e19 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 11 Apr 2021 14:31:24 -0400 Subject: [PATCH 0399/1379] NFS: Don't store NFS_INO_REVAL_FORCED NFS_INO_REVAL_FORCED is intended to tell us that the cache needs revalidation despite the fact that we hold a delegation. We shouldn't need to store it anymore, though. Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 21 +++++++++++---- fs/nfs/delegation.h | 3 +-- fs/nfs/inode.c | 66 +++++++++++++++++---------------------------- fs/nfs/nfs4proc.c | 5 +--- 4 files changed, 43 insertions(+), 52 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 6a29de964268..e6ec6f09ac6e 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -481,6 +481,22 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, if (freeme == NULL) goto out; add_new: + /* + * If we didn't revalidate the change attribute before setting + * the delegation, then pre-emptively ask for a full attribute + * cache revalidation. + */ + spin_lock(&inode->i_lock); + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_CHANGE) + nfs_set_cache_invalid(inode, + NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | + NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | + NFS_INO_INVALID_OTHER | NFS_INO_INVALID_DATA | + NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); + spin_unlock(&inode->i_lock); + list_add_tail_rcu(&delegation->super_list, &server->delegations); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -488,11 +504,6 @@ add_new: atomic_long_inc(&nfs_active_delegations); trace_nfs4_set_delegation(inode, type); - - spin_lock(&inode->i_lock); - if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) - NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED; - spin_unlock(&inode->i_lock); out: spin_unlock(&clp->cl_lock); if (delegation != NULL) diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 9b00a0b7f832..c19b4fd20781 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -84,8 +84,7 @@ int nfs4_inode_make_writeable(struct inode *inode); static inline int nfs_have_delegated_attributes(struct inode *inode) { - return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) && - !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ); } #endif diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b88e9dc72eec..7fa914e24fc4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -200,21 +200,19 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) if (have_delegation) { if (!(flags & NFS_INO_REVAL_FORCED)) flags &= ~(NFS_INO_INVALID_MODE | - NFS_INO_INVALID_OTHER); - flags &= ~(NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_SIZE - | NFS_INO_INVALID_XATTR); + NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_XATTR); + flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); } else if (flags & NFS_INO_REVAL_PAGECACHE) flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE; - flags &= ~NFS_INO_REVAL_PAGECACHE; - if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode); if (inode->i_mapping->nrpages == 0) flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); + flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); nfsi->cache_validity |= flags; } EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); @@ -560,9 +558,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st } else if (fattr->size != 0) nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); - if (nfsi->cache_validity != 0) - nfsi->cache_validity |= NFS_INO_REVAL_FORCED; - nfs_setsecurity(inode, fattr, label); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); @@ -2032,7 +2027,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) save_cache_validity & (NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | - NFS_INO_REVAL_FORCED); + NFS_INO_INVALID_BLOCKS); cache_revalidated = false; } @@ -2064,27 +2059,24 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) attr_changed = true; } } else { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_CHANGE - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_CHANGE; cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_MTIME) { inode->i_mtime = fattr->mtime; } else if (server->caps & NFS_CAP_MTIME) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_MTIME - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_MTIME; cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_CTIME) { inode->i_ctime = fattr->ctime; } else if (server->caps & NFS_CAP_CTIME) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_CTIME - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_CTIME; cache_revalidated = false; } @@ -2115,19 +2107,16 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) fattr->valid |= NFS_ATTR_FATTR_SPACE_USED; } } else { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_SIZE - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_SIZE; cache_revalidated = false; } - if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (server->caps & NFS_CAP_ATIME) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_ATIME; cache_revalidated = false; } @@ -2141,9 +2130,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) attr_changed = true; } } else if (server->caps & NFS_CAP_MODE) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_MODE - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_MODE; cache_revalidated = false; } @@ -2155,9 +2143,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) attr_changed = true; } } else if (server->caps & NFS_CAP_OWNER) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_OTHER - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_OTHER; cache_revalidated = false; } @@ -2169,9 +2156,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) attr_changed = true; } } else if (server->caps & NFS_CAP_OWNER_GROUP) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_OTHER - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_OTHER; cache_revalidated = false; } @@ -2183,9 +2169,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) attr_changed = true; } } else if (server->caps & NFS_CAP_NLINK) { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_NLINK - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_NLINK; cache_revalidated = false; } @@ -2197,9 +2182,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; else { - nfsi->cache_validity |= save_cache_validity & - (NFS_INO_INVALID_BLOCKS - | NFS_INO_REVAL_FORCED); + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_BLOCKS; cache_revalidated = false; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2215f20e0e78..bcbb057d5529 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5477,10 +5477,7 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, if (cache_validity & NFS_INO_INVALID_BLOCKS) bitmask[1] |= FATTR4_WORD1_SPACE_USED; - if (nfs4_have_delegation(inode, FMODE_READ) && - !(cache_validity & NFS_INO_REVAL_FORCED)) - bitmask[0] &= ~FATTR4_WORD0_SIZE; - else if (cache_validity & NFS_INO_INVALID_SIZE) + if (cache_validity & NFS_INO_INVALID_SIZE) bitmask[0] |= FATTR4_WORD0_SIZE; for (i = 0; i < NFS4_BITMASK_SZ; i++) From ce62b114bbad9346641d16853c528ba01513e1b0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 5 Mar 2018 15:01:18 -0500 Subject: [PATCH 0400/1379] NFS: Split attribute support out from the server capabilities There are lots of attributes, and they are crowding out the bit space. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 15 +++++++++--- fs/nfs/inode.c | 51 ++++++++++++++++++++++++--------------- fs/nfs/nfs4proc.c | 49 +++++++++++++++++++------------------ include/linux/nfs_fs_sb.h | 11 ++------- 4 files changed, 70 insertions(+), 56 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2aeb4e52a4f1..cfeaadf56bf0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -696,9 +696,18 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = ctx->flags; server->options = ctx->options; - server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| - NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; + server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; + + switch (clp->rpc_ops->version) { + case 2: + server->fattr_valid = NFS_ATTR_FATTR_V2; + break; + case 3: + server->fattr_valid = NFS_ATTR_FATTR_V3; + break; + default: + server->fattr_valid = NFS_ATTR_FATTR_V4; + } if (ctx->rsize) server->rsize = nfs_block_size(ctx->rsize, NULL); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7fa914e24fc4..6d04ebb4f084 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -438,6 +438,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st .fattr = fattr }; struct inode *inode = ERR_PTR(-ENOENT); + u64 fattr_supported = NFS_SB(sb)->fattr_valid; unsigned long hash; nfs_attr_check_mountpoint(sb, fattr); @@ -470,7 +471,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_mode = fattr->mode; nfsi->cache_validity = 0; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 - && nfs_server_capable(inode, NFS_CAP_MODE)) + && (fattr_supported & NFS_ATTR_FATTR_MODE)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. @@ -516,15 +517,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; - else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; - else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; - else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); @@ -536,26 +537,30 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); - else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; - else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; - else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (nfs_server_capable(inode, NFS_CAP_XATTR)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; - else if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { + else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED && + fattr->size != 0) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); + if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else if (fattr->size != 0) + } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED && + fattr->size != 0) nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); nfs_setsecurity(inode, fattr, label); @@ -1952,9 +1957,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); */ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_server *server; + struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_isize, new_isize; + u64 fattr_supported = server->fattr_valid; unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; @@ -1998,7 +2004,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) goto out_err; } - server = NFS_SERVER(inode); /* Update the fsid? */ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && @@ -2066,7 +2071,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_MTIME) { inode->i_mtime = fattr->mtime; - } else if (server->caps & NFS_CAP_MTIME) { + } else if (fattr_supported & NFS_ATTR_FATTR_MTIME) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MTIME; cache_revalidated = false; @@ -2074,7 +2079,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_CTIME) { inode->i_ctime = fattr->ctime; - } else if (server->caps & NFS_CAP_CTIME) { + } else if (fattr_supported & NFS_ATTR_FATTR_CTIME) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; cache_revalidated = false; @@ -2114,7 +2119,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; - else if (server->caps & NFS_CAP_ATIME) { + else if (fattr_supported & NFS_ATTR_FATTR_ATIME) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATIME; cache_revalidated = false; @@ -2129,7 +2134,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_ACL; attr_changed = true; } - } else if (server->caps & NFS_CAP_MODE) { + } else if (fattr_supported & NFS_ATTR_FATTR_MODE) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MODE; cache_revalidated = false; @@ -2142,7 +2147,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_uid = fattr->uid; attr_changed = true; } - } else if (server->caps & NFS_CAP_OWNER) { + } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; cache_revalidated = false; @@ -2155,7 +2160,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid = fattr->gid; attr_changed = true; } - } else if (server->caps & NFS_CAP_OWNER_GROUP) { + } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; cache_revalidated = false; @@ -2168,7 +2173,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); attr_changed = true; } - } else if (server->caps & NFS_CAP_NLINK) { + } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_NLINK; cache_revalidated = false; @@ -2179,9 +2184,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) + } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) { + nfsi->cache_validity |= + save_cache_validity & NFS_INO_INVALID_BLOCKS; + cache_revalidated = false; + } + + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) { inode->i_blocks = fattr->du.nfs2.blocks; - else { + } else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; cache_revalidated = false; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bcbb057d5529..21c31aebb116 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3868,12 +3868,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK; } memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); - server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| - NFS_CAP_SYMLINKS|NFS_CAP_FILEID| - NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| - NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| - NFS_CAP_CTIME|NFS_CAP_MTIME| - NFS_CAP_SECURITY_LABEL); + server->caps &= ~(NFS_CAP_ACLS | NFS_CAP_HARDLINKS | + NFS_CAP_SYMLINKS| NFS_CAP_SECURITY_LABEL); + server->fattr_valid = NFS_ATTR_FATTR_V4; if (res.attr_bitmask[0] & FATTR4_WORD0_ACL && res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) server->caps |= NFS_CAP_ACLS; @@ -3881,25 +3878,29 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; - if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) - server->caps |= NFS_CAP_FILEID; - if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) - server->caps |= NFS_CAP_MODE; - if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) - server->caps |= NFS_CAP_NLINK; - if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) - server->caps |= NFS_CAP_OWNER; - if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) - server->caps |= NFS_CAP_OWNER_GROUP; - if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) - server->caps |= NFS_CAP_ATIME; - if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) - server->caps |= NFS_CAP_CTIME; - if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) - server->caps |= NFS_CAP_MTIME; + if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID)) + server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE)) + server->fattr_valid &= ~NFS_ATTR_FATTR_MODE; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)) + server->fattr_valid &= ~NFS_ATTR_FATTR_NLINK; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER)) + server->fattr_valid &= ~(NFS_ATTR_FATTR_OWNER | + NFS_ATTR_FATTR_OWNER_NAME); + if (!(res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)) + server->fattr_valid &= ~(NFS_ATTR_FATTR_GROUP | + NFS_ATTR_FATTR_GROUP_NAME); + if (!(res.attr_bitmask[1] & FATTR4_WORD1_SPACE_USED)) + server->fattr_valid &= ~NFS_ATTR_FATTR_SPACE_USED; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)) + server->fattr_valid &= ~NFS_ATTR_FATTR_ATIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)) + server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME; + if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)) + server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME; #ifdef CONFIG_NFS_V4_SECURITY_LABEL - if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) - server->caps |= NFS_CAP_SECURITY_LABEL; + if (!(res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL)) + server->fattr_valid &= ~NFS_ATTR_FATTR_V4_SECURITY_LABEL; #endif memcpy(server->attr_bitmask_nl, res.attr_bitmask, sizeof(server->attr_bitmask)); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index fbcdfd9f7a7f..d28d7a62864f 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -191,6 +191,8 @@ struct nfs_server { dev_t s_dev; /* superblock dev numbers */ struct nfs_auth_info auth_info; /* parsed auth flavors */ + __u64 fattr_valid; /* Valid attributes */ + #ifdef CONFIG_NFS_FSCACHE struct nfs_fscache_key *fscache_key; /* unique key for superblock */ struct fscache_cookie *fscache; /* superblock cookie */ @@ -267,16 +269,7 @@ struct nfs_server { #define NFS_CAP_SYMLINKS (1U << 2) #define NFS_CAP_ACLS (1U << 3) #define NFS_CAP_ATOMIC_OPEN (1U << 4) -/* #define NFS_CAP_CHANGE_ATTR (1U << 5) */ #define NFS_CAP_LGOPEN (1U << 5) -#define NFS_CAP_FILEID (1U << 6) -#define NFS_CAP_MODE (1U << 7) -#define NFS_CAP_NLINK (1U << 8) -#define NFS_CAP_OWNER (1U << 9) -#define NFS_CAP_OWNER_GROUP (1U << 10) -#define NFS_CAP_ATIME (1U << 11) -#define NFS_CAP_CTIME (1U << 12) -#define NFS_CAP_MTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) #define NFS_CAP_STATEID_NFSV41 (1U << 16) From da934ae0a8aa20f5ac6bfa04bdfe8bd8c5b438e7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Apr 2021 14:38:13 -0400 Subject: [PATCH 0401/1379] NFSv4: Add tracing for COMPOUND errors When the server returns a different operation than we expected, then trace that. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4trace.h | 35 +++++++++++++++++++++++++++++++++++ fs/nfs/nfs4xdr.c | 4 +--- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 48d761e593fb..3c505bed52a7 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -666,6 +666,41 @@ TRACE_EVENT(nfs4_state_mgr_failed, ) ) +TRACE_EVENT(nfs4_xdr_bad_operation, + TP_PROTO( + const struct xdr_stream *xdr, + u32 op, + u32 expected + ), + + TP_ARGS(xdr, op, expected), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(u32, op) + __field(u32, expected) + ), + + TP_fast_assign( + const struct rpc_rqst *rqstp = xdr->rqst; + const struct rpc_task *task = rqstp->rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client->cl_clid; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->op = op; + __entry->expected = expected; + ), + + TP_printk( + "task:%u@%d xid=0x%08x operation=%u, expected=%u", + __entry->task_id, __entry->client_id, __entry->xid, + __entry->op, __entry->expected + ) +); + TRACE_EVENT(nfs4_xdr_status, TP_PROTO( const struct xdr_stream *xdr, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index edac4718dec1..ee7a7a132acc 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3210,9 +3210,7 @@ out_status: *nfs_retval = nfs4_stat_to_errno(nfserr); return true; out_bad_operation: - dprintk("nfs: Server returned operation" - " %d but we issued a request for %d\n", - opnum, expected); + trace_nfs4_xdr_bad_operation(xdr, opnum, expected); *nfs_retval = -EREMOTEIO; return false; out_overflow: From 3d66bae156a652be18e278f3c88bc3e069ae824b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Apr 2021 13:54:19 -0400 Subject: [PATCH 0402/1379] NFSv4: Convert nfs_xdr_status tracepoint to an event class We would like the ability to record other XDR errors, particularly those that are due to server bugs. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4trace.h | 11 ++++++++++- fs/nfs/nfstrace.h | 10 +++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 3c505bed52a7..114983a706e9 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -701,7 +701,7 @@ TRACE_EVENT(nfs4_xdr_bad_operation, ) ); -TRACE_EVENT(nfs4_xdr_status, +DECLARE_EVENT_CLASS(nfs4_xdr_event, TP_PROTO( const struct xdr_stream *xdr, u32 op, @@ -736,6 +736,15 @@ TRACE_EVENT(nfs4_xdr_status, __entry->op ) ); +#define DEFINE_NFS4_XDR_EVENT(name) \ + DEFINE_EVENT(nfs4_xdr_event, name, \ + TP_PROTO( \ + const struct xdr_stream *xdr, \ + u32 op, \ + u32 error \ + ), \ + TP_ARGS(xdr, op, error)) +DEFINE_NFS4_XDR_EVENT(nfs4_xdr_status); DECLARE_EVENT_CLASS(nfs4_cb_error_class, TP_PROTO( diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 41a161cd31f6..45eda4db42fd 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1401,7 +1401,7 @@ TRACE_DEFINE_ENUM(NFSERR_JUKEBOX); { NFSERR_BADTYPE, "BADTYPE" }, \ { NFSERR_JUKEBOX, "JUKEBOX" }) -TRACE_EVENT(nfs_xdr_status, +DECLARE_EVENT_CLASS(nfs_xdr_event, TP_PROTO( const struct xdr_stream *xdr, int error @@ -1443,6 +1443,14 @@ TRACE_EVENT(nfs_xdr_status, nfs_show_status(__entry->error) ) ); +#define DEFINE_NFS_XDR_EVENT(name) \ + DEFINE_EVENT(nfs_xdr_event, name, \ + TP_PROTO( \ + const struct xdr_stream *xdr, \ + int error \ + ), \ + TP_ARGS(xdr, error)) +DEFINE_NFS_XDR_EVENT(nfs_xdr_status); #endif /* _TRACE_NFS_H */ From eb3d58c68e39fad68d8054e0324eb06d82dcedbb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Apr 2021 14:03:26 -0400 Subject: [PATCH 0403/1379] NFSv4: Catch and trace server filehandle encoding errors If the server returns a filehandle with an invalid length, then trace that, and return an EREMOTEIO error. Signed-off-by: Trond Myklebust --- fs/nfs/nfs3xdr.c | 4 ++-- fs/nfs/nfs4trace.h | 1 + fs/nfs/nfs4xdr.c | 13 +++++++++---- fs/nfs/nfstrace.c | 1 + fs/nfs/nfstrace.h | 1 + 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 83ad62c81fc7..e6eca1d7481b 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -433,7 +433,7 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) if (unlikely(!p)) return -EIO; length = be32_to_cpup(p++); - if (unlikely(length > NFS3_FHSIZE)) + if (unlikely(length > NFS3_FHSIZE || length == 0)) goto out_toobig; p = xdr_inline_decode(xdr, length); if (unlikely(!p)) @@ -442,7 +442,7 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) memcpy(fh->data, p, length); return 0; out_toobig: - dprintk("NFS: file handle size (%u) too big\n", length); + trace_nfs_xdr_bad_filehandle(xdr, NFSERR_BADHANDLE); return -E2BIG; } diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 114983a706e9..2ef75caad6da 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -745,6 +745,7 @@ DECLARE_EVENT_CLASS(nfs4_xdr_event, ), \ TP_ARGS(xdr, op, error)) DEFINE_NFS4_XDR_EVENT(nfs4_xdr_status); +DEFINE_NFS4_XDR_EVENT(nfs4_xdr_bad_filehandle); DECLARE_EVENT_CLASS(nfs4_cb_error_class, TP_PROTO( diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ee7a7a132acc..3d7e10c26597 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3495,8 +3495,11 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru if (unlikely(!p)) return -EIO; len = be32_to_cpup(p); - if (len > NFS4_FHSIZE) - return -EIO; + if (len > NFS4_FHSIZE || len == 0) { + trace_nfs4_xdr_bad_filehandle(xdr, OP_READDIR, + NFS4ERR_BADHANDLE); + return -EREMOTEIO; + } p = xdr_inline_decode(xdr, len); if (unlikely(!p)) return -EIO; @@ -4952,8 +4955,10 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) if (unlikely(!p)) return -EIO; len = be32_to_cpup(p); - if (len > NFS4_FHSIZE) - return -EIO; + if (len > NFS4_FHSIZE || len == 0) { + trace_nfs4_xdr_bad_filehandle(xdr, OP_GETFH, NFS4ERR_BADHANDLE); + return -EREMOTEIO; + } fh->size = len; p = xdr_inline_decode(xdr, len); if (unlikely(!p)) diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index a90b363500c2..5d1bfccbb4da 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -12,3 +12,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_bad_filehandle); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 45eda4db42fd..eb1ef3462e84 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1451,6 +1451,7 @@ DECLARE_EVENT_CLASS(nfs_xdr_event, ), \ TP_ARGS(xdr, error)) DEFINE_NFS_XDR_EVENT(nfs_xdr_status); +DEFINE_NFS_XDR_EVENT(nfs_xdr_bad_filehandle); #endif /* _TRACE_NFS_H */ From ed34695e15aba74f45247f1ee2cf7e09d449f925 Mon Sep 17 00:00:00 2001 From: Nikola Livic Date: Mon, 29 Mar 2021 11:56:49 +0300 Subject: [PATCH 0404/1379] pNFS/flexfiles: fix incorrect size check in decode_nfs_fh() We (adam zabrocki, alexander matrosov, alexander tereshkin, maksym bazalii) observed the check: if (fh->size > sizeof(struct nfs_fh)) should not use the size of the nfs_fh struct which includes an extra two bytes from the size field. struct nfs_fh { unsigned short size; unsigned char data[NFS_MAXFHSIZE]; } but should determine the size from data[NFS_MAXFHSIZE] so the memcpy will not write 2 bytes beyond destination. The proposed fix is to compare against the NFS_MAXFHSIZE directly, as is done elsewhere in fs code base. Fixes: d67ae825a59d ("pnfs/flexfiles: Add the FlexFile Layout Driver") Signed-off-by: Nikola Livic Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 872112bffcab..d383de00d486 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -106,7 +106,7 @@ static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh) if (unlikely(!p)) return -ENOBUFS; fh->size = be32_to_cpup(p++); - if (fh->size > sizeof(struct nfs_fh)) { + if (fh->size > NFS_MAXFHSIZE) { printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n", fh->size); return -EOVERFLOW; From 73f5c88f521a630ea1628beb9c2d48a2e777a419 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 31 Mar 2021 15:30:25 -0400 Subject: [PATCH 0405/1379] NFSv4.2 fix handling of sr_eof in SEEK's reply Currently the client ignores the value of the sr_eof of the SEEK operation. According to the spec, if the server didn't find the requested extent and reached the end of the file, the server would return sr_eof=true. In case the request for DATA and no data was found (ie in the middle of the hole), then the lseek expects that ENXIO would be returned. Fixes: 1c6dcbe5ceff8 ("NFS: Implement SEEK") Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 704a5246ccb5..8d64eb953347 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -667,7 +667,10 @@ static loff_t _nfs42_proc_llseek(struct file *filep, if (status) return status; - return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); + if (whence == SEEK_DATA && res.sr_eof) + return -NFS4ERR_NXIO; + else + return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); } loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) From 7638e0bfaed1b653d3ca663e560e9ffb44bb1030 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 31 Mar 2021 13:22:14 -0400 Subject: [PATCH 0406/1379] SUNRPC: Move fault injection call sites I've hit some crashes that occur in the xprt_rdma_inject_disconnect path. It appears that, for some provides, rdma_disconnect() can take so long that the transport can disconnect and release its hardware resources while rdma_disconnect() is still running, resulting in a UAF in the provider. The transport's fault injection method may depend on the stability of transport data structures. That means it needs to be invoked only from contexts that hold the transport write lock. Fixes: 4a0682583988 ("SUNRPC: Transport fault injection") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 1 - net/sunrpc/xprt.c | 6 ++++-- net/sunrpc/xprtrdma/transport.c | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 612f0a641f4c..c2a01125be1a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1799,7 +1799,6 @@ call_allocate(struct rpc_task *task) status = xprt->ops->buf_alloc(task); trace_rpc_buf_alloc(task, status); - xprt_inject_disconnect(xprt); if (status == 0) return; if (status != -ENOMEM) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a853f75d4968..80c94ead4aa0 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1485,7 +1485,10 @@ bool xprt_prepare_transmit(struct rpc_task *task) void xprt_end_transmit(struct rpc_task *task) { - xprt_release_write(task->tk_rqstp->rq_xprt, task); + struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; + + xprt_inject_disconnect(xprt); + xprt_release_write(xprt, task); } /** @@ -1887,7 +1890,6 @@ void xprt_release(struct rpc_task *task) spin_unlock(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(task); - xprt_inject_disconnect(xprt); xdr_free_bvec(&req->rq_rcv_buf); xdr_free_bvec(&req->rq_snd_buf); if (req->rq_cred != NULL) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 78d29d1bcc20..09953597d055 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -262,8 +262,10 @@ xprt_rdma_connect_worker(struct work_struct *work) * xprt_rdma_inject_disconnect - inject a connection fault * @xprt: transport context * - * If @xprt is connected, disconnect it to simulate spurious connection - * loss. + * If @xprt is connected, disconnect it to simulate spurious + * connection loss. Caller must hold @xprt's send lock to + * ensure that data structures and hardware resources are + * stable during the rdma_disconnect() call. */ static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) From e936a5970ef596ff48fca72aa8200955753c543f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 31 Mar 2021 13:22:27 -0400 Subject: [PATCH 0407/1379] SUNRPC: Add tracepoint that fires when an RPC is retransmitted A separate tracepoint can be left enabled all the time to capture rare but important retransmission events. So for example: kworker/u26:3-568 [009] 156.967933: xprt_retransmit: task:44093@5 xid=0xa25dbc79 nfsv3 WRITE ntrans=2 Or, for example, enable all nfs and nfs4 tracepoints, and set up a trigger to disable tracing when xprt_retransmit fires to capture everything that leads up to it. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 40 +++++++++++++++++++++++++++++++++++ net/sunrpc/xprt.c | 4 +++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 036eb1f5c133..430b42f2351f 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1079,6 +1079,46 @@ TRACE_EVENT(xprt_transmit, __entry->seqno, __entry->status) ); +TRACE_EVENT(xprt_retransmit, + TP_PROTO( + const struct rpc_rqst *rqst + ), + + TP_ARGS(rqst), + + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, xid) + __field(int, ntrans) + __field(int, version) + __string(progname, + rqst->rq_task->tk_client->cl_program->name) + __string(procedure, + rqst->rq_task->tk_msg.rpc_proc->p_name) + ), + + TP_fast_assign( + struct rpc_task *task = rqst->rq_task; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + __entry->xid = be32_to_cpu(rqst->rq_xid); + __entry->ntrans = rqst->rq_ntrans; + __assign_str(progname, + task->tk_client->cl_program->name) + __entry->version = task->tk_client->cl_vers; + __assign_str(procedure, task->tk_msg.rpc_proc->p_name) + ), + + TP_printk( + "task:%u@%u xid=0x%08x %sv%d %s ntrans=%d", + __entry->task_id, __entry->client_id, __entry->xid, + __get_str(progname), __entry->version, __get_str(procedure), + __entry->ntrans) +); + TRACE_EVENT(xprt_ping, TP_PROTO(const struct rpc_xprt *xprt, int status), diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 80c94ead4aa0..67039ee443eb 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1542,8 +1542,10 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) return status; } - if (is_retrans) + if (is_retrans) { task->tk_client->cl_stats->rpcretrans++; + trace_xprt_retransmit(req); + } xprt_inject_disconnect(xprt); From 6cf23783f750634e10daeede48b0f5f5d64ebf3a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 31 Mar 2021 16:03:08 -0400 Subject: [PATCH 0408/1379] SUNRPC: Remove trace_xprt_transmit_queued This tracepoint can crash when dereferencing snd_task because when some transports connect, they put a cookie in that field instead of a pointer to an rpc_task. BUG: KASAN: use-after-free in trace_event_raw_event_xprt_writelock_event+0x141/0x18e [sunrpc] Read of size 2 at addr ffff8881a83bd3a0 by task git/331872 CPU: 11 PID: 331872 Comm: git Tainted: G S 5.12.0-rc2-00007-g3ab6e585a7f9 #1453 Hardware name: Supermicro SYS-6028R-T/X10DRi, BIOS 1.1a 10/16/2015 Call Trace: dump_stack+0x9c/0xcf print_address_description.constprop.0+0x18/0x239 kasan_report+0x174/0x1b0 trace_event_raw_event_xprt_writelock_event+0x141/0x18e [sunrpc] xprt_prepare_transmit+0x8e/0xc1 [sunrpc] call_transmit+0x4d/0xc6 [sunrpc] Fixes: 9ce07ae5eb1d ("SUNRPC: Replace dprintk() call site in xprt_prepare_transmit") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 1 - net/sunrpc/xprt.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 430b42f2351f..3c3c4c843c62 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1181,7 +1181,6 @@ DECLARE_EVENT_CLASS(xprt_writelock_event, DEFINE_WRITELOCK_EVENT(reserve_xprt); DEFINE_WRITELOCK_EVENT(release_xprt); -DEFINE_WRITELOCK_EVENT(transmit_queued); DECLARE_EVENT_CLASS(xprt_cong_event, TP_PROTO( diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 67039ee443eb..baba7d14825a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1471,8 +1471,6 @@ bool xprt_prepare_transmit(struct rpc_task *task) struct rpc_xprt *xprt = req->rq_xprt; if (!xprt_lock_write(xprt, task)) { - trace_xprt_transmit_queued(xprt, task); - /* Race breaker: someone may have transmitted us */ if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) rpc_wake_up_queued_task_set_status(&xprt->sending, From 09252177d5f924f404551b4b4eded5daa7f04a3a Mon Sep 17 00:00:00 2001 From: Chris Dion Date: Sun, 4 Apr 2021 21:29:26 -0400 Subject: [PATCH 0409/1379] SUNRPC: Handle major timeout in xprt_adjust_timeout() Currently if a major timeout value is reached, but the minor value has not been reached, an ETIMEOUT will not be sent back to the caller. This can occur if the v4 server is not responding to requests and retrans is configured larger than the default of two. For example, A TCP mount with a configured timeout value of 50 and a retransmission count of 3 to a v4 server which is not responding: 1. Initial value and increment set to 5s, maxval set to 20s, retries at 3 2. Major timeout is set to 20s, minor timeout set to 5s initially 3. xport_adjust_timeout() is called after 5s, retry with 10s timeout, minor timeout is bumped to 10s 4. And again after another 10s, 15s total time with minor timeout set to 15s 5. After 20s total time xport_adjust_timeout is called as major timeout is reached, but skipped because the minor timeout is not reached - After this time the cpu spins continually calling xport_adjust_timeout() and returning 0 for 10 seconds. As seen on perf sched: 39243.913182 [0005] mount.nfs[3794] 4607.938 0.017 9746.863 6. This continues until the 15s minor timeout condition is reached (in this case for 10 seconds). After which the ETIMEOUT is processed back to the caller, the cpu spinning stops, and normal operations continue Fixes: 7de62bc09fe6 ("SUNRPC dont update timeout value on connection reset") Signed-off-by: Chris Dion Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index baba7d14825a..e5b5a960a69b 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -698,9 +698,9 @@ int xprt_adjust_timeout(struct rpc_rqst *req) const struct rpc_timeout *to = req->rq_task->tk_client->cl_timeout; int status = 0; - if (time_before(jiffies, req->rq_minortimeo)) - return status; if (time_before(jiffies, req->rq_majortimeo)) { + if (time_before(jiffies, req->rq_minortimeo)) + return status; if (to->to_exponential) req->rq_timeout <<= 1; else From 94d202d5ca39d0eb757d16ef2624b013fb64f64d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 14 Apr 2021 10:10:09 -0400 Subject: [PATCH 0410/1379] NFSv42: Copy offload should update the file size when appropriate If the result of a copy offload or clone operation is to grow the destination file size, then we should update it. The reason is that when a client holds a delegation, it is authoritative for the file size. Fixes: 16abd2a0c124 ("NFSv4.2: fix client's attribute cache management for copy_file_range") Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 8d64eb953347..3875120ef3ef 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -269,6 +269,33 @@ out: return status; } +/** + * nfs42_copy_dest_done - perform inode cache updates after clone/copy offload + * @inode: pointer to destination inode + * @pos: destination offset + * @len: copy length + * + * Punch a hole in the inode page cache, so that the NFS client will + * know to retrieve new data. + * Update the file size if necessary, and then mark the inode as having + * invalid cached values for change attribute, ctime, mtime and space used. + */ +static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) +{ + loff_t newsize = pos + len; + loff_t end = newsize - 1; + + truncate_pagecache_range(inode, pos, end); + spin_lock(&inode->i_lock); + if (newsize > i_size_read(inode)) + i_size_write(inode, newsize); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | + NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_MTIME | + NFS_INO_INVALID_BLOCKS); + spin_unlock(&inode->i_lock); +} + static ssize_t _nfs42_proc_copy(struct file *src, struct nfs_lock_context *src_lock, struct file *dst, @@ -362,14 +389,8 @@ static ssize_t _nfs42_proc_copy(struct file *src, goto out; } - truncate_pagecache_range(dst_inode, pos_dst, - pos_dst + res->write_res.count); - spin_lock(&dst_inode->i_lock); - nfs_set_cache_invalid( - dst_inode, NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED | - NFS_INO_INVALID_SIZE | NFS_INO_INVALID_ATTR | - NFS_INO_INVALID_DATA); - spin_unlock(&dst_inode->i_lock); + nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count); + spin_lock(&src_inode->i_lock); nfs_set_cache_invalid(src_inode, NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED | @@ -1055,8 +1076,10 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); - if (status == 0) + if (status == 0) { + nfs42_copy_dest_done(dst_inode, dst_offset, count); status = nfs_post_op_update_inode(dst_inode, res.dst_fattr); + } kfree(res.dst_fattr); return status; From febfeaaefefb6e3a42e4ca279270a740014ce227 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 14 Apr 2021 10:27:39 -0400 Subject: [PATCH 0411/1379] NFSv42: Don't force attribute revalidation of the copy offload source When a copy offload is performed, we do not expect the source file to change other than perhaps to see the atime be updated. Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 3875120ef3ef..a24349512ffe 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -390,12 +390,7 @@ static ssize_t _nfs42_proc_copy(struct file *src, } nfs42_copy_dest_done(dst_inode, pos_dst, res->write_res.count); - - spin_lock(&src_inode->i_lock); - nfs_set_cache_invalid(src_inode, NFS_INO_REVAL_PAGECACHE | - NFS_INO_REVAL_FORCED | - NFS_INO_INVALID_ATIME); - spin_unlock(&src_inode->i_lock); + nfs_invalidate_atime(src_inode); status = res->write_res.count; out: if (args->sync) From aca8f94e5b69b31d7a6a18476a1011093e2a1c30 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Wed, 31 Mar 2021 09:33:46 +0200 Subject: [PATCH 0412/1379] dt-bindings: remoteproc: stm32-rproc: add new mailbox channel for detach Add the "detach" mailbox item, that allows to define a mailbox to send a IPCC signal to the remote processor on remoteproc detach action. Signed-off-by: Arnaud Pouliquen Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20210331073347.8293-2-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- .../bindings/remoteproc/st,stm32-rproc.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml b/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml index a1171dfba024..64afdcfb613d 100644 --- a/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml +++ b/Documentation/devicetree/bindings/remoteproc/st,stm32-rproc.yaml @@ -65,16 +65,23 @@ properties: Unidirectional channel: - from local to remote, where ACK from the remote means that it is ready for shutdown + - description: | + A channel (d) used by the local proc to notify the remote proc that it + has to stop interprocessor communnication. + Unidirectional channel: + - from local to remote, where ACK from the remote means that communnication + as been stopped on the remote side. minItems: 1 - maxItems: 3 + maxItems: 4 mbox-names: items: - const: vq0 - const: vq1 - const: shutdown + - const: detach minItems: 1 - maxItems: 3 + maxItems: 4 memory-region: description: From edf696f26855788cdff832ac83319e1f2aafcc90 Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Wed, 31 Mar 2021 09:33:47 +0200 Subject: [PATCH 0413/1379] remoteproc: stm32: add capability to detach A mechanism similar to the shutdown mailbox signal is implemented to detach a remote processor. Upon detachment, a signal is sent to the remote firmware, allowing it to perform specific actions such as stopping rpmsg communication. The Cortex-M hold boot is also disabled to allow the remote processor to restart in case of crash. Signed-off-by: Arnaud Pouliquen Reviewed-by: Mathieu Poirier Tested-by: Mathieu Poirier Link: https://lore.kernel.org/r/20210331073347.8293-3-arnaud.pouliquen@foss.st.com Signed-off-by: Bjorn Andersson --- drivers/remoteproc/stm32_rproc.c | 39 ++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/drivers/remoteproc/stm32_rproc.c b/drivers/remoteproc/stm32_rproc.c index 3d45f51de4d0..7353f9e7e7af 100644 --- a/drivers/remoteproc/stm32_rproc.c +++ b/drivers/remoteproc/stm32_rproc.c @@ -28,7 +28,7 @@ #define RELEASE_BOOT 1 #define MBOX_NB_VQ 2 -#define MBOX_NB_MBX 3 +#define MBOX_NB_MBX 4 #define STM32_SMC_RCC 0x82001000 #define STM32_SMC_REG_WRITE 0x1 @@ -38,6 +38,7 @@ #define STM32_MBX_VQ1 "vq1" #define STM32_MBX_VQ1_ID 1 #define STM32_MBX_SHUTDOWN "shutdown" +#define STM32_MBX_DETACH "detach" #define RSC_TBL_SIZE 1024 @@ -336,6 +337,15 @@ static const struct stm32_mbox stm32_rproc_mbox[MBOX_NB_MBX] = { .tx_done = NULL, .tx_tout = 500, /* 500 ms time out */ }, + }, + { + .name = STM32_MBX_DETACH, + .vq_id = -1, + .client = { + .tx_block = true, + .tx_done = NULL, + .tx_tout = 200, /* 200 ms time out to detach should be fair enough */ + }, } }; @@ -461,6 +471,25 @@ static int stm32_rproc_attach(struct rproc *rproc) return stm32_rproc_set_hold_boot(rproc, true); } +static int stm32_rproc_detach(struct rproc *rproc) +{ + struct stm32_rproc *ddata = rproc->priv; + int err, dummy_data, idx; + + /* Inform the remote processor of the detach */ + idx = stm32_rproc_mbox_idx(rproc, STM32_MBX_DETACH); + if (idx >= 0 && ddata->mb[idx].chan) { + /* A dummy data is sent to allow to block on transmit */ + err = mbox_send_message(ddata->mb[idx].chan, + &dummy_data); + if (err < 0) + dev_warn(&rproc->dev, "warning: remote FW detach without ack\n"); + } + + /* Allow remote processor to auto-reboot */ + return stm32_rproc_set_hold_boot(rproc, false); +} + static int stm32_rproc_stop(struct rproc *rproc) { struct stm32_rproc *ddata = rproc->priv; @@ -597,7 +626,12 @@ stm32_rproc_get_loaded_rsc_table(struct rproc *rproc, size_t *table_sz) } done: - /* Assuming the resource table fits in 1kB is fair */ + /* + * Assuming the resource table fits in 1kB is fair. + * Notice for the detach, that this 1 kB memory area has to be reserved in the coprocessor + * firmware for the resource table. On detach, the remoteproc core re-initializes this + * entire area by overwriting it with the initial values stored in rproc->clean_table. + */ *table_sz = RSC_TBL_SIZE; return (struct resource_table *)ddata->rsc_va; } @@ -607,6 +641,7 @@ static const struct rproc_ops st_rproc_ops = { .start = stm32_rproc_start, .stop = stm32_rproc_stop, .attach = stm32_rproc_attach, + .detach = stm32_rproc_detach, .kick = stm32_rproc_kick, .load = rproc_elf_load_segments, .parse_fw = stm32_rproc_parse_fw, From 351461f332db5670056a9c6bce6916027f91072f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Apr 2021 17:53:22 -0400 Subject: [PATCH 0414/1379] svcrdma: Don't leak send_ctxt on Send errors Address a rare send_ctxt leak in the svc_rdma_sendto() error paths. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 056452cabc98..39aec629aaeb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -936,7 +936,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) p = xdr_reserve_space(&sctxt->sc_stream, rpcrdma_fixed_maxsz * sizeof(*p)); if (!p) - goto err0; + goto err1; ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); if (ret < 0) @@ -948,11 +948,11 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg; if (svc_rdma_encode_read_list(sctxt) < 0) - goto err0; + goto err1; if (svc_rdma_encode_write_list(rctxt, sctxt) < 0) - goto err0; + goto err1; if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0) - goto err0; + goto err1; ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); if (ret < 0) From c7731d5e055453191a240d526c9d9e778ae2fce2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Apr 2021 17:55:28 -0400 Subject: [PATCH 0415/1379] svcrdma: Rename goto labels in svc_rdma_sendto() Clean up: Make the goto labels consistent with other similar functions. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 39aec629aaeb..a3b0f5899f03 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -926,21 +926,21 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = -ENOTCONN; if (svc_xprt_is_dead(xprt)) - goto err0; + goto drop_connection; ret = -ENOMEM; sctxt = svc_rdma_send_ctxt_get(rdma); if (!sctxt) - goto err0; + goto drop_connection; p = xdr_reserve_space(&sctxt->sc_stream, rpcrdma_fixed_maxsz * sizeof(*p)); if (!p) - goto err1; + goto put_ctxt; ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); if (ret < 0) - goto err2; + goto reply_chunk; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); @@ -948,15 +948,15 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg; if (svc_rdma_encode_read_list(sctxt) < 0) - goto err1; + goto put_ctxt; if (svc_rdma_encode_write_list(rctxt, sctxt) < 0) - goto err1; + goto put_ctxt; if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0) - goto err1; + goto put_ctxt; ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); if (ret < 0) - goto err1; + goto put_ctxt; /* Prevent svc_xprt_release() from releasing the page backing * rq_res.head[0].iov_base. It's no longer being accessed by @@ -964,16 +964,16 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) rqstp->rq_respages++; return 0; - err2: +reply_chunk: if (ret != -E2BIG && ret != -EINVAL) - goto err1; + goto put_ctxt; svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); return 0; - err1: +put_ctxt: svc_rdma_send_ctxt_put(rdma, sctxt); - err0: +drop_connection: trace_svcrdma_send_err(rqstp, ret); svc_xprt_deferred_close(&rdma->sc_xprt); return -ENOTCONN; From 8727f78855b8d770b192949adbb1425092529e0f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 11 Apr 2021 14:19:08 -0400 Subject: [PATCH 0416/1379] svcrdma: Pass a useful error code to the send_err tracepoint Capture error codes in @ret, which is passed to the send_err tracepoint, so that they can be logged when something goes awry. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index a3b0f5899f03..d6bbafb773e1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -921,6 +921,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; __be32 *rdma_argp = rctxt->rc_recv_buf; struct svc_rdma_send_ctxt *sctxt; + unsigned int rc_size; __be32 *p; int ret; @@ -933,6 +934,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!sctxt) goto drop_connection; + ret = -EMSGSIZE; p = xdr_reserve_space(&sctxt->sc_stream, rpcrdma_fixed_maxsz * sizeof(*p)); if (!p) @@ -941,17 +943,21 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); if (ret < 0) goto reply_chunk; + rc_size = ret; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; *p = pcl_is_empty(&rctxt->rc_reply_pcl) ? rdma_msg : rdma_nomsg; - if (svc_rdma_encode_read_list(sctxt) < 0) + ret = svc_rdma_encode_read_list(sctxt); + if (ret < 0) goto put_ctxt; - if (svc_rdma_encode_write_list(rctxt, sctxt) < 0) + ret = svc_rdma_encode_write_list(rctxt, sctxt); + if (ret < 0) goto put_ctxt; - if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0) + ret = svc_rdma_encode_reply_chunk(rctxt, sctxt, rc_size); + if (ret < 0) goto put_ctxt; ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); From b1160a06e0ea8c59454bc13b8d2a21cf569c0ff5 Mon Sep 17 00:00:00 2001 From: Chen Hui Date: Fri, 9 Apr 2021 15:57:48 +0800 Subject: [PATCH 0417/1379] PCI: altera-msi: Remove redundant dev_err call in altera_msi_probe() There is a error message within devm_ioremap_resource() already, so remove the dev_err() call to avoid redundant error message. Link: https://lore.kernel.org/r/20210409075748.226141-1-clare.chenhui@huawei.com Signed-off-by: Chen Hui Signed-off-by: Lorenzo Pieralisi Reviewed-by: Ley Foon Tan --- drivers/pci/controller/pcie-altera-msi.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/pci/controller/pcie-altera-msi.c b/drivers/pci/controller/pcie-altera-msi.c index 42691dd8ebef..98aa1dccc6e6 100644 --- a/drivers/pci/controller/pcie-altera-msi.c +++ b/drivers/pci/controller/pcie-altera-msi.c @@ -236,10 +236,8 @@ static int altera_msi_probe(struct platform_device *pdev) res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "vector_slave"); msi->vector_base = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(msi->vector_base)) { - dev_err(&pdev->dev, "failed to map vector_slave memory\n"); + if (IS_ERR(msi->vector_base)) return PTR_ERR(msi->vector_base); - } msi->vector_phy = res->start; From 3a306a5b60165a79c963585ac1863a12bba0ae7e Mon Sep 17 00:00:00 2001 From: Guobin Huang Date: Tue, 6 Apr 2021 20:06:37 +0800 Subject: [PATCH 0418/1379] PCI: cpqphp: Use DEFINE_SPINLOCK() for int15_lock Initialize the static int15_lock spinlock with DEFINE_SPINLOCK() rather than explicitly calling spin_lock_init(). Link: https://lore.kernel.org/r/1617710797-48903-1-git-send-email-huangguobin4@huawei.com Reported-by: Hulk Robot Signed-off-by: Guobin Huang Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/cpqphp_nvram.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/pci/hotplug/cpqphp_nvram.c b/drivers/pci/hotplug/cpqphp_nvram.c index 00cd2b43364f..7a65d427ac11 100644 --- a/drivers/pci/hotplug/cpqphp_nvram.c +++ b/drivers/pci/hotplug/cpqphp_nvram.c @@ -80,7 +80,7 @@ static u8 evbuffer[1024]; static void __iomem *compaq_int15_entry_point; /* lock for ordering int15_bios_call() */ -static spinlock_t int15_lock; +static DEFINE_SPINLOCK(int15_lock); /* This is a series of function that deals with @@ -415,9 +415,6 @@ void compaq_nvram_init(void __iomem *rom_start) compaq_int15_entry_point = (rom_start + ROM_INT15_PHY_ADDR - ROM_PHY_ADDR); dbg("int15 entry = %p\n", compaq_int15_entry_point); - - /* initialize our int15 lock */ - spin_lock_init(&int15_lock); } From d44616c6cc3e35eea03ecfe9040edfa2b486a059 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 19 Mar 2021 21:22:57 +0100 Subject: [PATCH 0419/1379] thermal/core: Fix memory leak in the error path Fix the following error: smatch warnings: drivers/thermal/thermal_core.c:1020 __thermal_cooling_device_register() warn: possible memory leak of 'cdev' by freeing the cdev when exiting the function in the error path. Fixes: 584837618100 ("thermal/drivers/core: Use a char pointer for the cooling device name") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210319202257.890848-1-daniel.lezcano@linaro.org --- drivers/thermal/thermal_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index c8d4010940ef..3566fd291399 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1017,6 +1017,7 @@ out_kfree_type: out_ida_remove: ida_simple_remove(&thermal_cdev_ida, cdev->id); out_kfree_cdev: + kfree(cdev); return ERR_PTR(ret); } From 9aa80ab2c0ba67ce3281aee604b543293f71390d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 19 Mar 2021 21:24:23 +0100 Subject: [PATCH 0420/1379] thermal/drivers/devfreq_cooling: Fix wrong return on error path The following error is reported by kbuild: smatch warnings: drivers/thermal/devfreq_cooling.c:433 of_devfreq_cooling_register_power() warn: passing zero to 'ERR_PTR' Fix the error code by the setting the 'err' variable instead of 'cdev'. Fixes: f8d354e821b2 ("thermal/drivers/devfreq_cooling: Use device name instead of auto-numbering") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210319202424.890968-1-daniel.lezcano@linaro.org --- drivers/thermal/devfreq_cooling.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index fb250ac16f50..3a788ac4f525 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -402,7 +402,7 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, if (err < 0) goto free_table; - cdev = ERR_PTR(-ENOMEM); + err = -ENOMEM; name = kasprintf(GFP_KERNEL, "devfreq-%s", dev_name(dev)); if (!name) goto remove_qos_req; From 6cc7b38c0ca3187abd07af849ec179b42337bcf6 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 19 Mar 2021 21:25:22 +0100 Subject: [PATCH 0421/1379] thermal/drivers/cpuidle_cooling: Fix use after error When the function successfully finishes it logs an information about the registration of the cooling device and use its name to build the message. Unfortunately it was freed right before: drivers/thermal/cpuidle_cooling.c:218 __cpuidle_cooling_register() warn: 'name' was already freed. Fix this by freeing after the message happened. Fixes: 6fd1b186d900 ("thermal/drivers/cpuidle_cooling: Use device name instead of auto-numbering") Reported-by: Dan Carpenter Signed-off-by: Daniel Lezcano Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20210319202522.891061-1-daniel.lezcano@linaro.org --- drivers/thermal/cpuidle_cooling.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/cpuidle_cooling.c b/drivers/thermal/cpuidle_cooling.c index f32976163bad..4f41102e8b16 100644 --- a/drivers/thermal/cpuidle_cooling.c +++ b/drivers/thermal/cpuidle_cooling.c @@ -208,18 +208,20 @@ static int __cpuidle_cooling_register(struct device_node *np, cdev = thermal_of_cooling_device_register(np, name, idle_cdev, &cpuidle_cooling_ops); - kfree(name); - if (IS_ERR(cdev)) { ret = PTR_ERR(cdev); - goto out_unregister; + goto out_kfree_name; } pr_debug("%s: Idle injection set with idle duration=%u, latency=%u\n", name, idle_duration_us, latency_us); + kfree(name); + return 0; +out_kfree_name: + kfree(name); out_unregister: idle_inject_unregister(ii_dev); out_kfree: From 957781612e44f9525a8c7ed52086ab4caaa301f6 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 19 Mar 2021 23:08:01 +0100 Subject: [PATCH 0422/1379] dt-bindings: tsens: qcom: Document MDM9607 compatible Add the compatible for MDM9607. Signed-off-by: Konrad Dybcio Signed-off-by: Daniel Lezcano Acked-by: Rob Herring Link: https://lore.kernel.org/r/20210319220802.198215-1-konrad.dybcio@somainline.org --- Documentation/devicetree/bindings/thermal/qcom-tsens.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index 95462e071ab4..8ad9dc139c23 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -22,6 +22,7 @@ properties: - description: v0.1 of TSENS items: - enum: + - qcom,mdm9607-tsens - qcom,msm8916-tsens - qcom,msm8939-tsens - qcom,msm8974-tsens @@ -94,6 +95,7 @@ allOf: compatible: contains: enum: + - qcom,mdm9607-tsens - qcom,msm8916-tsens - qcom,msm8974-tsens - qcom,msm8976-tsens From 4481b39f9390e82c73fb03193b4a5e7e242d22a4 Mon Sep 17 00:00:00 2001 From: Hao Fang Date: Tue, 30 Mar 2021 14:45:33 +0800 Subject: [PATCH 0423/1379] thermal/drivers/hisi: Use the correct HiSilicon copyright s/Hisilicon/HiSilicon/g. It should use capital S, according to https://www.hisilicon.com/en/terms-of-use. Signed-off-by: Hao Fang Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1617086733-2705-1-git-send-email-fanghao11@huawei.com --- drivers/thermal/hisi_thermal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c index ee05950afd2f..d902db9f267f 100644 --- a/drivers/thermal/hisi_thermal.c +++ b/drivers/thermal/hisi_thermal.c @@ -1,7 +1,7 @@ /* - * Hisilicon thermal sensor driver + * HiSilicon thermal sensor driver * - * Copyright (c) 2014-2015 Hisilicon Limited. + * Copyright (c) 2014-2015 HiSilicon Limited. * Copyright (c) 2014-2015 Linaro Limited. * * Xinwei Kong @@ -672,5 +672,5 @@ module_platform_driver(hisi_thermal_driver); MODULE_AUTHOR("Xinwei Kong "); MODULE_AUTHOR("Leo Yan "); -MODULE_DESCRIPTION("Hisilicon thermal driver"); +MODULE_DESCRIPTION("HiSilicon thermal driver"); MODULE_LICENSE("GPL v2"); From 34ab17cc6c2c1ac93d7e5d53bb972df9a968f085 Mon Sep 17 00:00:00 2001 From: brian-sy yang Date: Tue, 29 Dec 2020 13:08:31 +0800 Subject: [PATCH 0424/1379] thermal/drivers/cpufreq_cooling: Fix slab OOB issue Slab OOB issue is scanned by KASAN in cpu_power_to_freq(). If power is limited below the power of OPP0 in EM table, it will cause slab out-of-bound issue with negative array index. Return the lowest frequency if limited power cannot found a suitable OPP in EM table to fix this issue. Backtrace: [] die+0x104/0x5ac [] bug_handler+0x64/0xd0 [] brk_handler+0x160/0x258 [] do_debug_exception+0x248/0x3f0 [] el1_dbg+0x14/0xbc [] __kasan_report+0x1dc/0x1e0 [] kasan_report+0x10/0x20 [] __asan_report_load8_noabort+0x18/0x28 [] cpufreq_power2state+0x180/0x43c [] power_actor_set_power+0x114/0x1d4 [] allocate_power+0xaec/0xde0 [] power_allocator_throttle+0x3ec/0x5a4 [] handle_thermal_trip+0x160/0x294 [] thermal_zone_device_check+0xe4/0x154 [] process_one_work+0x5e4/0xe28 [] worker_thread+0xa4c/0xfac [] kthread+0x33c/0x358 [] ret_from_fork+0xc/0x18 Fixes: 371a3bc79c11b ("thermal/drivers/cpufreq_cooling: Fix wrong frequency converted from power") Signed-off-by: brian-sy yang Signed-off-by: Michael Kao Reviewed-by: Lukasz Luba Cc: stable@vger.kernel.org #v5.7 Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20201229050831.19493-1-michael.kao@mediatek.com --- drivers/thermal/cpufreq_cooling.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index f3d308427665..eeb4e4b76c0b 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -116,7 +116,7 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev, { int i; - for (i = cpufreq_cdev->max_level; i >= 0; i--) { + for (i = cpufreq_cdev->max_level; i > 0; i--) { if (power >= cpufreq_cdev->em->table[i].power) break; } From aa92b3310c55b21153ca1514719ff8d5dfe74bd7 Mon Sep 17 00:00:00 2001 From: David Collins Date: Wed, 29 Jul 2020 09:52:52 -0700 Subject: [PATCH 0425/1379] thermal/drivers/qcom-spmi-temp-alarm: Add support for GEN2 rev 1 PMIC peripherals Add support for TEMP_ALARM GEN2 PMIC peripherals with digital major revision 1. This revision utilizes a different temperature threshold mapping than earlier revisions. Signed-off-by: David Collins Signed-off-by: Guru Das Srinagesh Reviewed-by: Stephen Boyd Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/69c90a004b3f5b7ae282f5ec5ca2920a48f23e02.1596040416.git.gurus@codeaurora.org --- drivers/thermal/qcom/qcom-spmi-temp-alarm.c | 91 ++++++++++++++------- 1 file changed, 61 insertions(+), 30 deletions(-) diff --git a/drivers/thermal/qcom/qcom-spmi-temp-alarm.c b/drivers/thermal/qcom/qcom-spmi-temp-alarm.c index 6dc879fea9c8..7419e196dbb0 100644 --- a/drivers/thermal/qcom/qcom-spmi-temp-alarm.c +++ b/drivers/thermal/qcom/qcom-spmi-temp-alarm.c @@ -17,6 +17,7 @@ #include "../thermal_core.h" +#define QPNP_TM_REG_DIG_MAJOR 0x01 #define QPNP_TM_REG_TYPE 0x04 #define QPNP_TM_REG_SUBTYPE 0x05 #define QPNP_TM_REG_STATUS 0x08 @@ -38,26 +39,30 @@ #define ALARM_CTRL_FORCE_ENABLE BIT(7) -/* - * Trip point values based on threshold control - * 0 = {105 C, 125 C, 145 C} - * 1 = {110 C, 130 C, 150 C} - * 2 = {115 C, 135 C, 155 C} - * 3 = {120 C, 140 C, 160 C} -*/ -#define TEMP_STAGE_STEP 20000 /* Stage step: 20.000 C */ -#define TEMP_STAGE_HYSTERESIS 2000 +#define THRESH_COUNT 4 +#define STAGE_COUNT 3 -#define TEMP_THRESH_MIN 105000 /* Threshold Min: 105 C */ -#define TEMP_THRESH_STEP 5000 /* Threshold step: 5 C */ +/* Over-temperature trip point values in mC */ +static const long temp_map_gen1[THRESH_COUNT][STAGE_COUNT] = { + { 105000, 125000, 145000 }, + { 110000, 130000, 150000 }, + { 115000, 135000, 155000 }, + { 120000, 140000, 160000 }, +}; + +static const long temp_map_gen2_v1[THRESH_COUNT][STAGE_COUNT] = { + { 90000, 110000, 140000 }, + { 95000, 115000, 145000 }, + { 100000, 120000, 150000 }, + { 105000, 125000, 155000 }, +}; + +#define TEMP_THRESH_STEP 5000 /* Threshold step: 5 C */ #define THRESH_MIN 0 #define THRESH_MAX 3 -/* Stage 2 Threshold Min: 125 C */ -#define STAGE2_THRESHOLD_MIN 125000 -/* Stage 2 Threshold Max: 140 C */ -#define STAGE2_THRESHOLD_MAX 140000 +#define TEMP_STAGE_HYSTERESIS 2000 /* Temperature in Milli Celsius reported during stage 0 if no ADC is present */ #define DEFAULT_TEMP 37000 @@ -77,6 +82,7 @@ struct qpnp_tm_chip { bool initialized; struct iio_channel *adc; + const long (*temp_map)[THRESH_COUNT][STAGE_COUNT]; }; /* This array maps from GEN2 alarm state to GEN1 alarm stage */ @@ -100,6 +106,23 @@ static int qpnp_tm_write(struct qpnp_tm_chip *chip, u16 addr, u8 data) return regmap_write(chip->map, chip->base + addr, data); } +/** + * qpnp_tm_decode_temp() - return temperature in mC corresponding to the + * specified over-temperature stage + * @chip: Pointer to the qpnp_tm chip + * @stage: Over-temperature stage + * + * Return: temperature in mC + */ +static long qpnp_tm_decode_temp(struct qpnp_tm_chip *chip, unsigned int stage) +{ + if (!chip->temp_map || chip->thresh >= THRESH_COUNT || stage == 0 || + stage > STAGE_COUNT) + return 0; + + return (*chip->temp_map)[chip->thresh][stage - 1]; +} + /** * qpnp_tm_get_temp_stage() - return over-temperature stage * @chip: Pointer to the qpnp_tm chip @@ -149,14 +172,12 @@ static int qpnp_tm_update_temp_no_adc(struct qpnp_tm_chip *chip) if (stage_new > stage_old) { /* increasing stage, use lower bound */ - chip->temp = (stage_new - 1) * TEMP_STAGE_STEP + - chip->thresh * TEMP_THRESH_STEP + - TEMP_STAGE_HYSTERESIS + TEMP_THRESH_MIN; + chip->temp = qpnp_tm_decode_temp(chip, stage_new) + + TEMP_STAGE_HYSTERESIS; } else if (stage_new < stage_old) { /* decreasing stage, use upper bound */ - chip->temp = stage_new * TEMP_STAGE_STEP + - chip->thresh * TEMP_THRESH_STEP - - TEMP_STAGE_HYSTERESIS + TEMP_THRESH_MIN; + chip->temp = qpnp_tm_decode_temp(chip, stage_new + 1) + - TEMP_STAGE_HYSTERESIS; } chip->stage = stage; @@ -199,26 +220,28 @@ static int qpnp_tm_get_temp(void *data, int *temp) static int qpnp_tm_update_critical_trip_temp(struct qpnp_tm_chip *chip, int temp) { - u8 reg; + long stage2_threshold_min = (*chip->temp_map)[THRESH_MIN][1]; + long stage2_threshold_max = (*chip->temp_map)[THRESH_MAX][1]; bool disable_s2_shutdown = false; + u8 reg; WARN_ON(!mutex_is_locked(&chip->lock)); /* * Default: S2 and S3 shutdown enabled, thresholds at - * 105C/125C/145C, monitoring at 25Hz + * lowest threshold set, monitoring at 25Hz */ reg = SHUTDOWN_CTRL1_RATE_25HZ; if (temp == THERMAL_TEMP_INVALID || - temp < STAGE2_THRESHOLD_MIN) { + temp < stage2_threshold_min) { chip->thresh = THRESH_MIN; goto skip; } - if (temp <= STAGE2_THRESHOLD_MAX) { + if (temp <= stage2_threshold_max) { chip->thresh = THRESH_MAX - - ((STAGE2_THRESHOLD_MAX - temp) / + ((stage2_threshold_max - temp) / TEMP_THRESH_STEP); disable_s2_shutdown = true; } else { @@ -326,9 +349,7 @@ static int qpnp_tm_init(struct qpnp_tm_chip *chip) ? chip->stage : alarm_state_map[chip->stage]; if (stage) - chip->temp = chip->thresh * TEMP_THRESH_STEP + - (stage - 1) * TEMP_STAGE_STEP + - TEMP_THRESH_MIN; + chip->temp = qpnp_tm_decode_temp(chip, stage); crit_temp = qpnp_tm_get_critical_trip_temp(chip); ret = qpnp_tm_update_critical_trip_temp(chip, crit_temp); @@ -350,7 +371,7 @@ static int qpnp_tm_probe(struct platform_device *pdev) { struct qpnp_tm_chip *chip; struct device_node *node; - u8 type, subtype; + u8 type, subtype, dig_major; u32 res; int ret, irq; @@ -400,6 +421,12 @@ static int qpnp_tm_probe(struct platform_device *pdev) return ret; } + ret = qpnp_tm_read(chip, QPNP_TM_REG_DIG_MAJOR, &dig_major); + if (ret < 0) { + dev_err(&pdev->dev, "could not read dig_major\n"); + return ret; + } + if (type != QPNP_TM_TYPE || (subtype != QPNP_TM_SUBTYPE_GEN1 && subtype != QPNP_TM_SUBTYPE_GEN2)) { dev_err(&pdev->dev, "invalid type 0x%02x or subtype 0x%02x\n", @@ -408,6 +435,10 @@ static int qpnp_tm_probe(struct platform_device *pdev) } chip->subtype = subtype; + if (subtype == QPNP_TM_SUBTYPE_GEN2 && dig_major >= 1) + chip->temp_map = &temp_map_gen2_v1; + else + chip->temp_map = &temp_map_gen1; /* * Register the sensor before initializing the hardware to be able to From 5b5f1121d60bca8305951930d7aa2123fb213cb0 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Wed, 17 Feb 2021 11:59:08 +0000 Subject: [PATCH 0426/1379] MAINTAINERS: update thermal CPU cooling section Update maintainers responsible for CPU cooling on Arm side. Signed-off-by: Lukasz Luba Acked-by: Viresh Kumar Acked-by: Javi Merino Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210217115908.22547-1-lukasz.luba@arm.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f919aa861165..6926711e9cf8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17771,7 +17771,7 @@ THERMAL/CPU_COOLING M: Amit Daniel Kachhap M: Daniel Lezcano M: Viresh Kumar -M: Javi Merino +R: Lukasz Luba L: linux-pm@vger.kernel.org S: Supported F: Documentation/driver-api/thermal/cpu-cooling-api.rst From f4136863e8899fa0554343201b78b9e197c78a78 Mon Sep 17 00:00:00 2001 From: Guangqing Zhu Date: Sun, 4 Apr 2021 20:54:31 +0800 Subject: [PATCH 0427/1379] thermal/drivers/tsens: Fix missing put_device error Fixes coccicheck error: drivers/thermal/qcom/tsens.c:759:4-10: ERROR: missing put_device; call of_find_device_by_node on line 715, but without a corresponding object release within this function. Fixes: a7ff82976122 ("drivers: thermal: tsens: Merge tsens-common.c into tsens.c") Signed-off-by: Guangqing Zhu Reviewed-by: Bjorn Andersson Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210404125431.12208-1-zhuguangqing83@gmail.com --- drivers/thermal/qcom/tsens.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index d8ce3a687b80..3c4c0516e58a 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -755,8 +755,10 @@ int __init init_common(struct tsens_priv *priv) for (i = VER_MAJOR; i <= VER_STEP; i++) { priv->rf[i] = devm_regmap_field_alloc(dev, priv->srot_map, priv->fields[i]); - if (IS_ERR(priv->rf[i])) - return PTR_ERR(priv->rf[i]); + if (IS_ERR(priv->rf[i])) { + ret = PTR_ERR(priv->rf[i]); + goto err_put_device; + } } ret = regmap_field_read(priv->rf[VER_MINOR], &ver_minor); if (ret) From c0612265295bc7bbbc7189ab811192fe77be8196 Mon Sep 17 00:00:00 2001 From: Robert Foss Date: Wed, 24 Mar 2021 13:43:08 +0100 Subject: [PATCH 0428/1379] dt-bindings: thermal: qcom-tsens: Add compatible for sm8350 Add tsens bindings for sm8350. Signed-off-by: Robert Foss Reviewed-by: Vinod Koul Acked-by: Rob Herring Reviewed-by: Bjorn Andersson Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210324124308.1265626-1-robert.foss@linaro.org --- Documentation/devicetree/bindings/thermal/qcom-tsens.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index 8ad9dc139c23..fbd03cd3eb9b 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -44,6 +44,7 @@ properties: - qcom,sdm845-tsens - qcom,sm8150-tsens - qcom,sm8250-tsens + - qcom,sm8350-tsens - const: qcom,tsens-v2 reg: From 363f8dd5eecd6c67fe9840ef6065440f0ee7df3a Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 15 Apr 2021 16:38:24 +0800 Subject: [PATCH 0429/1379] nfsd: remove unused function Fix the following clang warning: fs/nfsd/nfs4state.c:6276:1: warning: unused function 'end_offset' [-Wunused-function]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7698172ac0c7..6b8ad1a9ec34 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -6288,15 +6288,6 @@ out: return status; } -static inline u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end: NFS4_MAX_UINT64; -} - /* last octet in a range */ static inline u64 last_byte_offset(u64 start, u64 len) From 70c5307564035c160078401f541c397d77b95415 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 15 Apr 2021 15:00:58 +0300 Subject: [PATCH 0430/1379] nfsd: removed unused argument in nfsd_startup_generic() Since commit 501cb1849f86 ("nfsd: rip out the raparms cache") nrservs is not used in nfsd_startup_generic() Signed-off-by: Vasily Averin Signed-off-by: Chuck Lever --- fs/nfsd/nfssvc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82ba034fa579..dd5d69921676 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -308,7 +308,7 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred) static int nfsd_users = 0; -static int nfsd_startup_generic(int nrservs) +static int nfsd_startup_generic(void) { int ret; @@ -374,7 +374,7 @@ void nfsd_reset_boot_verifier(struct nfsd_net *nn) write_sequnlock(&nn->boot_lock); } -static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred) +static int nfsd_startup_net(struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); int ret; @@ -382,7 +382,7 @@ static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cre if (nn->nfsd_net_up) return 0; - ret = nfsd_startup_generic(nrservs); + ret = nfsd_startup_generic(); if (ret) return ret; ret = nfsd_init_socks(net, cred); @@ -790,7 +790,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) nfsd_up_before = nn->nfsd_net_up; - error = nfsd_startup_net(nrservs, net, cred); + error = nfsd_startup_net(net, cred); if (error) goto out_destroy; error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv, From eaa7a897206ac5bfa7da3f647686209ada1984d0 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:49 +0300 Subject: [PATCH 0431/1379] tracing: Define static void trace_print_time() The part of the code that prints the time of the trace record in "int trace_print_context()" gets extracted in a static function. This is done as a preparation for a following patch, in which we will define a new ftrace event called "func_repeats". The new static method, defined here, will be used by this new event to print the time of the last repeat of a function that is consecutively called number of times. Link: https://lkml.kernel.org/r/20210415181854.147448-2-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a0146e1fffdf..333233d45596 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -587,13 +587,26 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) return !trace_seq_has_overflowed(s); } +static void trace_print_time(struct trace_seq *s, struct trace_iterator *iter, + unsigned long long ts) +{ + unsigned long secs, usec_rem; + unsigned long long t; + + if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { + t = ns2usecs(ts); + usec_rem = do_div(t, USEC_PER_SEC); + secs = (unsigned long)t; + trace_seq_printf(s, " %5lu.%06lu", secs, usec_rem); + } else + trace_seq_printf(s, " %12llu", ts); +} + int trace_print_context(struct trace_iterator *iter) { struct trace_array *tr = iter->tr; struct trace_seq *s = &iter->seq; struct trace_entry *entry = iter->ent; - unsigned long long t; - unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; trace_find_cmdline(entry->pid, comm); @@ -614,13 +627,8 @@ int trace_print_context(struct trace_iterator *iter) if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); - if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) { - t = ns2usecs(iter->ts); - usec_rem = do_div(t, USEC_PER_SEC); - secs = (unsigned long)t; - trace_seq_printf(s, " %5lu.%06lu: ", secs, usec_rem); - } else - trace_seq_printf(s, " %12llu: ", iter->ts); + trace_print_time(s, iter, iter->ts); + trace_seq_puts(s, ": "); return !trace_seq_has_overflowed(s); } From f689e4f280b69cd7341743c2ecacd1b13528a0d8 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:50 +0300 Subject: [PATCH 0432/1379] tracing: Define new ftrace event "func_repeats" The event aims to consolidate the function tracing record in the cases when a single function is called number of times consecutively. while (cond) do_func(); This may happen in various scenarios (busy waiting for example). The new ftrace event can be used to show repeated function events with a single event and save space on the ring buffer Link: https://lkml.kernel.org/r/20210415181854.147448-3-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 +++ kernel/trace/trace_entries.h | 22 +++++++++++++++++ kernel/trace/trace_output.c | 48 ++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5506424eae2a..6a5b4c2a0fa7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -45,6 +45,7 @@ enum trace_type { TRACE_BPUTS, TRACE_HWLAT, TRACE_RAW_DATA, + TRACE_FUNC_REPEATS, __TRACE_LAST_TYPE, }; @@ -442,6 +443,8 @@ extern void __ftrace_bad_type(void); TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ + IF_ASSIGN(var, ent, struct func_repeats_entry, \ + TRACE_FUNC_REPEATS); \ __ftrace_bad_type(); \ } while (0) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4547ac59da61..251c819cf0c5 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -338,3 +338,25 @@ FTRACE_ENTRY(hwlat, hwlat_entry, __entry->nmi_total_ts, __entry->nmi_count) ); + +#define FUNC_REPEATS_GET_DELTA_TS(entry) \ + (((u64)(entry)->top_delta_ts << 32) | (entry)->bottom_delta_ts) \ + +FTRACE_ENTRY(func_repeats, func_repeats_entry, + + TRACE_FUNC_REPEATS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + __field( u16 , count ) + __field( u16 , top_delta_ts ) + __field( u32 , bottom_delta_ts ) + ), + + F_printk(" %ps <-%ps\t(repeats:%u delta: -%llu)", + (void *)__entry->ip, + (void *)__entry->parent_ip, + __entry->count, + FUNC_REPEATS_GET_DELTA_TS(__entry)) +); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 333233d45596..3037f0c88f90 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1381,6 +1381,53 @@ static struct trace_event trace_raw_data_event = { .funcs = &trace_raw_data_funcs, }; +static enum print_line_t +trace_func_repeats_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(s, "%lu %lu %u %llu\n", + field->ip, + field->parent_ip, + field->count, + FUNC_REPEATS_GET_DELTA_TS(field)); + + return trace_handle_return(s); +} + +static enum print_line_t +trace_func_repeats_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct func_repeats_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + seq_print_ip_sym(s, field->ip, flags); + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, field->parent_ip, flags); + trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); + trace_print_time(s, iter, + iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); + trace_seq_puts(s, ")\n"); + + return trace_handle_return(s); +} + +static struct trace_event_functions trace_func_repeats_funcs = { + .trace = trace_func_repeats_print, + .raw = trace_func_repeats_raw, +}; + +static struct trace_event trace_func_repeats_event = { + .type = TRACE_FUNC_REPEATS, + .funcs = &trace_func_repeats_funcs, +}; static struct trace_event *events[] __initdata = { &trace_fn_event, @@ -1393,6 +1440,7 @@ static struct trace_event *events[] __initdata = { &trace_print_event, &trace_hwlat_event, &trace_raw_data_event, + &trace_func_repeats_event, NULL }; From 20344c54d1c7ab7428e312bbe9b0097750875002 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:51 +0300 Subject: [PATCH 0433/1379] tracing: Add "last_func_repeats" to struct trace_array The field is used to keep track of the consecutive (on the same CPU) calls of a single function. This information is needed in order to consolidate the function tracing record in the cases when a single function is called number of times. Link: https://lkml.kernel.org/r/20210415181854.147448-4-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 507a30bf26e4..82833be07c1e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9104,6 +9104,7 @@ static int __remove_instance(struct trace_array *tr) ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove(tr->dir); + free_percpu(tr->last_func_repeats); free_trace_buffers(tr); for (i = 0; i < tr->nr_topts; i++) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6a5b4c2a0fa7..a4f1b66049fd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -262,6 +262,17 @@ struct cond_snapshot { cond_update_fn_t update; }; +/* + * struct trace_func_repeats - used to keep track of the consecutive + * (on the same CPU) calls of a single function. + */ +struct trace_func_repeats { + unsigned long ip; + unsigned long parent_ip; + unsigned long count; + u64 ts_last_call; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -358,6 +369,7 @@ struct trace_array { #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; #endif + struct trace_func_repeats __percpu *last_func_repeats; }; enum { From c658797f1a70561205a224be0c8be64977ed64e8 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:52 +0300 Subject: [PATCH 0434/1379] tracing: Add method for recording "func_repeats" events This patch only provides the implementation of the method. Later we will used it in a combination with a new option for function tracing. Link: https://lkml.kernel.org/r/20210415181854.147448-5-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 34 ++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 4 ++++ 2 files changed, 38 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 82833be07c1e..66a4ad93b5e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3117,6 +3117,40 @@ static void ftrace_trace_userstack(struct trace_array *tr, #endif /* CONFIG_STACKTRACE */ +static inline void +func_repeats_set_delta_ts(struct func_repeats_entry *entry, + unsigned long long delta) +{ + entry->bottom_delta_ts = delta & U32_MAX; + entry->top_delta_ts = (delta >> 32); +} + +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + struct trace_buffer *buffer = tr->array_buffer.buffer; + struct func_repeats_entry *entry; + struct ring_buffer_event *event; + u64 delta; + + event = __trace_buffer_lock_reserve(buffer, TRACE_FUNC_REPEATS, + sizeof(*entry), trace_ctx); + if (!event) + return; + + delta = ring_buffer_event_time_stamp(buffer, event) - + last_info->ts_last_call; + + entry = ring_buffer_event_data(event); + entry->ip = last_info->ip; + entry->parent_ip = last_info->parent_ip; + entry->count = last_info->count; + func_repeats_set_delta_ts(entry, delta); + + __buffer_unlock_commit(buffer, event); +} + /* created for use with alloc_percpu */ struct trace_buffer_struct { int nesting; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a4f1b66049fd..cd80d046c7a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -695,6 +695,10 @@ static inline void __trace_stack(struct trace_array *tr, unsigned int trace_ctx, } #endif /* CONFIG_STACKTRACE */ +void trace_last_func_repeats(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned int trace_ctx); + extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); From 4994891ebbb89b18903637dc1c8f27b42cb8b8b2 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:53 +0300 Subject: [PATCH 0435/1379] tracing: Unify the logic for function tracing options Currently the logic for dealing with the options for function tracing has two different implementations. One is used when we set the flags (in "static int func_set_flag()") and another used when we initialize the tracer (in "static int function_trace_init()"). Those two implementations are meant to do essentially the same thing and they are both not very convenient for adding new options. In this patch we add a helper function that provides a single implementation of the logic for dealing with the options and we make it such that new options can be easily added. Link: https://lkml.kernel.org/r/20210415181854.147448-6-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions.c | 65 ++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index f93723ca66bc..f37f73a9b1b8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -31,9 +31,12 @@ static struct tracer_flags func_flags; /* Our option */ enum { + TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ TRACE_FUNC_OPT_STACK = 0x1, }; +#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_STACK) + int ftrace_allocate_ftrace_ops(struct trace_array *tr) { struct ftrace_ops *ops; @@ -86,6 +89,18 @@ void ftrace_destroy_function_files(struct trace_array *tr) ftrace_free_ftrace_ops(tr); } +static ftrace_func_t select_trace_function(u32 flags_val) +{ + switch (flags_val & TRACE_FUNC_OPT_MASK) { + case TRACE_FUNC_NO_OPTS: + return function_trace_call; + case TRACE_FUNC_OPT_STACK: + return function_stack_trace_call; + default: + return NULL; + } +} + static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; @@ -97,12 +112,9 @@ static int function_trace_init(struct trace_array *tr) if (!tr->ops) return -ENOMEM; - /* Currently only the global instance can do stack tracing */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && - func_flags.val & TRACE_FUNC_OPT_STACK) - func = function_stack_trace_call; - else - func = function_trace_call; + func = select_trace_function(func_flags.val); + if (!func) + return -EINVAL; ftrace_init_array_ops(tr, func); @@ -213,7 +225,7 @@ static struct tracer_opt func_opts[] = { }; static struct tracer_flags func_flags = { - .val = 0, /* By default: all flags disabled */ + .val = TRACE_FUNC_NO_OPTS, /* By default: all flags disabled */ .opts = func_opts }; @@ -235,30 +247,29 @@ static struct tracer function_trace; static int func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { - switch (bit) { - case TRACE_FUNC_OPT_STACK: - /* do nothing if already set */ - if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - break; + ftrace_func_t func; + u32 new_flags; - /* We can change this flag when not running. */ - if (tr->current_trace != &function_trace) - break; + /* Do nothing if already set. */ + if (!!set == !!(func_flags.val & bit)) + return 0; - unregister_ftrace_function(tr->ops); + /* We can change this flag only when not running. */ + if (tr->current_trace != &function_trace) + return 0; - if (set) { - tr->ops->func = function_stack_trace_call; - register_ftrace_function(tr->ops); - } else { - tr->ops->func = function_trace_call; - register_ftrace_function(tr->ops); - } - - break; - default: + new_flags = (func_flags.val & ~bit) | (set ? bit : 0); + func = select_trace_function(new_flags); + if (!func) return -EINVAL; - } + + /* Check if there's anything to change. */ + if (tr->ops->func == func) + return 0; + + unregister_ftrace_function(tr->ops); + tr->ops->func = func; + register_ftrace_function(tr->ops); return 0; } From 22db095d57b51ff71aaa8ddba515180399f54334 Mon Sep 17 00:00:00 2001 From: "Yordan Karadzhov (VMware)" Date: Thu, 15 Apr 2021 21:18:54 +0300 Subject: [PATCH 0436/1379] tracing: Add "func_no_repeats" option for function tracing If the option is activated the function tracing record gets consolidated in the cases when a single function is called number of times consecutively. Instead of having an identical record for each call of the function we will record only the first call following by event showing the number of repeats. Link: https://lkml.kernel.org/r/20210415181854.147448-7-y.karadz@gmail.com Signed-off-by: Yordan Karadzhov (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_functions.c | 162 ++++++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index f37f73a9b1b8..1f0e63f5d1f9 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -27,15 +27,27 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs); static struct tracer_flags func_flags; /* Our option */ enum { - TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ - TRACE_FUNC_OPT_STACK = 0x1, + + TRACE_FUNC_NO_OPTS = 0x0, /* No flags set. */ + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_NO_REPEATS = 0x2, + + /* Update this to next highest bit. */ + TRACE_FUNC_OPT_HIGHEST_BIT = 0x4 }; -#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_STACK) +#define TRACE_FUNC_OPT_MASK (TRACE_FUNC_OPT_HIGHEST_BIT - 1) int ftrace_allocate_ftrace_ops(struct trace_array *tr) { @@ -96,11 +108,27 @@ static ftrace_func_t select_trace_function(u32 flags_val) return function_trace_call; case TRACE_FUNC_OPT_STACK: return function_stack_trace_call; + case TRACE_FUNC_OPT_NO_REPEATS: + return function_no_repeats_trace_call; + case TRACE_FUNC_OPT_STACK | TRACE_FUNC_OPT_NO_REPEATS: + return function_stack_no_repeats_trace_call; default: return NULL; } } +static bool handle_func_repeats(struct trace_array *tr, u32 flags_val) +{ + if (!tr->last_func_repeats && + (flags_val & TRACE_FUNC_OPT_NO_REPEATS)) { + tr->last_func_repeats = alloc_percpu(struct trace_func_repeats); + if (!tr->last_func_repeats) + return false; + } + + return true; +} + static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; @@ -116,6 +144,9 @@ static int function_trace_init(struct trace_array *tr) if (!func) return -EINVAL; + if (!handle_func_repeats(tr, func_flags.val)) + return -ENOMEM; + ftrace_init_array_ops(tr, func); tr->array_buffer.cpu = raw_smp_processor_id(); @@ -217,10 +248,132 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } +static inline bool is_repeat_check(struct trace_array *tr, + struct trace_func_repeats *last_info, + unsigned long ip, unsigned long parent_ip) +{ + if (last_info->ip == ip && + last_info->parent_ip == parent_ip && + last_info->count < U16_MAX) { + last_info->ts_last_call = + ring_buffer_time_stamp(tr->array_buffer.buffer); + last_info->count++; + return true; + } + + return false; +} + +static inline void process_repeats(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + struct trace_func_repeats *last_info, + unsigned int trace_ctx) +{ + if (last_info->count) { + trace_last_func_repeats(tr, last_info, trace_ctx); + last_info->count = 0; + } + + last_info->ip = ip; + last_info->parent_ip = parent_ip; +} + +static void +function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned int trace_ctx; + unsigned long flags; + int bit; + int cpu; + + if (unlikely(!tr->function_enabled)) + return; + + bit = ftrace_test_recursion_trylock(ip, parent_ip); + if (bit < 0) + return; + + preempt_disable_notrace(); + + cpu = smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + if (atomic_read(&data->disabled)) + goto out; + + /* + * An interrupt may happen at any place here. But as far as I can see, + * the only damage that this can cause is to mess up the repetition + * counter without valuable data being lost. + * TODO: think about a solution that is better than just hoping to be + * lucky. + */ + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + local_save_flags(flags); + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + +out: + ftrace_test_recursion_unlock(bit); + preempt_enable_notrace(); +} + +static void +function_stack_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, + struct ftrace_regs *fregs) +{ + struct trace_func_repeats *last_info; + struct trace_array *tr = op->private; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + unsigned int trace_ctx; + + if (unlikely(!tr->function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = per_cpu_ptr(tr->array_buffer.data, cpu); + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + last_info = per_cpu_ptr(tr->last_func_repeats, cpu); + if (is_repeat_check(tr, last_info, ip, parent_ip)) + goto out; + + trace_ctx = tracing_gen_ctx_flags(flags); + process_repeats(tr, ip, parent_ip, last_info, trace_ctx); + + trace_function(tr, ip, parent_ip, trace_ctx); + __trace_stack(tr, trace_ctx, STACK_SKIP); + } + + out: + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, #endif + { TRACER_OPT(func-no-repeats, TRACE_FUNC_OPT_NO_REPEATS) }, { } /* Always set a last empty entry */ }; @@ -267,6 +420,9 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (tr->ops->func == func) return 0; + if (!handle_func_repeats(tr, new_flags)) + return -ENOMEM; + unregister_ftrace_function(tr->ops); tr->ops->func = func; register_ftrace_function(tr->ops); From 3e903315790baf4a966436e7f32e9c97864570ac Mon Sep 17 00:00:00 2001 From: Guochun Mao Date: Tue, 16 Mar 2021 16:52:14 +0800 Subject: [PATCH 0437/1379] ubifs: Only check replay with inode type to judge if inode linked Conside the following case, it just write a big file into flash, when complete writing, delete the file, and then power off promptly. Next time power on, we'll get a replay list like: ... LEB 1105:211344 len 4144 deletion 0 sqnum 428783 key type 1 inode 80 LEB 15:233544 len 160 deletion 1 sqnum 428785 key type 0 inode 80 LEB 1105:215488 len 4144 deletion 0 sqnum 428787 key type 1 inode 80 ... In the replay list, data nodes' deletion are 0, and the inode node's deletion is 1. In current logic, the file's dentry will be removed, but inode and the flash space it occupied will be reserved. User will see that much free space been disappeared. We only need to check the deletion value of the following inode type node of the replay entry. Fixes: e58725d51fa8 ("ubifs: Handle re-linking of inodes correctly while recovery") Cc: stable@vger.kernel.org Signed-off-by: Guochun Mao Signed-off-by: Richard Weinberger --- fs/ubifs/replay.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 0f8a6a16421b..1929ec63a0cb 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -223,7 +223,8 @@ static bool inode_still_linked(struct ubifs_info *c, struct replay_entry *rino) */ list_for_each_entry_reverse(r, &c->replay_list, list) { ubifs_assert(c, r->sqnum >= rino->sqnum); - if (key_inum(c, &r->key) == key_inum(c, &rino->key)) + if (key_inum(c, &r->key) == key_inum(c, &rino->key) && + key_type(c, &r->key) == UBIFS_INO_KEY) return r->deletion == 0; } From ba4884a6dbf002401081a8eb0ba85e5dc87025e1 Mon Sep 17 00:00:00 2001 From: Rui Salvaterra Date: Mon, 5 Apr 2021 16:29:35 +0100 Subject: [PATCH 0438/1379] ubifs: Default to zstd compression Compared to lzo and zlib, zstd is the best all-around performer, both in terms of speed and compression ratio. Set it as the default, if available. Signed-off-by: Rui Salvaterra Signed-off-by: Richard Weinberger --- fs/ubifs/sb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index c160f718c288..e7693b94e5b5 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -53,6 +53,9 @@ static int get_default_compressor(struct ubifs_info *c) { + if (ubifs_compr_present(c, UBIFS_COMPR_ZSTD)) + return UBIFS_COMPR_ZSTD; + if (ubifs_compr_present(c, UBIFS_COMPR_LZO)) return UBIFS_COMPR_LZO; From af61e7bf927855e9647393f6c5ac4e411ac2041e Mon Sep 17 00:00:00 2001 From: Steffen Trumtrar Date: Tue, 16 Feb 2021 08:23:34 +0100 Subject: [PATCH 0439/1379] ubifs: Set s_uuid in super block to support ima/evm uuid options This is required to provide uuid based integrity functionality for: ima_policy (fsuuid option) and the 'evmctl' command ('--uuid' option). Co-developed-by: Oleksij Rempel Co-developed-by: Juergen Borleis Signed-off-by: Steffen Trumtrar Reviewed-by: Andy Shevchenko Signed-off-by: Richard Weinberger --- fs/ubifs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ddb2ca636c93..73f0ac209fb8 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2232,6 +2232,8 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out_umount; } + import_uuid(&sb->s_uuid, c->uuid); + mutex_unlock(&c->umount_mutex); return 0; From 829ad58a04e28e1979cc8b9ac7d2db69cc44dc80 Mon Sep 17 00:00:00 2001 From: Martin Devera Date: Wed, 3 Mar 2021 10:05:19 +0100 Subject: [PATCH 0440/1379] ubifs: Report max LEB count at mount time There is no other way to directly report/query this quantity. It is useful when planing how given filesystem can be resized. Signed-off-by: Martin Devera Signed-off-by: Richard Weinberger --- fs/ubifs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 73f0ac209fb8..7b572e1414ba 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1552,8 +1552,8 @@ static int mount_ubifs(struct ubifs_info *c) ubifs_msg(c, "LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", c->leb_size, c->leb_size >> 10, c->min_io_size, c->max_write_size); - ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)", - x, x >> 20, c->main_lebs, + ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), max %d LEBs, journal size %lld bytes (%lld MiB, %d LEBs)", + x, x >> 20, c->main_lebs, c->max_leb_cnt, y, y >> 20, c->log_lebs + c->max_bud_cnt); ubifs_msg(c, "reserved for root: %llu bytes (%llu KiB)", c->report_rp_size, c->report_rp_size >> 10); From 8aa058d79b6d491778836626ba94a5165589acaf Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 5 Mar 2021 02:28:32 -0600 Subject: [PATCH 0441/1379] ubi: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a warning by explicitly adding a break statement instead of letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/build.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index f399edc82191..a7e3eb9befb6 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -1350,6 +1350,7 @@ static int bytes_str_to_int(const char *str) fallthrough; case 'K': result *= 1024; + break; case '\0': break; default: From 960b9a8a7676b9054d8b46a2c7db52a0c8766b56 Mon Sep 17 00:00:00 2001 From: lizhe Date: Thu, 18 Mar 2021 11:06:57 +0800 Subject: [PATCH 0442/1379] jffs2: Fix kasan slab-out-of-bounds problem KASAN report a slab-out-of-bounds problem. The logs are listed below. It is because in function jffs2_scan_dirent_node, we alloc "checkedlen+1" bytes for fd->name and we check crc with length rd->nsize. If checkedlen is less than rd->nsize, it will cause the slab-out-of-bounds problem. jffs2: Dirent at *** has zeroes in name. Truncating to %d char ================================================================== BUG: KASAN: slab-out-of-bounds in crc32_le+0x1ce/0x260 at addr ffff8800842cf2d1 Read of size 1 by task test_JFFS2/915 ============================================================================= BUG kmalloc-64 (Tainted: G B O ): kasan: bad access detected ----------------------------------------------------------------------------- INFO: Allocated in jffs2_alloc_full_dirent+0x2a/0x40 age=0 cpu=1 pid=915 ___slab_alloc+0x580/0x5f0 __slab_alloc.isra.24+0x4e/0x64 __kmalloc+0x170/0x300 jffs2_alloc_full_dirent+0x2a/0x40 jffs2_scan_eraseblock+0x1ca4/0x3b64 jffs2_scan_medium+0x285/0xfe0 jffs2_do_mount_fs+0x5fb/0x1bbc jffs2_do_fill_super+0x245/0x6f0 jffs2_fill_super+0x287/0x2e0 mount_mtd_aux.isra.0+0x9a/0x144 mount_mtd+0x222/0x2f0 jffs2_mount+0x41/0x60 mount_fs+0x63/0x230 vfs_kern_mount.part.6+0x6c/0x1f4 do_mount+0xae8/0x1940 SyS_mount+0x105/0x1d0 INFO: Freed in jffs2_free_full_dirent+0x22/0x40 age=27 cpu=1 pid=915 __slab_free+0x372/0x4e4 kfree+0x1d4/0x20c jffs2_free_full_dirent+0x22/0x40 jffs2_build_remove_unlinked_inode+0x17a/0x1e4 jffs2_do_mount_fs+0x1646/0x1bbc jffs2_do_fill_super+0x245/0x6f0 jffs2_fill_super+0x287/0x2e0 mount_mtd_aux.isra.0+0x9a/0x144 mount_mtd+0x222/0x2f0 jffs2_mount+0x41/0x60 mount_fs+0x63/0x230 vfs_kern_mount.part.6+0x6c/0x1f4 do_mount+0xae8/0x1940 SyS_mount+0x105/0x1d0 entry_SYSCALL_64_fastpath+0x1e/0x97 Call Trace: [] dump_stack+0x59/0x7e [] print_trailer+0x125/0x1b0 [] object_err+0x34/0x40 [] kasan_report.part.1+0x21f/0x534 [] ? vprintk+0x2d/0x40 [] ? crc32_le+0x1ce/0x260 [] kasan_report+0x26/0x30 [] __asan_load1+0x3d/0x50 [] crc32_le+0x1ce/0x260 [] ? jffs2_alloc_full_dirent+0x2a/0x40 [] jffs2_scan_eraseblock+0x1d0c/0x3b64 [] ? jffs2_scan_medium+0xccf/0xfe0 [] ? jffs2_scan_make_ino_cache+0x14c/0x14c [] ? kasan_unpoison_shadow+0x35/0x50 [] ? kasan_unpoison_shadow+0x35/0x50 [] ? kasan_kmalloc+0x5e/0x70 [] ? kmem_cache_alloc_trace+0x10c/0x2cc [] ? mtd_point+0xf7/0x130 [] jffs2_scan_medium+0x285/0xfe0 [] ? jffs2_scan_eraseblock+0x3b64/0x3b64 [] ? kasan_unpoison_shadow+0x35/0x50 [] ? kasan_unpoison_shadow+0x35/0x50 [] ? kasan_kmalloc+0x5e/0x70 [] ? __kmalloc+0x12b/0x300 [] ? kasan_kmalloc+0x5e/0x70 [] ? jffs2_sum_init+0x9f/0x240 [] jffs2_do_mount_fs+0x5fb/0x1bbc [] ? jffs2_del_noinode_dirent+0x640/0x640 [] ? kasan_kmalloc+0x5e/0x70 [] ? __init_rwsem+0x97/0xac [] jffs2_do_fill_super+0x245/0x6f0 [] jffs2_fill_super+0x287/0x2e0 [] ? jffs2_parse_options+0x594/0x594 [] mount_mtd_aux.isra.0+0x9a/0x144 [] mount_mtd+0x222/0x2f0 [] ? jffs2_parse_options+0x594/0x594 [] ? mount_mtd_aux.isra.0+0x144/0x144 [] ? free_pages+0x13/0x1c [] ? selinux_sb_copy_data+0x278/0x2e0 [] jffs2_mount+0x41/0x60 [] mount_fs+0x63/0x230 [] ? alloc_vfsmnt+0x32f/0x3b0 [] vfs_kern_mount.part.6+0x6c/0x1f4 [] do_mount+0xae8/0x1940 [] ? audit_filter_rules.constprop.6+0x1d10/0x1d10 [] ? copy_mount_string+0x40/0x40 [] ? alloc_pages_current+0xa4/0x1bc [] ? __get_free_pages+0x25/0x50 [] ? copy_mount_options.part.17+0x183/0x264 [] SyS_mount+0x105/0x1d0 [] ? copy_mnt_ns+0x560/0x560 [] ? msa_space_switch_handler+0x13d/0x190 [] entry_SYSCALL_64_fastpath+0x1e/0x97 [] ? msa_space_switch+0xb0/0xe0 Memory state around the buggy address: ffff8800842cf180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8800842cf200: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff8800842cf280: fc fc fc fc fc fc 00 00 00 00 01 fc fc fc fc fc ^ ffff8800842cf300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8800842cf380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Cc: stable@vger.kernel.org Reported-by: Kunkun Xu Signed-off-by: lizhe Signed-off-by: Richard Weinberger --- fs/jffs2/scan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index db72a9d2d0af..b676056826be 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -1079,7 +1079,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo memcpy(&fd->name, rd->name, checkedlen); fd->name[checkedlen] = 0; - crc = crc32(0, fd->name, rd->nsize); + crc = crc32(0, fd->name, checkedlen); if (crc != je32_to_cpu(rd->name_crc)) { pr_notice("%s(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", __func__, ofs, je32_to_cpu(rd->name_crc), crc); From 81af4b7b53d3f2931db907c90822d0b89f0166b7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 22 Mar 2021 12:34:29 +0100 Subject: [PATCH 0443/1379] jffs2: avoid Wempty-body warnings Building with W=1 shows a few warnings for empty macros: fs/jffs2/scan.c: In function 'jffs2_scan_xattr_node': fs/jffs2/scan.c:378:66: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 378 | jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset); | ^ fs/jffs2/scan.c: In function 'jffs2_scan_xref_node': fs/jffs2/scan.c:434:65: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 434 | jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset); | ^ fs/jffs2/scan.c: In function 'jffs2_scan_eraseblock': fs/jffs2/scan.c:893:88: error: suggest braces around empty body in an 'if' statement [-Werror=empty-body] 893 | jffs2_sum_add_padding_mem(s, je32_to_cpu(node->totlen)); | ^ Change all these macros to 'do { } while (0)' statements to avoid the warnings and make the code a little more robust. Signed-off-by: Arnd Bergmann Signed-off-by: Richard Weinberger --- fs/jffs2/summary.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h index e4131cb1f1d4..36d9a1280770 100644 --- a/fs/jffs2/summary.h +++ b/fs/jffs2/summary.h @@ -194,18 +194,18 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb #define jffs2_sum_active() (0) #define jffs2_sum_init(a) (0) -#define jffs2_sum_exit(a) +#define jffs2_sum_exit(a) do { } while (0) #define jffs2_sum_disable_collecting(a) #define jffs2_sum_is_disabled(a) (0) -#define jffs2_sum_reset_collected(a) +#define jffs2_sum_reset_collected(a) do { } while (0) #define jffs2_sum_add_kvec(a,b,c,d) (0) -#define jffs2_sum_move_collected(a,b) +#define jffs2_sum_move_collected(a,b) do { } while (0) #define jffs2_sum_write_sumnode(a) (0) -#define jffs2_sum_add_padding_mem(a,b) -#define jffs2_sum_add_inode_mem(a,b,c) -#define jffs2_sum_add_dirent_mem(a,b,c) -#define jffs2_sum_add_xattr_mem(a,b,c) -#define jffs2_sum_add_xref_mem(a,b,c) +#define jffs2_sum_add_padding_mem(a,b) do { } while (0) +#define jffs2_sum_add_inode_mem(a,b,c) do { } while (0) +#define jffs2_sum_add_dirent_mem(a,b,c) do { } while (0) +#define jffs2_sum_add_xattr_mem(a,b,c) do { } while (0) +#define jffs2_sum_add_xref_mem(a,b,c) do { } while (0) #define jffs2_sum_scan_sumnode(a,b,c,d,e) (0) #endif /* CONFIG_JFFS2_SUMMARY */ From 42984af09afc414d540fcc8247f42894b0378a91 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 31 Mar 2021 00:15:37 +1030 Subject: [PATCH 0444/1379] jffs2: Hook up splice_write callback overlayfs using jffs2 as the upper filesystem would fail in some cases since moving to v5.10. The test case used was to run 'touch' on a file that exists in the lower fs, causing the modification time to be updated. It returns EINVAL when the bug is triggered. A bisection showed this was introduced in v5.9-rc1, with commit 36e2c7421f02 ("fs: don't allow splice read/write without explicit ops"). Reverting that commit restores the expected behaviour. Some digging showed that this was due to jffs2 lacking an implementation of splice_write. (For unknown reasons the warn_unsupported that should trigger was not displaying any output). Adding this patch resolved the issue and the test now passes. Cc: stable@vger.kernel.org Fixes: 36e2c7421f02 ("fs: don't allow splice read/write without explicit ops") Signed-off-by: Joel Stanley Reviewed-by: Christoph Hellwig Tested-by: Lei YU Signed-off-by: Richard Weinberger --- fs/jffs2/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index f8fb89b10227..4fc8cd698d1a 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -57,6 +57,7 @@ const struct file_operations jffs2_file_operations = .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, }; /* jffs2_file_inode_operations */ From 9a29f7f020e06f14eb126bcb84a7f0d166415824 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 6 Apr 2021 15:16:18 +0800 Subject: [PATCH 0445/1379] ubi: Remove unnecessary struct declaration struct ubi_wl_entry is defined at 178th line. The declaration here is unnecessary. Remove it. Reviewed-by: Tudor Ambarus Signed-off-by: Wan Jiabing Signed-off-by: Richard Weinberger --- drivers/mtd/ubi/ubi.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h index c2da77163f94..7c083ad58274 100644 --- a/drivers/mtd/ubi/ubi.h +++ b/drivers/mtd/ubi/ubi.h @@ -388,8 +388,6 @@ struct ubi_volume_desc { int mode; }; -struct ubi_wl_entry; - /** * struct ubi_debug_info - debugging information for an UBI device. * From e1db6338d6fa0d409e45cf20ab5aeaca704f68e7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 15 Apr 2021 16:34:26 -0400 Subject: [PATCH 0446/1379] ftrace: Reuse the output of the function tracer for func_repeats The func_repeats event shows the output of the function tracer followed by a count of the number of repeats the previous function had made, as well as the timestamp of the last function that was repeated. The printing of the function should be the same as is for the function it is displaying. Reuse the code in trace_fn_trace() by making a helper function print_fn_trace() and use it for trace_func_repeats_print(). Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3037f0c88f90..d0368a569bfa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -845,6 +845,17 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, return trace_handle_return(&iter->seq); } +static void print_fn_trace(struct trace_seq *s, unsigned long ip, + unsigned long parent_ip, int flags) +{ + seq_print_ip_sym(s, ip, flags); + + if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) { + trace_seq_puts(s, " <-"); + seq_print_ip_sym(s, parent_ip, flags); + } +} + /* TRACE_FN */ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -854,13 +865,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - seq_print_ip_sym(s, field->ip, flags); - - if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { - trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, field->parent_ip, flags); - } - + print_fn_trace(s, field->ip, field->parent_ip, flags); trace_seq_putc(s, '\n'); return trace_handle_return(s); @@ -1408,9 +1413,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - seq_print_ip_sym(s, field->ip, flags); - trace_seq_puts(s, " <-"); - seq_print_ip_sym(s, field->parent_ip, flags); + print_fn_trace(s, field->ip, field->parent_ip, flags); trace_seq_printf(s, " (repeats: %u, last_ts:", field->count); trace_print_time(s, iter, iter->ts - FUNC_REPEATS_GET_DELTA_TS(field)); From dc01a3b9db43abf95b801c9694980777a329e303 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 23 Feb 2021 10:44:08 +0100 Subject: [PATCH 0447/1379] um: Fix tag order in stub_32.h "static void inline" is the wrong way around, fix that. Reported-by: kernel test robot Fixes: 9f0b4807a44f ("um: rework userspace stubs to not hard-code stub location") Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/x86/um/shared/sysdep/stub_32.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index c3891c1ada26..b95db9daf0e8 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -77,7 +77,7 @@ static inline void trap_myself(void) __asm("int3"); } -static void inline remap_stack_and_trap(void) +static inline void remap_stack_and_trap(void) { __asm__ volatile ( "movl %%esp,%%ebx ;" From d5027ca63e0e778b641cf23e3f5c6d6212cf412b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Mar 2021 21:43:15 +0100 Subject: [PATCH 0448/1379] um: Mark all kernel symbols as local Ritesh reported a bug [1] against UML, noting that it crashed on startup. The backtrace shows the following (heavily redacted): (gdb) bt ... #26 0x0000000060015b5d in sem_init () at ipc/sem.c:268 #27 0x00007f89906d92f7 in ?? () from /lib/x86_64-linux-gnu/libcom_err.so.2 #28 0x00007f8990ab8fb2 in call_init (...) at dl-init.c:72 ... #40 0x00007f89909bf3a6 in nss_load_library (...) at nsswitch.c:359 ... #44 0x00007f8990895e35 in _nss_compat_getgrnam_r (...) at nss_compat/compat-grp.c:486 #45 0x00007f8990968b85 in __getgrnam_r [...] #46 0x00007f89909d6b77 in grantpt [...] #47 0x00007f8990a9394e in __GI_openpty [...] #48 0x00000000604a1f65 in openpty_cb (...) at arch/um/os-Linux/sigio.c:407 #49 0x00000000604a58d0 in start_idle_thread (...) at arch/um/os-Linux/skas/process.c:598 #50 0x0000000060004a3d in start_uml () at arch/um/kernel/skas/process.c:45 #51 0x00000000600047b2 in linux_main (...) at arch/um/kernel/um_arch.c:334 #52 0x000000006000574f in main (...) at arch/um/os-Linux/main.c:144 indicating that the UML function openpty_cb() calls openpty(), which internally calls __getgrnam_r(), which causes the nsswitch machinery to get started. This loads, through lots of indirection that I snipped, the libcom_err.so.2 library, which (in an unknown function, "??") calls sem_init(). Now, of course it wants to get libpthread's sem_init(), since it's linked against libpthread. However, the dynamic linker looks up that symbol against the binary first, and gets the kernel's sem_init(). Hajime Tazaki noted that "objcopy -L" can localize a symbol, so the dynamic linker wouldn't do the lookup this way. I tried, but for some reason that didn't seem to work. Doing the same thing in the linker script instead does seem to work, though I cannot entirely explain - it *also* works if I just add "VERSION { { global: *; }; }" instead, indicating that something else is happening that I don't really understand. It may be that explicitly doing that marks them with some kind of empty version, and that's different from the default. Explicitly marking them with a version breaks kallsyms, so that doesn't seem to be possible. Marking all the symbols as local seems correct, and does seem to address the issue, so do that. Also do it for static link, nsswitch libraries could still be loaded there. [1] https://bugs.debian.org/983379 Reported-by: Ritesh Raj Sarraf Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Tested-By: Ritesh Raj Sarraf Signed-off-by: Richard Weinberger --- arch/um/kernel/dyn.lds.S | 6 ++++++ arch/um/kernel/uml.lds.S | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index dacbfabf66d8..2f2a8ce92f1e 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -6,6 +6,12 @@ OUTPUT_ARCH(ELF_ARCH) ENTRY(_start) jiffies = jiffies_64; +VERSION { + { + local: *; + }; +} + SECTIONS { PROVIDE (__executable_start = START); diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 45d957d7004c..7a8e2b123e29 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -7,6 +7,12 @@ OUTPUT_ARCH(ELF_ARCH) ENTRY(_start) jiffies = jiffies_64; +VERSION { + { + local: *; + }; +} + SECTIONS { /* This must contain the right address - not quite the default ELF one.*/ From ea8e896cc15e8fac586d018733fd67f56cd721cc Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 10 Mar 2021 16:49:08 +0800 Subject: [PATCH 0449/1379] um: Remove unneeded variable 'ret' Fix the following coccicheck warning: ./arch/um/drivers/hostaudio_kern.c:125:10-14: Unneeded variable: "mask". Return "0" on line 131 Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Richard Weinberger --- arch/um/drivers/hostaudio_kern.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c index d35d3f305a31..5b064d360cb7 100644 --- a/arch/um/drivers/hostaudio_kern.c +++ b/arch/um/drivers/hostaudio_kern.c @@ -122,13 +122,11 @@ static ssize_t hostaudio_write(struct file *file, const char __user *buffer, static __poll_t hostaudio_poll(struct file *file, struct poll_table_struct *wait) { - __poll_t mask = 0; - #ifdef DEBUG printk(KERN_DEBUG "hostaudio: poll called (unimplemented)\n"); #endif - return mask; + return 0; } static long hostaudio_ioctl(struct file *file, From ad3d19911632debc886ef4a992d41d6de7927006 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Mar 2021 23:47:31 +0100 Subject: [PATCH 0450/1379] um: Disable CONFIG_GCOV with MODULES CONFIG_GCOV doesn't work with modules, and for various reasons it cannot work, see also https://lore.kernel.org/r/d36ea54d8c0a8dd706826ba844a6f27691f45d55.camel@sipsolutions.net Make CONFIG_GCOV depend on !MODULES to avoid anyone running into issues there. This also means we need not export the gcov symbols. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/Kconfig.debug | 1 + arch/um/kernel/Makefile | 1 - arch/um/kernel/gmon_syms.c | 16 ---------------- 3 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 arch/um/kernel/gmon_syms.c diff --git a/arch/um/Kconfig.debug b/arch/um/Kconfig.debug index 315d368e63ad..1dfb2959c73b 100644 --- a/arch/um/Kconfig.debug +++ b/arch/um/Kconfig.debug @@ -17,6 +17,7 @@ config GCOV bool "Enable gcov support" depends on DEBUG_INFO depends on !KCOV + depends on !MODULES help This option allows developers to retrieve coverage data from a UML session. diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 5aa882011e04..e698e0c7dbdc 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -21,7 +21,6 @@ obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o obj-$(CONFIG_GPROF) += gprof_syms.o -obj-$(CONFIG_GCOV) += gmon_syms.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/arch/um/kernel/gmon_syms.c b/arch/um/kernel/gmon_syms.c deleted file mode 100644 index 9361a8eb9bf1..000000000000 --- a/arch/um/kernel/gmon_syms.c +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include - -extern void __bb_init_func(void *) __attribute__((weak)); -EXPORT_SYMBOL(__bb_init_func); - -extern void __gcov_init(void *) __attribute__((weak)); -EXPORT_SYMBOL(__gcov_init); -extern void __gcov_merge_add(void *, unsigned int) __attribute__((weak)); -EXPORT_SYMBOL(__gcov_merge_add); -extern void __gcov_exit(void) __attribute__((weak)); -EXPORT_SYMBOL(__gcov_exit); From 24271ffed750bc84ea8f73bbab30c59d53f56171 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 17 Mar 2021 15:11:36 +0800 Subject: [PATCH 0451/1379] um: Replace if (cond) BUG() with BUG_ON() Fix the following coccinelle reports: ./arch/um/kernel/mem.c:77:3-6: WARNING: Use BUG_ON instead of if condition followed by BUG. Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Richard Weinberger --- arch/um/kernel/mem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 9242dc91d751..22e28bfb22cc 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -73,8 +73,7 @@ static void __init one_page_table_init(pmd_t *pmd) set_pmd(pmd, __pmd(_KERNPG_TABLE + (unsigned long) __pa(pte))); - if (pte != pte_offset_kernel(pmd, 0)) - BUG(); + BUG_ON(pte != pte_offset_kernel(pmd, 0)); } } From a730af6e3114d549555f4b130c216bad1c3aa80e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 4 Apr 2021 11:20:44 -0700 Subject: [PATCH 0452/1379] um: Add 2 missing libs to fix various build errors Fix many build errors (at least 18 build error reports) for uml on i386 by adding 2 more library object files. All missing symbols are either cmpxchg8b_emu or atomic*386. Here are a few examples of the build errors that are eliminated: /usr/bin/ld: core.c:(.text+0xd83): undefined reference to `cmpxchg8b_emu' /usr/bin/ld: core.c:(.text+0x2bb2): undefined reference to `atomic64_add_386' /usr/bin/ld: core.c:(.text+0x2c5d): undefined reference to `atomic64_xchg_386' syscall.c:(.text+0x2f49): undefined reference to `atomic64_set_386' /usr/bin/ld: syscall.c:(.text+0x2f54): undefined reference to `atomic64_set_386' syscall.c:(.text+0x33a4): undefined reference to `atomic64_inc_386' /usr/bin/ld: syscall.c:(.text+0x33ac): undefined reference to `atomic64_inc_386' /usr/bin/ld: net/ipv4/inet_timewait_sock.o: in function `inet_twsk_alloc': inet_timewait_sock.c:(.text+0x3d1): undefined reference to `atomic64_read_386' /usr/bin/ld: inet_timewait_sock.c:(.text+0x3dd): undefined reference to `atomic64_set_386' /usr/bin/ld: net/ipv4/inet_connection_sock.o: in function `inet_csk_clone_lock': inet_connection_sock.c:(.text+0x1d74): undefined reference to `atomic64_read_386' /usr/bin/ld: inet_connection_sock.c:(.text+0x1d80): undefined reference to `atomic64_set_386' /usr/bin/ld: net/ipv4/tcp_input.o: in function `inet_reqsk_alloc': tcp_input.c:(.text+0xa345): undefined reference to `atomic64_set_386' /usr/bin/ld: net/mac80211/wpa.o: in function `ieee80211_crypto_tkip_encrypt': wpa.c:(.text+0x739): undefined reference to `atomic64_inc_return_386' Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Brendan Jackman Cc: Alexei Starovoitov Cc: kbuild-all@lists.01.org Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: linux-um@lists.infradead.org Cc: Johannes Berg Cc: Johannes Berg Signed-off-by: Richard Weinberger --- arch/x86/um/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 77f70b969d14..5ccb18290d71 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -21,6 +21,7 @@ obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o +subarch-y += ../lib/cmpxchg8b_emu.o ../lib/atomic64_386_32.o subarch-y += ../kernel/sys_ia32.o else From c521db95d4e3f7ba7dd8fd0679ff148d848c1d4b Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 6 Apr 2021 10:00:50 +0800 Subject: [PATCH 0453/1379] um: Remove unused including Fix the following versioncheck warning: ./arch/um/drivers/vector_kern.c: 11 linux/version.h not needed. Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_kern.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 47a02e60898d..d27a2a9faf3e 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -8,7 +8,6 @@ * Copyright (C) 2001 by various other people who didn't put their name here. */ -#include #include #include #include From 6e166319a63448c1ba9e310ddd94c8e3e9ac4e3c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 13 Apr 2021 23:43:48 -0700 Subject: [PATCH 0454/1379] um: pgtable.h: Fix W=1 warning for empty body in 'do' statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the common kernel style to eliminate a warning: ./arch/um/include/asm/pgtable.h:305:47: warning: suggest braces around empty body in ‘do’ statement [-Wempty-body] #define update_mmu_cache(vma,address,ptep) do ; while (0) ^ mm/filemap.c:3212:3: note: in expansion of macro ‘update_mmu_cache’ update_mmu_cache(vma, addr, vmf->pte); ^~~~~~~~~~~~~~~~ Signed-off-by: Randy Dunlap Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: linux-um@lists.infradead.org Signed-off-by: Richard Weinberger --- arch/um/include/asm/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index def376194dce..b9e20bbe2f75 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -302,7 +302,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) struct mm_struct; extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); -#define update_mmu_cache(vma,address,ptep) do ; while (0) +#define update_mmu_cache(vma,address,ptep) do {} while (0) /* Encode and de-code a swap entry */ #define __swp_type(x) (((x).val >> 5) & 0x1f) From 1a594f0afa2b176531e600a6bf8074849fe2a780 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 13 Apr 2021 23:43:49 -0700 Subject: [PATCH 0455/1379] um: elf.h: Fix W=1 warning for empty body in 'do' statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the common kernel style to eliminate a warning: ./arch/x86/um/asm/elf.h:215:32: warning: suggest braces around empty body in ‘do’ statement [-Wempty-body] #define SET_PERSONALITY(ex) do ; while(0) ^ Signed-off-by: Randy Dunlap Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: linux-um@lists.infradead.org Signed-off-by: Richard Weinberger --- arch/x86/um/asm/elf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index c907b20d4993..dcaf3b38a9e0 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -212,6 +212,6 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); extern long elf_aux_hwcap; #define ELF_HWCAP (elf_aux_hwcap) -#define SET_PERSONALITY(ex) do ; while(0) +#define SET_PERSONALITY(ex) do {} while(0) #endif From ed102bf2afed226703eaf85a704755bdbea34583 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 15 Apr 2021 10:13:52 -0700 Subject: [PATCH 0456/1379] um: Fix W=1 missing-include-dirs warnings Currently when using "W=1" with UML builds, there are over 700 warnings like so: CC arch/um/drivers/stderr_console.o cc1: warning: ./arch/um/include/uapi: No such file or directory [-Wmissing-include-dirs] but arch/um/ does not have include/uapi/ at all, so add that subdir and put one Kbuild file into it (since git does not track empty subdirs). Signed-off-by: Randy Dunlap Cc: Masahiro Yamada Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: linux-um@lists.infradead.org Reviewed-by: Masahiro Yamada Signed-off-by: Richard Weinberger --- arch/um/include/uapi/asm/Kbuild | 1 + 1 file changed, 1 insertion(+) create mode 100644 arch/um/include/uapi/asm/Kbuild diff --git a/arch/um/include/uapi/asm/Kbuild b/arch/um/include/uapi/asm/Kbuild new file mode 100644 index 000000000000..f66554cd5c45 --- /dev/null +++ b/arch/um/include/uapi/asm/Kbuild @@ -0,0 +1 @@ +# SPDX-License-Identifier: GPL-2.0 From 42eb0d54c08a0331d6d295420f602237968d792b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Mar 2021 09:22:09 +0100 Subject: [PATCH 0457/1379] fs: split receive_fd_replace from __receive_fd receive_fd_replace shares almost no code with the general case, so split it out. Also remove the "Bump the sock usage counts" comment from both copies, as that is now what __receive_sock actually does. [AV: ... and make the only user of receive_fd_replace() choose between it and receive_fd() according to what userland had passed to it in flags] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/file.c | 39 +++++++++++++++++++-------------------- include/linux/file.h | 11 ++++------- kernel/seccomp.c | 17 ++++++++++++----- 3 files changed, 35 insertions(+), 32 deletions(-) diff --git a/fs/file.c b/fs/file.c index f3a4bac2cbe9..d8ccb95a7f41 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1068,8 +1068,6 @@ out_unlock: /** * __receive_fd() - Install received file into file descriptor table - * - * @fd: fd to install into (if negative, a new fd will be allocated) * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -1083,7 +1081,7 @@ out_unlock: * * Returns newly install fd or -ve on error. */ -int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags) +int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; @@ -1092,32 +1090,33 @@ int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flag if (error) return error; - if (fd < 0) { - new_fd = get_unused_fd_flags(o_flags); - if (new_fd < 0) - return new_fd; - } else { - new_fd = fd; - } + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { - if (fd < 0) - put_unused_fd(new_fd); + put_unused_fd(new_fd); return error; } } - if (fd < 0) { - fd_install(new_fd, get_file(file)); - } else { - error = replace_fd(new_fd, file, o_flags); - if (error) - return error; - } + fd_install(new_fd, get_file(file)); + __receive_sock(file); + return new_fd; +} - /* Bump the sock usage counts, if any. */ +int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) +{ + int error; + + error = security_file_receive(file); + if (error) + return error; + error = replace_fd(new_fd, file, o_flags); + if (error) + return error; __receive_sock(file); return new_fd; } diff --git a/include/linux/file.h b/include/linux/file.h index 225982792fa2..2de2e4613d7b 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -92,23 +92,20 @@ extern void put_unused_fd(unsigned int fd); extern void fd_install(unsigned int fd, struct file *file); -extern int __receive_fd(int fd, struct file *file, int __user *ufd, +extern int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags); static inline int receive_fd_user(struct file *file, int __user *ufd, unsigned int o_flags) { if (ufd == NULL) return -EFAULT; - return __receive_fd(-1, file, ufd, o_flags); + return __receive_fd(file, ufd, o_flags); } static inline int receive_fd(struct file *file, unsigned int o_flags) { - return __receive_fd(-1, file, NULL, o_flags); -} -static inline int receive_fd_replace(int fd, struct file *file, unsigned int o_flags) -{ - return __receive_fd(fd, file, NULL, o_flags); + return __receive_fd(file, NULL, o_flags); } +int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags); extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1d60fc2c9987..4fe19cecaa94 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -119,8 +119,11 @@ struct seccomp_kaddfd { int fd; unsigned int flags; - /* To only be set on reply */ - int ret; + union { + bool setfd; + /* To only be set on reply */ + int ret; + }; struct completion completion; struct list_head list; }; @@ -1069,7 +1072,11 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) * that it has been handled. */ list_del_init(&addfd->list); - addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + if (!addfd->setfd) + addfd->ret = receive_fd(addfd->file, addfd->flags); + else + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, + addfd->flags); complete(&addfd->completion); } @@ -1583,8 +1590,8 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, return -EBADF; kaddfd.flags = addfd.newfd_flags; - kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? - addfd.newfd : -1; + kaddfd.setfd = addfd.flags & SECCOMP_ADDFD_FLAG_SETFD; + kaddfd.fd = addfd.newfd; init_completion(&kaddfd.completion); ret = mutex_lock_interruptible(&filter->notify_lock); From 8926cc8302819be9e67f70409ed001ecb2c924a9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Apr 2021 15:09:41 -0400 Subject: [PATCH 0458/1379] NFSv4.x: Don't return NFS4ERR_NOMATCHING_LAYOUT if we're unmounting If the NFS super block is being unmounted, then we currently may end up telling the server that we've forgotten the layout while it is actually still in use by the client. In that case, just assume that the client will soon return the layout anyway, and so return NFS4ERR_DELAY in response to the layout recall. Fixes: 58ac3e59235f ("NFSv4/pnfs: Clean up nfs_layout_find_inode()") Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f7786e00a6a7..ed9d580826f5 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -137,12 +137,12 @@ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { if (!pnfs_layout_is_valid(lo)) continue; - if (stateid != NULL && - !nfs4_stateid_match_other(stateid, &lo->plh_stateid)) + if (!nfs4_stateid_match_other(stateid, &lo->plh_stateid)) continue; - if (!nfs_sb_active(server->super)) - continue; - inode = igrab(lo->plh_inode); + if (nfs_sb_active(server->super)) + inode = igrab(lo->plh_inode); + else + inode = ERR_PTR(-EAGAIN); rcu_read_unlock(); if (inode) return inode; @@ -176,9 +176,10 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, continue; if (nfsi->layout != lo) continue; - if (!nfs_sb_active(server->super)) - continue; - inode = igrab(lo->plh_inode); + if (nfs_sb_active(server->super)) + inode = igrab(lo->plh_inode); + else + inode = ERR_PTR(-EAGAIN); rcu_read_unlock(); if (inode) return inode; From 39fd01863616964f009599e50ca5c6ea9ebf88d6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Apr 2021 15:41:57 -0400 Subject: [PATCH 0459/1379] NFS: Don't discard pNFS layout segments that are marked for return If the pNFS layout segment is marked with the NFS_LSEG_LAYOUTRETURN flag, then the assumption is that it has some reporting requirement to perform through a layoutreturn (e.g. flexfiles layout stats or error information). Fixes: e0b7d420f72a ("pNFS: Don't discard layout segments that are marked for return") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 102b66e0bdef..33574f47601f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2468,6 +2468,9 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, assert_spin_locked(&lo->plh_inode->i_lock); + if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + tmp_list = &lo->plh_return_segs; + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (pnfs_match_lseg_recall(lseg, return_range, seq)) { dprintk("%s: marking lseg %p iomode %d " @@ -2475,6 +2478,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + tmp_list = &lo->plh_return_segs; if (mark_lseg_invalid(lseg, tmp_list)) continue; remaining++; From a7b6864da7e3fb59c5385bb1c28f3a676dc3da27 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 15 Apr 2021 16:30:22 +0800 Subject: [PATCH 0460/1379] PCI: shpchp: Remove unused shpc_writeb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following clang warning: drivers/pci/hotplug/shpchp_hpc.c:177:20: warning: unused function 'shpc_writeb' [-Wunused-function]. Link: https://lore.kernel.org/r/1618475422-96531-1-git-send-email-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- drivers/pci/hotplug/shpchp_hpc.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/pci/hotplug/shpchp_hpc.c b/drivers/pci/hotplug/shpchp_hpc.c index db047284c291..9e3b27744305 100644 --- a/drivers/pci/hotplug/shpchp_hpc.c +++ b/drivers/pci/hotplug/shpchp_hpc.c @@ -174,11 +174,6 @@ static inline u8 shpc_readb(struct controller *ctrl, int reg) return readb(ctrl->creg + reg); } -static inline void shpc_writeb(struct controller *ctrl, int reg, u8 val) -{ - writeb(val, ctrl->creg + reg); -} - static inline u16 shpc_readw(struct controller *ctrl, int reg) { return readw(ctrl->creg + reg); From 7f100744749e4fe547dece3bb6557fae5f0a7252 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Fri, 16 Apr 2021 19:15:37 +0530 Subject: [PATCH 0461/1379] PCI: tegra: Add Tegra194 MCFG quirks for ECAM errata The PCIe controller in Tegra194 SoC is not ECAM-compliant. With the current hardware design, ECAM can be enabled only for one controller (the C5 controller) with bus numbers starting from 160 instead of 0. A different approach is taken to avoid this abnormal way of enabling ECAM for just one controller but to enable configuration space access for all the other controllers. In this approach, ops are added through MCFG quirk mechanism which access the configuration spaces by dynamically programming iATU (internal AddressTranslation Unit) to generate respective configuration accesses just like the way it is done in DesignWare core sub-system. This issue is specific to Tegra194 and it would be fixed in the future generations of Tegra SoCs. Link: https://lore.kernel.org/r/20210416134537.19474-1-vidyas@nvidia.com Signed-off-by: Vidya Sagar Signed-off-by: Bjorn Helgaas --- drivers/acpi/pci_mcfg.c | 7 ++ drivers/pci/controller/dwc/Makefile | 2 +- drivers/pci/controller/dwc/pcie-tegra194.c | 102 +++++++++++++++++++++ include/linux/pci-ecam.h | 1 + 4 files changed, 111 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c index 95f23acd5b80..53cab975f612 100644 --- a/drivers/acpi/pci_mcfg.c +++ b/drivers/acpi/pci_mcfg.c @@ -116,6 +116,13 @@ static struct mcfg_fixup mcfg_quirks[] = { THUNDER_ECAM_QUIRK(2, 12), THUNDER_ECAM_QUIRK(2, 13), + { "NVIDIA", "TEGRA194", 1, 0, MCFG_BUS_ANY, &tegra194_pcie_ops}, + { "NVIDIA", "TEGRA194", 1, 1, MCFG_BUS_ANY, &tegra194_pcie_ops}, + { "NVIDIA", "TEGRA194", 1, 2, MCFG_BUS_ANY, &tegra194_pcie_ops}, + { "NVIDIA", "TEGRA194", 1, 3, MCFG_BUS_ANY, &tegra194_pcie_ops}, + { "NVIDIA", "TEGRA194", 1, 4, MCFG_BUS_ANY, &tegra194_pcie_ops}, + { "NVIDIA", "TEGRA194", 1, 5, MCFG_BUS_ANY, &tegra194_pcie_ops}, + #define XGENE_V1_ECAM_MCFG(rev, seg) \ {"APM ", "XGENE ", rev, seg, MCFG_BUS_ANY, \ &xgene_v1_pcie_ecam_ops } diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index a751553fa0db..dbb981876556 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -17,7 +17,6 @@ obj-$(CONFIG_PCIE_INTEL_GW) += pcie-intel-gw.o obj-$(CONFIG_PCIE_KIRIN) += pcie-kirin.o obj-$(CONFIG_PCIE_HISI_STB) += pcie-histb.o obj-$(CONFIG_PCI_MESON) += pci-meson.o -obj-$(CONFIG_PCIE_TEGRA194) += pcie-tegra194.o obj-$(CONFIG_PCIE_UNIPHIER) += pcie-uniphier.o obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o @@ -34,4 +33,5 @@ obj-$(CONFIG_PCIE_UNIPHIER_EP) += pcie-uniphier-ep.o ifdef CONFIG_PCI obj-$(CONFIG_ARM64) += pcie-al.o obj-$(CONFIG_ARM64) += pcie-hisi.o +obj-$(CONFIG_ARM64) += pcie-tegra194.o endif diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 6fa216e52d14..a3979d358d06 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -311,6 +313,104 @@ struct tegra_pcie_dw_of_data { enum dw_pcie_device_mode mode; }; +#if defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) +struct tegra194_pcie_ecam { + void __iomem *config_base; + void __iomem *iatu_base; + void __iomem *dbi_base; +}; + +static int tegra194_acpi_init(struct pci_config_window *cfg) +{ + struct device *dev = cfg->parent; + struct tegra194_pcie_ecam *pcie_ecam; + + pcie_ecam = devm_kzalloc(dev, sizeof(*pcie_ecam), GFP_KERNEL); + if (!pcie_ecam) + return -ENOMEM; + + pcie_ecam->config_base = cfg->win; + pcie_ecam->iatu_base = cfg->win + SZ_256K; + pcie_ecam->dbi_base = cfg->win + SZ_512K; + cfg->priv = pcie_ecam; + + return 0; +} + +static void atu_reg_write(struct tegra194_pcie_ecam *pcie_ecam, int index, + u32 val, u32 reg) +{ + u32 offset = PCIE_GET_ATU_OUTB_UNR_REG_OFFSET(index); + + writel(val, pcie_ecam->iatu_base + offset + reg); +} + +static void program_outbound_atu(struct tegra194_pcie_ecam *pcie_ecam, + int index, int type, u64 cpu_addr, + u64 pci_addr, u64 size) +{ + atu_reg_write(pcie_ecam, index, lower_32_bits(cpu_addr), + PCIE_ATU_LOWER_BASE); + atu_reg_write(pcie_ecam, index, upper_32_bits(cpu_addr), + PCIE_ATU_UPPER_BASE); + atu_reg_write(pcie_ecam, index, lower_32_bits(pci_addr), + PCIE_ATU_LOWER_TARGET); + atu_reg_write(pcie_ecam, index, lower_32_bits(cpu_addr + size - 1), + PCIE_ATU_LIMIT); + atu_reg_write(pcie_ecam, index, upper_32_bits(pci_addr), + PCIE_ATU_UPPER_TARGET); + atu_reg_write(pcie_ecam, index, type, PCIE_ATU_CR1); + atu_reg_write(pcie_ecam, index, PCIE_ATU_ENABLE, PCIE_ATU_CR2); +} + +static void __iomem *tegra194_map_bus(struct pci_bus *bus, + unsigned int devfn, int where) +{ + struct pci_config_window *cfg = bus->sysdata; + struct tegra194_pcie_ecam *pcie_ecam = cfg->priv; + u32 busdev; + int type; + + if (bus->number < cfg->busr.start || bus->number > cfg->busr.end) + return NULL; + + if (bus->number == cfg->busr.start) { + if (PCI_SLOT(devfn) == 0) + return pcie_ecam->dbi_base + where; + else + return NULL; + } + + busdev = PCIE_ATU_BUS(bus->number) | PCIE_ATU_DEV(PCI_SLOT(devfn)) | + PCIE_ATU_FUNC(PCI_FUNC(devfn)); + + if (bus->parent->number == cfg->busr.start) { + if (PCI_SLOT(devfn) == 0) + type = PCIE_ATU_TYPE_CFG0; + else + return NULL; + } else { + type = PCIE_ATU_TYPE_CFG1; + } + + program_outbound_atu(pcie_ecam, 0, type, cfg->res.start, busdev, + SZ_256K); + + return pcie_ecam->config_base + where; +} + +const struct pci_ecam_ops tegra194_pcie_ops = { + .init = tegra194_acpi_init, + .pci_ops = { + .map_bus = tegra194_map_bus, + .read = pci_generic_config_read, + .write = pci_generic_config_write, + } +}; +#endif /* defined(CONFIG_ACPI) && defined(CONFIG_PCI_QUIRKS) */ + +#ifdef CONFIG_PCIE_TEGRA194 + static inline struct tegra_pcie_dw *to_tegra_pcie(struct dw_pcie *pci) { return container_of(pci, struct tegra_pcie_dw, pci); @@ -2311,3 +2411,5 @@ MODULE_DEVICE_TABLE(of, tegra_pcie_dw_of_match); MODULE_AUTHOR("Vidya Sagar "); MODULE_DESCRIPTION("NVIDIA PCIe host controller driver"); MODULE_LICENSE("GPL v2"); + +#endif /* CONFIG_PCIE_TEGRA194 */ diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index 65d3d83015c3..fbdadd4d8377 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -85,6 +85,7 @@ extern const struct pci_ecam_ops pci_thunder_ecam_ops; /* Cavium ThunderX 1.x */ extern const struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */ extern const struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */ extern const struct pci_ecam_ops al_pcie_ops; /* Amazon Annapurna Labs PCIe */ +extern const struct pci_ecam_ops tegra194_pcie_ops; /* Tegra194 PCIe */ #endif #if IS_ENABLED(CONFIG_PCI_HOST_COMMON) From 217fd6f625af591e2866bebb8cda778cf85bea2e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Apr 2021 14:00:14 -0400 Subject: [PATCH 0462/1379] nfsd: ensure new clients break delegations If nfsd already has an open file that it plans to use for IO from another, it may not need to do another vfs open, but it still may need to break any delegations in case the existing opens are for another client. Symptoms are that we may incorrectly fail to break a delegation on a write open from a different client, when the delegation-holding client already has a write open. Fixes: 28df3d1539de ("nfsd: clients don't need to break their own delegations") Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6b8ad1a9ec34..403c69efdbea 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4880,6 +4880,11 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (nf) nfsd_file_put(nf); + status = nfserrno(nfsd_open_break_lease(cur_fh->fh_dentry->d_inode, + access)); + if (status) + goto out_put_access; + status = nfsd4_truncate(rqstp, cur_fh, open); if (status) goto out_put_access; @@ -6856,11 +6861,20 @@ out: static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { struct nfsd_file *nf; - __be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); - if (!err) { - err = nfserrno(vfs_test_lock(nf->nf_file, lock)); - nfsd_file_put(nf); - } + __be32 err; + + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); + if (err) + return err; + fh_lock(fhp); /* to block new leases till after test_lock: */ + err = nfserrno(nfsd_open_break_lease(fhp->fh_dentry->d_inode, + NFSD_MAY_READ)); + if (err) + goto out; + err = nfserrno(vfs_test_lock(nf->nf_file, lock)); +out: + fh_unlock(fhp); + nfsd_file_put(nf); return err; } From ea3b50c51d19e2ac00861ac195a3ed5617765255 Mon Sep 17 00:00:00 2001 From: chakravarthikulkarni Date: Mon, 1 Mar 2021 12:51:45 +0530 Subject: [PATCH 0463/1379] PCI: acpiphp: Fix whitespace issue Fix coding style for braces. [bhelgaas: drop comment change] Link: https://lore.kernel.org/r/20210301072145.19018-1-chakravarthikulkarni2021@gmail.com Signed-off-by: chakravarthikulkarni Signed-off-by: Bjorn Helgaas --- drivers/pci/hotplug/acpiphp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h index a74b274a8c45..1f8ab4377ad8 100644 --- a/drivers/pci/hotplug/acpiphp.h +++ b/drivers/pci/hotplug/acpiphp.h @@ -148,8 +148,7 @@ static inline struct acpiphp_root_context *to_acpiphp_root_context(struct acpi_h * ACPI has no generic method of setting/getting attention status * this allows for device specific driver registration */ -struct acpiphp_attention_info -{ +struct acpiphp_attention_info { int (*set_attn)(struct hotplug_slot *slot, u8 status); int (*get_attn)(struct hotplug_slot *slot, u8 *status); struct module *owner; From 3fd00fdc4f11c656a63e6a6280c0bcb63cf109a2 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Tue, 23 Mar 2021 23:14:29 +0100 Subject: [PATCH 0464/1379] rtc: goldfish: remove dependency to OF We want to use the goldfish RTC on a machine without OF. As there is no real dependency on it, remove the OF dependency from the goldfish entry in Kconfig Signed-off-by: Laurent Vivier Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210323221430.3735147-2-laurent@vivier.eu --- drivers/rtc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 9a82cb57586f..993fd399f590 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1899,7 +1899,7 @@ config RTC_DRV_HID_SENSOR_TIME config RTC_DRV_GOLDFISH tristate "Goldfish Real Time Clock" - depends on OF && HAS_IOMEM + depends on HAS_IOMEM help Say yes to enable RTC driver for the Goldfish based virtual platform. From 03531606ef4cda25b629f500d1ffb6173b805c05 Mon Sep 17 00:00:00 2001 From: Francois Gervais Date: Wed, 10 Mar 2021 16:10:26 -0500 Subject: [PATCH 0465/1379] rtc: pcf85063: fallback to parent of_node The rtc device node is always NULL. Since v5.12-rc1-dontuse/3c9ea42802a1fbf7ef29660ff8c6e526c58114f6 this will lead to a NULL pointer dereference. To fix this use the parent node which is the i2c client node as set by devm_rtc_allocate_device(). Using the i2c client node seems to be what other similar drivers do e.g. rtc-pcf8563.c. Signed-off-by: Francois Gervais Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210310211026.27299-1-fgervais@distech-controls.com --- drivers/rtc/rtc-pcf85063.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index aef6c1ee8bb0..82becae14229 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -478,6 +478,7 @@ static struct clk *pcf85063_clkout_register_clk(struct pcf85063 *pcf85063) { struct clk *clk; struct clk_init_data init; + struct device_node *node = pcf85063->rtc->dev.parent->of_node; init.name = "pcf85063-clkout"; init.ops = &pcf85063_clkout_ops; @@ -487,15 +488,13 @@ static struct clk *pcf85063_clkout_register_clk(struct pcf85063 *pcf85063) pcf85063->clkout_hw.init = &init; /* optional override of the clockname */ - of_property_read_string(pcf85063->rtc->dev.of_node, - "clock-output-names", &init.name); + of_property_read_string(node, "clock-output-names", &init.name); /* register the clock */ clk = devm_clk_register(&pcf85063->rtc->dev, &pcf85063->clkout_hw); if (!IS_ERR(clk)) - of_clk_add_provider(pcf85063->rtc->dev.of_node, - of_clk_src_simple_get, clk); + of_clk_add_provider(node, of_clk_src_simple_get, clk); return clk; } From f1d304766c7f5388239d273fc0b72efa62acd9ca Mon Sep 17 00:00:00 2001 From: Liam Beguin Date: Wed, 7 Apr 2021 22:40:26 -0400 Subject: [PATCH 0466/1379] rtc: ab-eoz9: set regmap max_register Set regmap's max_register property to allow users to dump registers using debufgs. Signed-off-by: Liam Beguin Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210408024028.3526564-2-liambeguin@gmail.com --- drivers/rtc/rtc-ab-eoz9.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/rtc-ab-eoz9.c b/drivers/rtc/rtc-ab-eoz9.c index b20d8f26dcdb..dee17a76a72b 100644 --- a/drivers/rtc/rtc-ab-eoz9.c +++ b/drivers/rtc/rtc-ab-eoz9.c @@ -264,6 +264,7 @@ static const struct rtc_class_ops rtc_ops = { static const struct regmap_config abeoz9_rtc_regmap_config = { .reg_bits = 8, .val_bits = 8, + .max_register = 0x3f, }; #if IS_REACHABLE(CONFIG_HWMON) From e70e52e1bf1d6d0ea60e2f8294d5e76a8d8f5370 Mon Sep 17 00:00:00 2001 From: Liam Beguin Date: Wed, 7 Apr 2021 22:40:27 -0400 Subject: [PATCH 0467/1379] rtc: ab-eoz9: add alarm support Add alarm support for the rtc-ab-eoz9. Signed-off-by: Liam Beguin Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210408024028.3526564-3-liambeguin@gmail.com --- drivers/rtc/rtc-ab-eoz9.c | 136 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/drivers/rtc/rtc-ab-eoz9.c b/drivers/rtc/rtc-ab-eoz9.c index dee17a76a72b..7dc96fabc76f 100644 --- a/drivers/rtc/rtc-ab-eoz9.c +++ b/drivers/rtc/rtc-ab-eoz9.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,24 @@ #define ABEOZ9_SEC_LEN 7 +#define ABEOZ9_REG_ALARM_SEC 0x10 +#define ABEOZ9_BIT_ALARM_SEC GENMASK(6, 0) +#define ABEOZ9_REG_ALARM_MIN 0x11 +#define ABEOZ9_BIT_ALARM_MIN GENMASK(6, 0) +#define ABEOZ9_REG_ALARM_HOURS 0x12 +#define ABEOZ9_BIT_ALARM_HOURS_PM BIT(5) +#define ABEOZ9_BIT_ALARM_HOURS GENMASK(4, 0) +#define ABEOZ9_REG_ALARM_DAYS 0x13 +#define ABEOZ9_BIT_ALARM_DAYS GENMASK(5, 0) +#define ABEOZ9_REG_ALARM_WEEKDAYS 0x14 +#define ABEOZ9_BIT_ALARM_WEEKDAYS GENMASK(2, 0) +#define ABEOZ9_REG_ALARM_MONTHS 0x15 +#define ABEOZ9_BIT_ALARM_MONTHS GENMASK(4, 0) +#define ABEOZ9_REG_ALARM_YEARS 0x16 + +#define ABEOZ9_ALARM_LEN 7 +#define ABEOZ9_BIT_ALARM_AE BIT(7) + #define ABEOZ9_REG_REG_TEMP 0x20 #define ABEOZ953_TEMP_MAX 120 #define ABEOZ953_TEMP_MIN -60 @@ -186,6 +205,98 @@ static int abeoz9_rtc_set_time(struct device *dev, struct rtc_time *tm) return abeoz9_reset_validity(regmap); } +static int abeoz9_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alarm) +{ + struct abeoz9_rtc_data *data = dev_get_drvdata(dev); + struct regmap *regmap = data->regmap; + u8 regs[ABEOZ9_ALARM_LEN]; + u8 val[2]; + int ret; + + ret = abeoz9_check_validity(dev); + if (ret) + return ret; + + ret = regmap_bulk_read(regmap, ABEOZ9_REG_CTRL_INT, val, sizeof(val)); + if (ret) + return ret; + + alarm->enabled = val[0] & ABEOZ9_REG_CTRL_INT_AIE; + alarm->pending = val[1] & ABEOZ9_REG_CTRL_INT_FLAG_AF; + + ret = regmap_bulk_read(regmap, ABEOZ9_REG_ALARM_SEC, regs, sizeof(regs)); + if (ret) + return ret; + + alarm->time.tm_sec = bcd2bin(FIELD_GET(ABEOZ9_BIT_ALARM_SEC, regs[0])); + alarm->time.tm_min = bcd2bin(FIELD_GET(ABEOZ9_BIT_ALARM_MIN, regs[1])); + alarm->time.tm_hour = bcd2bin(FIELD_GET(ABEOZ9_BIT_ALARM_HOURS, regs[2])); + if (FIELD_GET(ABEOZ9_BIT_ALARM_HOURS_PM, regs[2])) + alarm->time.tm_hour += 12; + + alarm->time.tm_mday = bcd2bin(FIELD_GET(ABEOZ9_BIT_ALARM_DAYS, regs[3])); + + return 0; +} + +static int abeoz9_rtc_alarm_irq_enable(struct device *dev, u32 enable) +{ + struct abeoz9_rtc_data *data = dev_get_drvdata(dev); + + return regmap_update_bits(data->regmap, ABEOZ9_REG_CTRL_INT, + ABEOZ9_REG_CTRL_INT_AIE, + FIELD_PREP(ABEOZ9_REG_CTRL_INT_AIE, enable)); +} + +static int abeoz9_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) +{ + struct abeoz9_rtc_data *data = dev_get_drvdata(dev); + u8 regs[ABEOZ9_ALARM_LEN] = {0}; + int ret; + + ret = regmap_update_bits(data->regmap, ABEOZ9_REG_CTRL_INT_FLAG, + ABEOZ9_REG_CTRL_INT_FLAG_AF, 0); + if (ret) + return ret; + + regs[0] = ABEOZ9_BIT_ALARM_AE | FIELD_PREP(ABEOZ9_BIT_ALARM_SEC, + bin2bcd(alarm->time.tm_sec)); + regs[1] = ABEOZ9_BIT_ALARM_AE | FIELD_PREP(ABEOZ9_BIT_ALARM_MIN, + bin2bcd(alarm->time.tm_min)); + regs[2] = ABEOZ9_BIT_ALARM_AE | FIELD_PREP(ABEOZ9_BIT_ALARM_HOURS, + bin2bcd(alarm->time.tm_hour)); + regs[3] = ABEOZ9_BIT_ALARM_AE | FIELD_PREP(ABEOZ9_BIT_ALARM_DAYS, + bin2bcd(alarm->time.tm_mday)); + + ret = regmap_bulk_write(data->regmap, ABEOZ9_REG_ALARM_SEC, regs, + sizeof(regs)); + if (ret) + return ret; + + return abeoz9_rtc_alarm_irq_enable(dev, alarm->enabled); +} + +static irqreturn_t abeoz9_rtc_irq(int irq, void *dev) +{ + struct abeoz9_rtc_data *data = dev_get_drvdata(dev); + unsigned int val; + int ret; + + ret = regmap_read(data->regmap, ABEOZ9_REG_CTRL_INT_FLAG, &val); + if (ret) + return IRQ_NONE; + + if (!FIELD_GET(ABEOZ9_REG_CTRL_INT_FLAG_AF, val)) + return IRQ_NONE; + + regmap_update_bits(data->regmap, ABEOZ9_REG_CTRL_INT_FLAG, + ABEOZ9_REG_CTRL_INT_FLAG_AF, 0); + + rtc_update_irq(data->rtc, 1, RTC_IRQF | RTC_AF); + + return IRQ_HANDLED; +} + static int abeoz9_trickle_parse_dt(struct device_node *node) { u32 ohms = 0; @@ -261,6 +372,14 @@ static const struct rtc_class_ops rtc_ops = { .set_time = abeoz9_rtc_set_time, }; +static const struct rtc_class_ops rtc_alarm_ops = { + .read_time = abeoz9_rtc_get_time, + .set_time = abeoz9_rtc_set_time, + .read_alarm = abeoz9_rtc_read_alarm, + .set_alarm = abeoz9_rtc_set_alarm, + .alarm_irq_enable = abeoz9_rtc_alarm_irq_enable, +}; + static const struct regmap_config abeoz9_rtc_regmap_config = { .reg_bits = 8, .val_bits = 8, @@ -420,6 +539,23 @@ static int abeoz9_probe(struct i2c_client *client, data->rtc->ops = &rtc_ops; data->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; data->rtc->range_max = RTC_TIMESTAMP_END_2099; + data->rtc->uie_unsupported = 1; + + if (client->irq > 0) { + ret = devm_request_threaded_irq(dev, client->irq, NULL, + abeoz9_rtc_irq, + IRQF_TRIGGER_LOW | IRQF_ONESHOT, + dev_name(dev), dev); + if (ret) { + dev_err(dev, "failed to request alarm irq\n"); + return ret; + } + } + + if (client->irq > 0 || device_property_read_bool(dev, "wakeup-source")) { + ret = device_init_wakeup(dev, true); + data->rtc->ops = &rtc_alarm_ops; + } ret = devm_rtc_register_device(data->rtc); if (ret) From c52409eb16672907804b7acf1658bb1fd9dcb426 Mon Sep 17 00:00:00 2001 From: Liam Beguin Date: Wed, 7 Apr 2021 22:40:28 -0400 Subject: [PATCH 0468/1379] rtc: ab-eoz9: make use of RTC_FEATURE_ALARM Move the alarm callbacks in rtc_ops and use RTC_FEATURE_ALARM to notify the core whether alarm capabilities are available or not. Signed-off-by: Liam Beguin Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210408024028.3526564-4-liambeguin@gmail.com --- drivers/rtc/rtc-ab-eoz9.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/rtc/rtc-ab-eoz9.c b/drivers/rtc/rtc-ab-eoz9.c index 7dc96fabc76f..a9b355510cd4 100644 --- a/drivers/rtc/rtc-ab-eoz9.c +++ b/drivers/rtc/rtc-ab-eoz9.c @@ -368,11 +368,6 @@ static int abeoz9_rtc_setup(struct device *dev, struct device_node *node) } static const struct rtc_class_ops rtc_ops = { - .read_time = abeoz9_rtc_get_time, - .set_time = abeoz9_rtc_set_time, -}; - -static const struct rtc_class_ops rtc_alarm_ops = { .read_time = abeoz9_rtc_get_time, .set_time = abeoz9_rtc_set_time, .read_alarm = abeoz9_rtc_read_alarm, @@ -540,6 +535,7 @@ static int abeoz9_probe(struct i2c_client *client, data->rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; data->rtc->range_max = RTC_TIMESTAMP_END_2099; data->rtc->uie_unsupported = 1; + clear_bit(RTC_FEATURE_ALARM, data->rtc->features); if (client->irq > 0) { ret = devm_request_threaded_irq(dev, client->irq, NULL, @@ -554,7 +550,7 @@ static int abeoz9_probe(struct i2c_client *client, if (client->irq > 0 || device_property_read_bool(dev, "wakeup-source")) { ret = device_init_wakeup(dev, true); - data->rtc->ops = &rtc_alarm_ops; + set_bit(RTC_FEATURE_ALARM, data->rtc->features); } ret = devm_rtc_register_device(data->rtc); From c8f0ca8b7a4b91f637ccd9a55f37dbac73d6f6bf Mon Sep 17 00:00:00 2001 From: satya priya Date: Fri, 9 Apr 2021 19:29:23 +0530 Subject: [PATCH 0469/1379] rtc: pm8xxx: Add RTC support for PMIC PMK8350 Add the comaptible string for PMIC PMK8350. Signed-off-by: satya priya Reviewed-by: Bjorn Andersson Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/1617976766-7852-2-git-send-email-skakit@codeaurora.org --- drivers/rtc/rtc-pm8xxx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index eb206597a8fa..29a1c65661e9 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -445,6 +445,16 @@ static const struct pm8xxx_rtc_regs pm8941_regs = { .alarm_en = BIT(7), }; +static const struct pm8xxx_rtc_regs pmk8350_regs = { + .ctrl = 0x6146, + .write = 0x6140, + .read = 0x6148, + .alarm_rw = 0x6240, + .alarm_ctrl = 0x6246, + .alarm_ctrl2 = 0x6248, + .alarm_en = BIT(7), +}; + /* * Hardcoded RTC bases until IORESOURCE_REG mapping is figured out */ @@ -453,6 +463,7 @@ static const struct of_device_id pm8xxx_id_table[] = { { .compatible = "qcom,pm8018-rtc", .data = &pm8921_regs }, { .compatible = "qcom,pm8058-rtc", .data = &pm8058_regs }, { .compatible = "qcom,pm8941-rtc", .data = &pm8941_regs }, + { .compatible = "qcom,pmk8350-rtc", .data = &pmk8350_regs }, { }, }; MODULE_DEVICE_TABLE(of, pm8xxx_id_table); From 8138c5f0318c69a878582d2140dac08e6a99880d Mon Sep 17 00:00:00 2001 From: satya priya Date: Fri, 9 Apr 2021 19:29:26 +0530 Subject: [PATCH 0470/1379] dt-bindings: rtc: qcom-pm8xxx-rtc: Add qcom pm8xxx rtc bindings Add binding doc for qcom pm8xxx rtc device. Signed-off-by: satya priya Reviewed-by: Bjorn Andersson Reported-by: kernel test robot Reviewed-by: Rob Herring Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/1617976766-7852-5-git-send-email-skakit@codeaurora.org --- .../bindings/rtc/qcom-pm8xxx-rtc.yaml | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml diff --git a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml new file mode 100644 index 000000000000..4fba6dba16f3 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/qcom-pm8xxx-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm PM8xxx PMIC RTC device + +maintainers: + - Satya Priya + +properties: + compatible: + enum: + - qcom,pm8058-rtc + - qcom,pm8921-rtc + - qcom,pm8941-rtc + - qcom,pm8018-rtc + - qcom,pmk8350-rtc + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + allow-set-time: + $ref: /schemas/types.yaml#/definitions/flag + description: + Indicates that the setting of RTC time is allowed by the host CPU. + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + #include + spmi_bus: spmi@c440000 { + reg = <0x0c440000 0x1100>; + #address-cells = <2>; + #size-cells = <0>; + pmicintc: pmic@0 { + reg = <0x0 SPMI_USID>; + compatible = "qcom,pm8921"; + interrupts = <104 8>; + #interrupt-cells = <2>; + interrupt-controller; + #address-cells = <1>; + #size-cells = <0>; + + pm8921_rtc: rtc@11d { + compatible = "qcom,pm8921-rtc"; + reg = <0x11d>; + interrupts = <0x27 0>; + }; + }; + }; +... From 880f25d690150937e42a2e8b86c111aae8da6d08 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Mon, 15 Mar 2021 10:39:30 +0800 Subject: [PATCH 0471/1379] rtc: rtc-spear: replace spin_lock_irqsave by spin_lock in hard IRQ The code has been in a irq-disabled context since it is hard IRQ. There is no necessity to do it again. Signed-off-by: Tian Tao Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/1615775970-59070-1-git-send-email-tiantao6@hisilicon.com --- drivers/rtc/rtc-spear.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-spear.c b/drivers/rtc/rtc-spear.c index 833daeb7b60e..ee721e53c155 100644 --- a/drivers/rtc/rtc-spear.c +++ b/drivers/rtc/rtc-spear.c @@ -153,12 +153,12 @@ static void rtc_wait_not_busy(struct spear_rtc_config *config) static irqreturn_t spear_rtc_irq(int irq, void *dev_id) { struct spear_rtc_config *config = dev_id; - unsigned long flags, events = 0; + unsigned long events = 0; unsigned int irq_data; - spin_lock_irqsave(&config->lock, flags); + spin_lock(&config->lock); irq_data = readl(config->ioaddr + STATUS_REG); - spin_unlock_irqrestore(&config->lock, flags); + spin_unlock(&config->lock); if ((irq_data & RTC_INT_MASK)) { spear_rtc_clear_interrupt(config); From 7fcb86185978661c9188397d474f90364745b8d9 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 14 Apr 2021 10:40:06 +0200 Subject: [PATCH 0472/1379] rtc: fsl-ftm-alarm: add MODULE_TABLE() The module doesn't load automatically. Fix it by adding the missing MODULE_TABLE(). Fixes: 7b0b551dbc1e ("rtc: fsl-ftm-alarm: add FTM alarm driver") Signed-off-by: Michael Walle Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210414084006.17933-1-michael@walle.cc --- drivers/rtc/rtc-fsl-ftm-alarm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/rtc-fsl-ftm-alarm.c b/drivers/rtc/rtc-fsl-ftm-alarm.c index 57cc09d0a806..c0df49fb978c 100644 --- a/drivers/rtc/rtc-fsl-ftm-alarm.c +++ b/drivers/rtc/rtc-fsl-ftm-alarm.c @@ -310,6 +310,7 @@ static const struct of_device_id ftm_rtc_match[] = { { .compatible = "fsl,lx2160a-ftm-alarm", }, { }, }; +MODULE_DEVICE_TABLE(of, ftm_rtc_match); static const struct acpi_device_id ftm_imx_acpi_ids[] = { {"NXP0014",}, From 9b9310445f5a6741399ebe2ba08137fecd7f73f9 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 15 Apr 2021 16:37:01 +0800 Subject: [PATCH 0473/1379] rtc: ds1511: remove unused function Fix the following clang warning: drivers/rtc/rtc-ds1511.c:108:1: warning: unused function 'rtc_write_alarm' [-Wunused-function]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/1618475821-102974-1-git-send-email-jiapeng.chong@linux.alibaba.com --- drivers/rtc/rtc-ds1511.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index bda884333082..1109cad83838 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -104,12 +104,6 @@ rtc_write(uint8_t val, uint32_t reg) writeb(val, ds1511_base + (reg * reg_spacing)); } -static inline void -rtc_write_alarm(uint8_t val, enum ds1511reg reg) -{ - rtc_write((val | 0x80), reg); -} - static noinline uint8_t rtc_read(enum ds1511reg reg) { From aefdd4383bb0057c1ec1e32e7de348ccd749eb20 Mon Sep 17 00:00:00 2001 From: "Jisheng Zhang (syna)" Date: Mon, 12 Apr 2021 09:05:48 +0100 Subject: [PATCH 0474/1379] ARM: 9072/1: mm: remove set_kernel_text_r[ow]() After commit 5a735583b764 ("arm/ftrace: Use __patch_text()"), the last and only user of these functions has gone, remove them. Signed-off-by: Jisheng Zhang Signed-off-by: Russell King --- arch/arm/include/asm/set_memory.h | 8 -------- arch/arm/mm/init.c | 21 --------------------- 2 files changed, 29 deletions(-) diff --git a/arch/arm/include/asm/set_memory.h b/arch/arm/include/asm/set_memory.h index a1ceff4295d3..ec17fc0fda7a 100644 --- a/arch/arm/include/asm/set_memory.h +++ b/arch/arm/include/asm/set_memory.h @@ -18,12 +18,4 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } #endif -#ifdef CONFIG_STRICT_KERNEL_RWX -void set_kernel_text_rw(void); -void set_kernel_text_ro(void); -#else -static inline void set_kernel_text_rw(void) { } -static inline void set_kernel_text_ro(void) { } -#endif - #endif diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 828a2561b229..039c19597c7a 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -487,33 +487,12 @@ static int __mark_rodata_ro(void *unused) return 0; } -static int kernel_set_to_readonly __read_mostly; - void mark_rodata_ro(void) { - kernel_set_to_readonly = 1; stop_machine(__mark_rodata_ro, NULL, NULL); debug_checkwx(); } -void set_kernel_text_rw(void) -{ - if (!kernel_set_to_readonly) - return; - - set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), false, - current->active_mm); -} - -void set_kernel_text_ro(void) -{ - if (!kernel_set_to_readonly) - return; - - set_section_perms(ro_perms, ARRAY_SIZE(ro_perms), true, - current->active_mm); -} - #else static inline void fix_kernmem_perms(void) { } #endif /* CONFIG_STRICT_KERNEL_RWX */ From a5e8acd94fe1fe60d92176424a2be6e52c8bd058 Mon Sep 17 00:00:00 2001 From: "Jisheng Zhang (syna)" Date: Mon, 12 Apr 2021 09:12:36 +0100 Subject: [PATCH 0475/1379] ARM: 9073/1: ptdump: add __init section marker to three functions They are not needed after booting, so mark them as __init to move them to the .init section. Signed-off-by: Jisheng Zhang Reviewed-by: Steven Price Signed-off-by: Russell King --- arch/arm/mm/dump.c | 4 ++-- arch/arm/mm/ptdump_debugfs.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index 93ff0097f00b..fb688003d156 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -420,7 +420,7 @@ void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info) note_page(&st, 0, 0, 0, NULL); } -static void ptdump_initialize(void) +static void __init ptdump_initialize(void) { unsigned i, j; @@ -466,7 +466,7 @@ void ptdump_check_wx(void) pr_info("Checked W+X mappings: passed, no W+X pages found\n"); } -static int ptdump_init(void) +static int __init ptdump_init(void) { ptdump_initialize(); ptdump_debugfs_register(&kernel_ptdump_info, "kernel_page_tables"); diff --git a/arch/arm/mm/ptdump_debugfs.c b/arch/arm/mm/ptdump_debugfs.c index 598b636615a2..8df9afac8d81 100644 --- a/arch/arm/mm/ptdump_debugfs.c +++ b/arch/arm/mm/ptdump_debugfs.c @@ -24,7 +24,7 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -void ptdump_debugfs_register(struct ptdump_info *info, const char *name) +void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name) { debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); } From 5fafafe7eeac02643cb221fac9b27d72a86c286e Mon Sep 17 00:00:00 2001 From: "Jisheng Zhang (syna)" Date: Mon, 12 Apr 2021 09:14:27 +0100 Subject: [PATCH 0476/1379] ARM: 9074/1: ptdump: convert to DEFINE_SHOW_ATTRIBUTE Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code. Signed-off-by: Jisheng Zhang Reviewed-by: Steven Price Signed-off-by: Russell King --- arch/arm/mm/ptdump_debugfs.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/arm/mm/ptdump_debugfs.c b/arch/arm/mm/ptdump_debugfs.c index 8df9afac8d81..318de969ae0f 100644 --- a/arch/arm/mm/ptdump_debugfs.c +++ b/arch/arm/mm/ptdump_debugfs.c @@ -11,18 +11,7 @@ static int ptdump_show(struct seq_file *m, void *v) ptdump_walk_pgd(m, info); return 0; } - -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, inode->i_private); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); void __init ptdump_debugfs_register(struct ptdump_info *info, const char *name) { From 57ac51667d8cd62731223d687e5fe7b41c502f89 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Wed, 14 Apr 2021 04:41:16 +0100 Subject: [PATCH 0477/1379] ARM: 9075/1: kernel: Fix interrupted SMC calls On Qualcomm ARM32 platforms, the SMC call can return before it has completed. If this occurs, the call can be restarted, but it requires using the returned session ID value from the interrupted SMC call. The ARM32 SMCC code already has the provision to add platform specific quirks for things like this. So let's make use of it and add the Qualcomm specific quirk (ARM_SMCCC_QUIRK_QCOM_A6) used by the QCOM_SCM driver. This change is similar to the below one added for ARM64 a while ago: commit 82bcd087029f ("firmware: qcom: scm: Fix interrupted SCM calls") Without this change, the Qualcomm ARM32 platforms like SDX55 will return -EINVAL for SMC calls used for modem firmware loading and validation. Signed-off-by: Manivannan Sadhasivam Reviewed-by: Bjorn Andersson Signed-off-by: Russell King --- arch/arm/kernel/asm-offsets.c | 3 +++ arch/arm/kernel/smccc-call.S | 11 ++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index be8050b0c3df..70993af22d80 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "signal.h" /* @@ -148,6 +149,8 @@ int main(void) DEFINE(SLEEP_SAVE_SP_PHYS, offsetof(struct sleep_save_sp, save_ptr_stash_phys)); DEFINE(SLEEP_SAVE_SP_VIRT, offsetof(struct sleep_save_sp, save_ptr_stash)); #endif + DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); + DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); BLANK(); DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL); DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); diff --git a/arch/arm/kernel/smccc-call.S b/arch/arm/kernel/smccc-call.S index 00664c78faca..931df62a7831 100644 --- a/arch/arm/kernel/smccc-call.S +++ b/arch/arm/kernel/smccc-call.S @@ -3,7 +3,9 @@ * Copyright (c) 2015, Linaro Limited */ #include +#include +#include #include #include #include @@ -27,7 +29,14 @@ UNWIND( .fnstart) UNWIND( .save {r4-r7}) ldm r12, {r4-r7} \instr - pop {r4-r7} + ldr r4, [sp, #36] + cmp r4, #0 + beq 1f // No quirk structure + ldr r5, [r4, #ARM_SMCCC_QUIRK_ID_OFFS] + cmp r5, #ARM_SMCCC_QUIRK_QCOM_A6 + bne 1f // No quirk present + str r6, [r4, #ARM_SMCCC_QUIRK_STATE_OFFS] +1: pop {r4-r7} ldr r12, [sp, #(4 * 4)] stm r12, {r0-r3} bx lr From de144ff4234f935bd2150108019b5d87a90a8a96 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 18 Apr 2021 15:00:45 -0400 Subject: [PATCH 0478/1379] NFSv4: Don't discard segments marked for return in _pnfs_return_layout() If the pNFS layout segment is marked with the NFS_LSEG_LAYOUTRETURN flag, then the assumption is that it has some reporting requirement to perform through a layoutreturn (e.g. flexfiles layout stats or error information). Fixes: 6d597e175012 ("pnfs: only tear down lsegs that precede seqid in LAYOUTRETURN args") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 33574f47601f..f726f8b12b7e 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1344,7 +1344,7 @@ _pnfs_return_layout(struct inode *ino) } valid_layout = pnfs_layout_is_valid(lo); pnfs_clear_layoutcommit(ino, &tmp_list); - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); + pnfs_mark_matching_lsegs_return(lo, &tmp_list, NULL, 0); if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { struct pnfs_layout_range range = { From fb700ef026766c95578aafc0db1b208946e7ad4f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Apr 2021 17:33:07 -0400 Subject: [PATCH 0479/1379] NFSv4.1: Simplify layout return in pnfs_layout_process() If the server hands us a layout that does not match the one we currently hold, then have pnfs_mark_matching_lsegs_return() just ditch the old layout if NFS_LSEG_LAYOUTRETURN is not set. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f726f8b12b7e..03e0b34c4a64 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2410,9 +2410,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) .iomode = IOMODE_ANY, .length = NFS4_MAX_UINT64, }; - pnfs_set_plh_return_info(lo, IOMODE_ANY, 0); - pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, - &range, 0); + pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0); goto out_forget; } else { /* We have a completely new layout */ From 04d82a6d0881ef1ab1e9f66f10805177ee2fb1e8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 17 Apr 2021 10:10:08 +0900 Subject: [PATCH 0480/1379] binfmt_flat: allow not offsetting data start Commit 2217b9826246 ("binfmt_flat: revert "binfmt_flat: don't offset the data start"") restored offsetting the start of the data section by a number of words defined by MAX_SHARED_LIBS. As a result, since MAX_SHARED_LIBS is never 0, a gap between the text and data sections always exists. For architectures which cannot support a such gap between the text and data sections (e.g. riscv nommu), flat binary programs cannot be executed. To allow an architecture to request no data start offset to allow for contiguous text and data sections for binaries flagged with FLAT_FLAG_RAM, introduce the new config option CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET. Using this new option, the macro DATA_START_OFFSET_WORDS is conditionally defined in binfmt_flat.c to MAX_SHARED_LIBS for architectures tolerating or needing the data start offset (CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET disabled case) and to 0 when CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET is enabled. DATA_START_OFFSET_WORDS is used in load_flat_file() to calculate the data section length and start position. Signed-off-by: Damien Le Moal Signed-off-by: Greg Ungerer --- fs/Kconfig.binfmt | 3 +++ fs/binfmt_flat.c | 18 +++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c6f1c8c1934e..06fb7a93a1bd 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK config BINFMT_FLAT_OLD_ALWAYS_RAM bool +config BINFMT_FLAT_NO_DATA_START_OFFSET + bool + config BINFMT_FLAT_OLD bool "Enable support for very old legacy flat binaries" depends on BINFMT_FLAT diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b9c658e0548e..a1072c6a2341 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -74,6 +74,12 @@ #define MAX_SHARED_LIBS (1) #endif +#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET +#define DATA_START_OFFSET_WORDS (0) +#else +#define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#endif + struct lib_info { struct { unsigned long start_code; /* Start of text segment */ @@ -576,7 +582,8 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(unsigned long); len = PAGE_ALIGN(len); realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); @@ -591,7 +598,7 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * sizeof(unsigned long), + DATA_START_OFFSET_WORDS * sizeof(unsigned long), FLAT_DATA_ALIGN); pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", @@ -622,7 +629,8 @@ static int load_flat_file(struct linux_binprm *bprm, memp_size = len; } else { - len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32); + len = text_len + data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(u32); len = PAGE_ALIGN(len); textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); @@ -638,7 +646,7 @@ static int load_flat_file(struct linux_binprm *bprm, realdatastart = textpos + ntohl(hdr->data_start); datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * sizeof(u32), + DATA_START_OFFSET_WORDS * sizeof(u32), FLAT_DATA_ALIGN); reloc = (__be32 __user *) @@ -714,7 +722,7 @@ static int load_flat_file(struct linux_binprm *bprm, ret = result; pr_err("Unable to read code+data+bss, errno %d\n", ret); vm_munmap(textpos, text_len + data_len + extra + - MAX_SHARED_LIBS * sizeof(u32)); + DATA_START_OFFSET_WORDS * sizeof(u32)); goto err; } } From 09d02efaafdc2114565a5ad218172b6064404f0f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:46 +0100 Subject: [PATCH 0481/1379] ecryptfs: read_write: File headers do not make good candidates for kernel-doc Provide missing param description for 'page_index' too. Fixes the following W=1 kernel build warning(s): fs/ecryptfs/read_write.c:16: warning: Incorrect use of kernel-doc format: * ecryptfs_write_lower fs/ecryptfs/read_write.c:29: warning: Function parameter or member 'ecryptfs_inode' not described in 'ecryptfs_write_lower' fs/ecryptfs/read_write.c:29: warning: Function parameter or member 'data' not described in 'ecryptfs_write_lower' fs/ecryptfs/read_write.c:29: warning: Function parameter or member 'offset' not described in 'ecryptfs_write_lower' fs/ecryptfs/read_write.c:29: warning: Function parameter or member 'size' not described in 'ecryptfs_write_lower' fs/ecryptfs/read_write.c:29: warning: expecting prototype for eCryptfs(). Prototype was for ecryptfs_write_lower() instead fs/ecryptfs/read_write.c:248: warning: Function parameter or member 'page_index' not described in 'ecryptfs_read_lower_page_segment' Cc: Tyler Hicks Cc: "Michael A. Halcrow" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/read_write.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 0438997ac9d8..60bdcaddcbe5 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 2007 International Business Machines Corp. @@ -230,6 +230,8 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, * ecryptfs_read_lower_page_segment * @page_for_ecryptfs: The page into which data for eCryptfs will be * written + * @page_index: Page index in @page_for_ecryptfs from which to start + * writing * @offset_in_page: Offset in @page_for_ecryptfs from which to start * writing * @size: The number of bytes to write into @page_for_ecryptfs From b0cfbeff12dab1ce89d55f5da11ae9d5dc20a9e2 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:47 +0100 Subject: [PATCH 0482/1379] ecryptfs: debug: Demote a couple of kernel-doc abuses Fixes the following W=1 kernel build warning(s): fs/ecryptfs/debug.c:13: warning: Incorrect use of kernel-doc format: * ecryptfs_dump_auth_tok - debug function to print auth toks fs/ecryptfs/debug.c:19: warning: Function parameter or member 'auth_tok' not described in 'ecryptfs_dump_auth_tok' fs/ecryptfs/debug.c:19: warning: expecting prototype for eCryptfs(). Prototype was for ecryptfs_dump_auth_tok() instead Cc: Tyler Hicks Cc: "Michael A. Halcrow" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c index 1f65e99f9a41..cf6d0e8e25a1 100644 --- a/fs/ecryptfs/debug.c +++ b/fs/ecryptfs/debug.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * Functions only useful for debugging. * @@ -9,7 +9,7 @@ #include "ecryptfs_kernel.h" -/** +/* * ecryptfs_dump_auth_tok - debug function to print auth toks * * This function will print the contents of an ecryptfs authentication From 64cbb654ed779a5beeda907c80c187a9c1f4f8cb Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:48 +0100 Subject: [PATCH 0483/1379] ecryptfs: dentry: File headers are not good candidates for kernel-doc Fixes the following W=1 kernel build warning(s): fs/ecryptfs/dentry.c:19: warning: Incorrect use of kernel-doc format: * ecryptfs_d_revalidate - revalidate an ecryptfs dentry fs/ecryptfs/dentry.c:32: warning: Function parameter or member 'dentry' not described in 'ecryptfs_d_revalidate' fs/ecryptfs/dentry.c:32: warning: Function parameter or member 'flags' not described in 'ecryptfs_d_revalidate' fs/ecryptfs/dentry.c:32: warning: expecting prototype for eCryptfs(). Prototype was for ecryptfs_d_revalidate() instead Cc: Tyler Hicks Cc: "Michael A. Halcrow" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/dentry.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 44606f079efb..acaa0825e9bb 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok From 446b5836af9fc3518142077bde116e6a4b196e05 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:49 +0100 Subject: [PATCH 0484/1379] ecryptfs: kthread: Demote file header and provide description for 'cred' Fixes the following W=1 kernel build warning(s): fs/ecryptfs/kthread.c:16: warning: cannot understand function prototype: 'struct ecryptfs_open_req ' fs/ecryptfs/kthread.c:120: warning: Function parameter or member 'cred' not described in 'ecryptfs_privileged_open' Cc: Tyler Hicks Cc: "Michael A. Halcrow" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/kthread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index a7c903cb01a0..ae4cb4e2e134 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 2008 International Business Machines Corp. @@ -108,6 +108,7 @@ void ecryptfs_destroy_kthread(void) * @lower_file: Result of dentry_open by root on lower dentry * @lower_dentry: Lower dentry for file to open * @lower_mnt: Lower vfsmount for file to open + * @cred: credential to use for this call * * This function gets a r/w file opened against the lower dentry. * From a62187eb1f483e46e5dca796146f203112608cbc Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:50 +0100 Subject: [PATCH 0485/1379] ecryptfs: file: Demote kernel-doc abuses Fixes the following W=1 kernel build warning(s): fs/ecryptfs/file.c:23: warning: Incorrect use of kernel-doc format: * ecryptfs_read_update_atime fs/ecryptfs/file.c:34: warning: Function parameter or member 'iocb' not described in 'ecryptfs_read_update_atime' fs/ecryptfs/file.c:34: warning: Function parameter or member 'to' not described in 'ecryptfs_read_update_atime' fs/ecryptfs/file.c:34: warning: expecting prototype for eCryptfs(). Prototype was for ecryptfs_read_update_atime() instead Cc: Tyler Hicks Cc: "Michael A. Halcrow" Cc: "Michael C. Thompson" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 5fb45d865ce5..18d5b91cb573 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2004 Erez Zadok @@ -19,7 +19,7 @@ #include #include "ecryptfs_kernel.h" -/** +/* * ecryptfs_read_update_atime * * generic_file_read updates the atime of upper layer inode. But, it From 1c6675cafc8cf69dd33dab1fa129c00f56a3c84e Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:51 +0100 Subject: [PATCH 0486/1379] ecryptfs: super: Fix formatting, naming and kernel-doc abuses Fixes the following W=1 kernel build warning(s): fs/ecryptfs/super.c:22: warning: cannot understand function prototype: 'struct kmem_cache *ecryptfs_inode_info_cache; ' fs/ecryptfs/super.c:91: warning: Function parameter or member 'dentry' not described in 'ecryptfs_statfs' fs/ecryptfs/super.c:91: warning: Excess function parameter 'sb' description in 'ecryptfs_statfs' fs/ecryptfs/super.c:120: warning: Function parameter or member 'inode' not described in 'ecryptfs_evict_inode' fs/ecryptfs/super.c:133: warning: Function parameter or member 'm' not described in 'ecryptfs_show_options' fs/ecryptfs/super.c:133: warning: Function parameter or member 'root' not described in 'ecryptfs_show_options' Cc: Tyler Hicks Cc: "Michael A. Halcrow" Cc: "Michael C. Thompson" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 6b1853f1c06a..39116af0390f 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok @@ -81,7 +81,7 @@ static void ecryptfs_destroy_inode(struct inode *inode) /** * ecryptfs_statfs - * @sb: The ecryptfs super block + * @dentry: The ecryptfs dentry * @buf: The struct kstatfs to fill in with stats * * Get the filesystem statistics. Currently, we let this pass right through @@ -108,7 +108,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) /** * ecryptfs_evict_inode - * @inode - The ecryptfs inode + * @inode: The ecryptfs inode * * Called by iput() when the inode reference count reached zero * and the inode is not hashed anywhere. Used to clear anything @@ -123,7 +123,7 @@ static void ecryptfs_evict_inode(struct inode *inode) iput(ecryptfs_inode_to_lower(inode)); } -/** +/* * ecryptfs_show_options * * Prints the mount options for a given superblock. From 1ab8e268ead8ef305ece732557c07e2e18632b5b Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:52 +0100 Subject: [PATCH 0487/1379] ecryptfs: messaging: Add missing param descriptions and demote abuses Fixes the following W=1 kernel build warning(s): fs/ecryptfs/messaging.c:15: warning: Function parameter or member 'ecryptfs_msg_ctx_free_list' not described in 'LIST_HEAD' fs/ecryptfs/messaging.c:15: warning: expecting prototype for eCryptfs(). Prototype was for LIST_HEAD() instead fs/ecryptfs/messaging.c:157: warning: Function parameter or member 'daemon' not described in 'ecryptfs_exorcise_daemon' fs/ecryptfs/messaging.c:207: warning: Function parameter or member 'daemon' not described in 'ecryptfs_process_response' fs/ecryptfs/messaging.c:207: warning: expecting prototype for ecryptfs_process_reponse(). Prototype was for ecryptfs_process_response() instead fs/ecryptfs/messaging.c:262: warning: Function parameter or member 'msg_type' not described in 'ecryptfs_send_message_locked' Cc: Tyler Hicks Cc: David Howells Cc: Johannes Weiner Cc: Waiman Long Cc: Michal Hocko Cc: Andrew Morton Cc: "Michael A. Halcrow" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/messaging.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index c0dfd9647627..b38bd742fd97 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 2004-2008 International Business Machines Corp. @@ -147,7 +147,7 @@ out: return rc; } -/** +/* * ecryptfs_exorcise_daemon - Destroy the daemon struct * * Must be called ceremoniously while in possession of @@ -181,7 +181,8 @@ out: } /** - * ecryptfs_process_reponse + * ecryptfs_process_response + * @daemon: eCryptfs daemon object * @msg: The ecryptfs message received; the caller should sanity check * msg->data_len and free the memory * @seq: The sequence number of the message; must match the sequence @@ -250,6 +251,7 @@ out: * ecryptfs_send_message_locked * @data: The data to send * @data_len: The length of data + * @msg_type: Type of message * @msg_ctx: The message context allocated for the send * * Must be called with ecryptfs_daemon_hash_mux held. From e24012062e3dfacc40fda0776af0c2987ee0c9ab Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:53 +0100 Subject: [PATCH 0488/1379] ecryptfs: main: Demote a bunch of non-conformant kernel-doc headers Fixes the following W=1 kernel build warning(s): fs/ecryptfs/main.c:28: warning: Incorrect use of kernel-doc format: * Module parameter that defines the ecryptfs_verbosity level. fs/ecryptfs/main.c:30: warning: cannot understand function prototype: 'int ecryptfs_verbosity = 0; ' fs/ecryptfs/main.c:40: warning: cannot understand function prototype: 'unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; ' fs/ecryptfs/main.c:52: warning: cannot understand function prototype: 'signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; ' fs/ecryptfs/main.c:65: warning: cannot understand function prototype: 'unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS; ' fs/ecryptfs/main.c:106: warning: Function parameter or member 'dentry' not described in 'ecryptfs_init_lower_file' fs/ecryptfs/main.c:106: warning: Function parameter or member 'lower_file' not described in 'ecryptfs_init_lower_file' fs/ecryptfs/main.c:106: warning: Excess function parameter 'ecryptfs_dentry' description in 'ecryptfs_init_lower_file' fs/ecryptfs/main.c:244: warning: Function parameter or member 'sbi' not described in 'ecryptfs_parse_options' fs/ecryptfs/main.c:244: warning: Excess function parameter 'sb' description in 'ecryptfs_parse_options' fs/ecryptfs/main.c:478: warning: Function parameter or member 'fs_type' not described in 'ecryptfs_mount' fs/ecryptfs/main.c:478: warning: Function parameter or member 'flags' not described in 'ecryptfs_mount' fs/ecryptfs/main.c:478: warning: expecting prototype for ecryptfs_get_sb(). Prototype was for ecryptfs_mount() instead fs/ecryptfs/main.c:645: warning: Function parameter or member 'vptr' not described in 'inode_info_init_once' Cc: Tyler Hicks Cc: Christian Brauner Cc: James Morris Cc: "Michael A. Halcrow" Cc: "Michael C. Thompson" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones [tyhicks: Correct the function documentation for ecryptfs_mount()] Signed-off-by: Tyler Hicks --- fs/ecryptfs/main.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index cdf40a54a35d..77b96737b1ff 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok @@ -24,7 +24,7 @@ #include #include "ecryptfs_kernel.h" -/** +/* * Module parameter that defines the ecryptfs_verbosity level. */ int ecryptfs_verbosity = 0; @@ -34,7 +34,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity, "Initial verbosity level (0 or 1; defaults to " "0, which is Quiet)"); -/** +/* * Module parameter that defines the number of message buffer elements */ unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; @@ -43,7 +43,7 @@ module_param(ecryptfs_message_buf_len, uint, 0); MODULE_PARM_DESC(ecryptfs_message_buf_len, "Number of message buffer elements"); -/** +/* * Module parameter that defines the maximum guaranteed amount of time to wait * for a response from ecryptfsd. The actual sleep time will be, more than * likely, a small amount greater than this specified value, but only less if @@ -57,7 +57,7 @@ MODULE_PARM_DESC(ecryptfs_message_wait_timeout, "sleep while waiting for a message response from " "userspace"); -/** +/* * Module parameter that is an estimate of the maximum number of users * that will be concurrently using eCryptfs. Set this to the right * value to balance performance and memory use. @@ -80,7 +80,7 @@ void __ecryptfs_printk(const char *fmt, ...) va_end(args); } -/** +/* * ecryptfs_init_lower_file * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with * the lower dentry and the lower mount set @@ -221,7 +221,7 @@ static void ecryptfs_init_mount_crypt_stat( /** * ecryptfs_parse_options - * @sb: The ecryptfs super block + * @sbi: The ecryptfs super block * @options: The options passed to the kernel * @check_ruid: set to 1 if device uid should be checked against the ruid * @@ -466,10 +466,10 @@ out: struct kmem_cache *ecryptfs_sb_info_cache; static struct file_system_type ecryptfs_fs_type; -/** - * ecryptfs_get_sb - * @fs_type - * @flags +/* + * ecryptfs_mount + * @fs_type: The filesystem type that the superblock should belong to + * @flags: The flags associated with the mount * @dev_name: The path to mount over * @raw_data: The options passed into the kernel */ @@ -635,7 +635,7 @@ static struct file_system_type ecryptfs_fs_type = { }; MODULE_ALIAS_FS("ecryptfs"); -/** +/* * inode_info_init_once * * Initializes the ecryptfs_inode_info_cache when it is created From 65bbb7b84611c59725276ed1e03105fa002da9af Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:54 +0100 Subject: [PATCH 0489/1379] ecryptfs: miscdev: File headers are not good kernel-doc candidates Supply description for the 'daemon' param too. Fixes the following W=1 kernel build warning(s): fs/ecryptfs/miscdev.c:19: warning: cannot understand function prototype: 'atomic_t ecryptfs_num_miscdev_opens; ' fs/ecryptfs/miscdev.c:323: warning: Function parameter or member 'daemon' not described in 'ecryptfs_miscdev_response' Cc: Tyler Hicks Cc: "Michael A. Halcrow" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/miscdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 742ece22c1d4..4e62c3cef70f 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 2008 International Business Machines Corp. @@ -312,6 +312,7 @@ out_unlock_daemon: /** * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon + * @daemon: eCryptfs daemon object * @data: Bytes comprising struct ecryptfs_message * @data_size: sizeof(struct ecryptfs_message) + data len * @seq: Sequence number for miscdev response packet From 5da877eadffb8b6b5b302673ab14ea4c3d7d1546 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:55 +0100 Subject: [PATCH 0490/1379] ecryptfs: crypto: Supply some missing param descriptions and demote abuses Fixes the following W=1 kernel build warning(s): fs/ecryptfs/crypto.c:29: warning: expecting prototype for eCryptfs(). Prototype was for DECRYPT() instead fs/ecryptfs/crypto.c:360: warning: Function parameter or member 'crypt_stat' not described in 'lower_offset_for_page' fs/ecryptfs/crypto.c:360: warning: Function parameter or member 'page' not described in 'lower_offset_for_page' fs/ecryptfs/crypto.c:637: warning: Function parameter or member 'crypt_stat' not described in 'ecryptfs_compute_root_iv' fs/ecryptfs/crypto.c:1386: warning: Function parameter or member 'ecryptfs_dentry' not described in 'ecryptfs_read_metadata' fs/ecryptfs/crypto.c:1463: warning: Function parameter or member 'filename' not described in 'ecryptfs_encrypt_filename' fs/ecryptfs/crypto.c:1463: warning: Function parameter or member 'mount_crypt_stat' not described in 'ecryptfs_encrypt_filename' fs/ecryptfs/crypto.c:1897: warning: Function parameter or member 'encoded_name_size' not described in 'ecryptfs_encrypt_and_encode_filename' fs/ecryptfs/crypto.c:1897: warning: Function parameter or member 'mount_crypt_stat' not described in 'ecryptfs_encrypt_and_encode_filename' fs/ecryptfs/crypto.c:1897: warning: Function parameter or member 'name_size' not described in 'ecryptfs_encrypt_and_encode_filename' fs/ecryptfs/crypto.c:1897: warning: Excess function parameter 'crypt_stat' description in 'ecryptfs_encrypt_and_encode_filename' fs/ecryptfs/crypto.c:1897: warning: Excess function parameter 'length' description in 'ecryptfs_encrypt_and_encode_filename' fs/ecryptfs/crypto.c:2006: warning: Function parameter or member 'sb' not described in 'ecryptfs_decode_and_decrypt_filename' fs/ecryptfs/crypto.c:2006: warning: Excess function parameter 'ecryptfs_dir_dentry' description in 'ecryptfs_decode_and_decrypt_filename' Cc: Tyler Hicks Cc: Eric Biggers Cc: "Michael A. Halcrow" Cc: "Michael C. Thompson" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones [tyhicks: Fix typo in ecryptfs_encrypt_and_encode_filename() func docs] Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 7671412b8f0b..147ccb5e9190 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2004 Erez Zadok @@ -350,7 +350,7 @@ out: return rc; } -/** +/* * lower_offset_for_page * * Convert an eCryptfs page index into a lower byte offset @@ -627,9 +627,8 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) } } -/** +/* * ecryptfs_compute_root_iv - * @crypt_stats * * On error, sets the root IV to all 0's. */ @@ -1370,7 +1369,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, return rc; } -/** +/* * ecryptfs_read_metadata * * Common entry point for reading file metadata. From here, we could @@ -1448,7 +1447,7 @@ out: return rc; } -/** +/* * ecryptfs_encrypt_filename - encrypt filename * * CBC-encrypts the filename. We do not want to encrypt the same @@ -1876,10 +1875,11 @@ out: /** * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text - * @crypt_stat: The crypt_stat struct associated with the file anem to encode + * @encoded_name: The encrypted name + * @encoded_name_size: Length of the encrypted name + * @mount_crypt_stat: The crypt_stat struct associated with the file name to encode * @name: The plaintext name - * @length: The length of the plaintext - * @encoded_name: The encypted name + * @name_size: The length of the plaintext name * * Encrypts and encodes a filename into something that constitutes a * valid filename for a filesystem, with printable characters. @@ -1991,7 +1991,7 @@ static bool is_dot_dotdot(const char *name, size_t name_size) * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext * @plaintext_name: The plaintext name * @plaintext_name_size: The plaintext name size - * @ecryptfs_dir_dentry: eCryptfs directory dentry + * @sb: Ecryptfs's super_block * @name: The filename in cipher text * @name_size: The cipher text name size * From 688a9f7cd824e76a893590e35c15017f1f956b88 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:56 +0100 Subject: [PATCH 0491/1379] ecryptfs: mmap: Help out one function header and demote other abuses Fixes the following W=1 kernel build warning(s): fs/ecryptfs/mmap.c:26: warning: Incorrect use of kernel-doc format: * ecryptfs_get_locked_page fs/ecryptfs/mmap.c:34: warning: Function parameter or member 'inode' not described in 'ecryptfs_get_locked_page' fs/ecryptfs/mmap.c:34: warning: Function parameter or member 'index' not described in 'ecryptfs_get_locked_page' fs/ecryptfs/mmap.c:34: warning: expecting prototype for eCryptfs(). Prototype was for ecryptfs_get_locked_page() instead fs/ecryptfs/mmap.c:52: warning: Function parameter or member 'wbc' not described in 'ecryptfs_writepage' fs/ecryptfs/mmap.c:98: warning: Incorrect use of kernel-doc format: * ecryptfs_copy_up_encrypted_with_header fs/ecryptfs/mmap.c:110: warning: Function parameter or member 'page' not described in 'ecryptfs_copy_up_encrypted_with_header' fs/ecryptfs/mmap.c:110: warning: Function parameter or member 'crypt_stat' not described in 'ecryptfs_copy_up_encrypted_with_header' fs/ecryptfs/mmap.c:110: warning: expecting prototype for Header Extent(). Prototype was for ecryptfs_copy_up_encrypted_with_header() instead fs/ecryptfs/mmap.c:233: warning: wrong kernel-doc identifier on line: fs/ecryptfs/mmap.c:379: warning: Function parameter or member 'ecryptfs_inode' not described in 'ecryptfs_write_inode_size_to_header' Cc: Tyler Hicks Cc: James Morris Cc: Tycho Andersen Cc: Christian Brauner Cc: "Michael A. Halcrow" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/mmap.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 2f333a40ff4d..392e721b50a3 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * This is where eCryptfs coordinates the symmetric encryption and * decryption of the file data as it passes between the lower @@ -22,7 +22,7 @@ #include #include "ecryptfs_kernel.h" -/** +/* * ecryptfs_get_locked_page * * Get one page from cache or lower f/s, return error otherwise. @@ -41,6 +41,7 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) /** * ecryptfs_writepage * @page: Page that is locked before this call is made + * @wbc: Write-back control structure * * Returns zero on success; non-zero otherwise * @@ -78,7 +79,7 @@ static void strip_xattr_flag(char *page_virt, } } -/** +/* * Header Extent: * Octets 0-7: Unencrypted file size (big-endian) * Octets 8-15: eCryptfs special marker @@ -229,7 +230,7 @@ out: return rc; } -/** +/* * Called with lower inode mutex held. */ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) @@ -368,7 +369,7 @@ out: return rc; } -/** +/* * ecryptfs_write_inode_size_to_header * * Writes the lower file size to the first 8 bytes of the header. From d17074ac9ec8d4b7a5c2a305625443e5960fc530 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:57 +0100 Subject: [PATCH 0492/1379] ecryptfs: inode: Help out nearly-there header and demote non-conformant ones Fixes the following W=1 kernel build warning(s): fs/ecryptfs/inode.c:27: warning: Function parameter or member 'dentry' not described in 'lock_parent' fs/ecryptfs/inode.c:27: warning: Function parameter or member 'lower_dentry' not described in 'lock_parent' fs/ecryptfs/inode.c:27: warning: Function parameter or member 'lower_dir' not described in 'lock_parent' fs/ecryptfs/inode.c:27: warning: expecting prototype for eCryptfs(). Prototype was for lock_parent() instead fs/ecryptfs/inode.c:211: warning: Function parameter or member 'ecryptfs_dentry' not described in 'ecryptfs_initialize_file' fs/ecryptfs/inode.c:211: warning: Function parameter or member 'ecryptfs_inode' not described in 'ecryptfs_initialize_file' fs/ecryptfs/inode.c:258: warning: Function parameter or member 'mnt_userns' not described in 'ecryptfs_create' fs/ecryptfs/inode.c:258: warning: Function parameter or member 'directory_inode' not described in 'ecryptfs_create' fs/ecryptfs/inode.c:258: warning: Function parameter or member 'ecryptfs_dentry' not described in 'ecryptfs_create' fs/ecryptfs/inode.c:258: warning: Function parameter or member 'excl' not described in 'ecryptfs_create' fs/ecryptfs/inode.c:258: warning: Excess function parameter 'dir' description in 'ecryptfs_create' fs/ecryptfs/inode.c:258: warning: Excess function parameter 'dentry' description in 'ecryptfs_create' fs/ecryptfs/inode.c:320: warning: Function parameter or member 'dentry' not described in 'ecryptfs_lookup_interpose' fs/ecryptfs/inode.c:320: warning: Function parameter or member 'lower_dentry' not described in 'ecryptfs_lookup_interpose' fs/ecryptfs/inode.c:887: warning: Function parameter or member 'mnt_userns' not described in 'ecryptfs_setattr' Cc: Tyler Hicks Cc: "Michael A. Halcrow" Cc: "Michael C. Thompsion" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 18e9285fbb4c..11953eff1ad8 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2004 Erez Zadok @@ -203,7 +203,7 @@ out_lock: return inode; } -/** +/* * ecryptfs_initialize_file * * Cause the file to be changed from a basic empty file to an ecryptfs @@ -246,10 +246,8 @@ out: return rc; } -/** +/* * ecryptfs_create - * @dir: The inode of the directory in which to create the file. - * @dentry: The eCryptfs dentry * @mode: The mode of the new file. * * Creates a new file. @@ -317,7 +315,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) return 0; } -/** +/* * ecryptfs_lookup_interpose - Dentry interposition for a lookup */ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, @@ -887,6 +885,7 @@ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode, /** * ecryptfs_setattr + * @mnt_userns: user namespace of the target mount * @dentry: dentry handle to the inode to modify * @ia: Structure with flags of what to change and values * From ffbed072be00c1617e25396054f6f349befddffc Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 30 Mar 2021 17:44:58 +0100 Subject: [PATCH 0493/1379] ecryptfs: keystore: Fix some kernel-doc issues and demote non-conformant headers Fixes the following W=1 kernel build warning(s): fs/ecryptfs/keystore.c:25: warning: Incorrect use of kernel-doc format: * request_key returned an error instead of a valid key address; fs/ecryptfs/keystore.c:30: warning: Function parameter or member 'err_code' not described in 'process_request_key_err' fs/ecryptfs/keystore.c:30: warning: expecting prototype for eCryptfs(). Prototype was for process_request_key_err() instead fs/ecryptfs/keystore.c:558: warning: Function parameter or member 'auth_tok_key' not described in 'ecryptfs_find_auth_tok_for_sig' fs/ecryptfs/keystore.c:558: warning: Function parameter or member 'mount_crypt_stat' not described in 'ecryptfs_find_auth_tok_for_sig' fs/ecryptfs/keystore.c:558: warning: Excess function parameter 'crypt_stat' description in 'ecryptfs_find_auth_tok_for_sig' fs/ecryptfs/keystore.c:584: warning: cannot understand function prototype: 'struct ecryptfs_write_tag_70_packet_silly_stack ' fs/ecryptfs/keystore.c:622: warning: Function parameter or member 'dest' not described in 'ecryptfs_write_tag_70_packet' fs/ecryptfs/keystore.c:622: warning: Function parameter or member 'remaining_bytes' not described in 'ecryptfs_write_tag_70_packet' fs/ecryptfs/keystore.c:622: warning: Function parameter or member 'packet_size' not described in 'ecryptfs_write_tag_70_packet' fs/ecryptfs/keystore.c:622: warning: Function parameter or member 'mount_crypt_stat' not described in 'ecryptfs_write_tag_70_packet' fs/ecryptfs/keystore.c:622: warning: Function parameter or member 'filename_size' not described in 'ecryptfs_write_tag_70_packet' fs/ecryptfs/keystore.c:622: warning: expecting prototype for write_tag_70_packet(). Prototype was for ecryptfs_write_tag_70_packet() instead fs/ecryptfs/keystore.c:895: warning: expecting prototype for parse_tag_70_packet(). Prototype was for ecryptfs_parse_tag_70_packet() instead Cc: Tyler Hicks Cc: "Michael A. Halcrow" Cc: "Michael C. Thompson" Cc: "Trevor S. Highland" Cc: ecryptfs@vger.kernel.org Signed-off-by: Lee Jones Signed-off-by: Tyler Hicks --- fs/ecryptfs/keystore.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 2abd219cfeec..3fe41964c0d8 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * In-kernel key management code. Includes functions to parse and * write authentication token-related packets with the underlying @@ -21,7 +21,7 @@ #include #include "ecryptfs_kernel.h" -/** +/* * request_key returned an error instead of a valid key address; * determine the type of error, make appropriate log entries, and * return an error code. @@ -536,8 +536,9 @@ out: /** * ecryptfs_find_auth_tok_for_sig + * @auth_tok_key: key containing the authentication token * @auth_tok: Set to the matching auth_tok; NULL if not found - * @crypt_stat: inode crypt_stat crypto context + * @mount_crypt_stat: inode crypt_stat crypto context * @sig: Sig of auth_tok to find * * For now, this function simply looks at the registered auth_tok's @@ -576,7 +577,7 @@ ecryptfs_find_auth_tok_for_sig( return rc; } -/** +/* * write_tag_70_packet can gobble a lot of stack space. We stuff most * of the function's parameters in a kmalloc'd struct to help reduce * eCryptfs' overall stack usage. @@ -604,7 +605,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack { struct shash_desc *hash_desc; }; -/** +/* * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK * @filename: NULL-terminated filename string * @@ -873,7 +874,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { }; /** - * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet + * ecryptfs_parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet * @filename: This function kmalloc's the memory for the filename * @filename_size: This function sets this to the amount of memory * kmalloc'd for the filename From 724fa86291d0df2e15c3654c4b5684c64bb1d49c Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 9 Apr 2021 17:51:42 +0800 Subject: [PATCH 0494/1379] eCryptfs: Use DEFINE_MUTEX() for mutex lock mutex lock can be initialized automatically with DEFINE_MUTEX() rather than explicitly calling mutex_init(). Reported-by: Hulk Robot Signed-off-by: Ye Bin Signed-off-by: Tyler Hicks --- fs/ecryptfs/messaging.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index b38bd742fd97..6318f3500e5c 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -14,10 +14,10 @@ static LIST_HEAD(ecryptfs_msg_ctx_free_list); static LIST_HEAD(ecryptfs_msg_ctx_alloc_list); -static struct mutex ecryptfs_msg_ctx_lists_mux; +static DEFINE_MUTEX(ecryptfs_msg_ctx_lists_mux); static struct hlist_head *ecryptfs_daemon_hash; -struct mutex ecryptfs_daemon_hash_mux; +DEFINE_MUTEX(ecryptfs_daemon_hash_mux); static int ecryptfs_hash_bits; #define ecryptfs_current_euid_hash(uid) \ hash_long((unsigned long)from_kuid(&init_user_ns, current_euid()), ecryptfs_hash_bits) @@ -361,7 +361,6 @@ int __init ecryptfs_init_messaging(void) "too large, defaulting to [%d] users\n", __func__, ecryptfs_number_of_users); } - mutex_init(&ecryptfs_daemon_hash_mux); mutex_lock(&ecryptfs_daemon_hash_mux); ecryptfs_hash_bits = 1; while (ecryptfs_number_of_users >> ecryptfs_hash_bits) @@ -385,7 +384,6 @@ int __init ecryptfs_init_messaging(void) rc = -ENOMEM; goto out; } - mutex_init(&ecryptfs_msg_ctx_lists_mux); mutex_lock(&ecryptfs_msg_ctx_lists_mux); ecryptfs_msg_counter = 0; for (i = 0; i < ecryptfs_message_buf_len; i++) { From 1abbe1106d48ab0ee4980f8afb04ae2c71bbf3cc Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Wed, 24 Feb 2021 12:30:59 +0100 Subject: [PATCH 0495/1379] ecryptfs: Fix typo in message ecryptfs_decrypt_page() issues a warning "Error encrypting extent". This should be "Error decrypting extent" instead. Fixes: 0216f7f79217 ("eCryptfs: replace encrypt, decrypt, and inode size write") Signed-off-by: Sascha Hauer Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 147ccb5e9190..345f8061e3b4 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -535,7 +535,7 @@ int ecryptfs_decrypt_page(struct page *page) rc = crypt_extent(crypt_stat, page, page, extent_offset, DECRYPT); if (rc) { - printk(KERN_ERR "%s: Error encrypting extent; " + printk(KERN_ERR "%s: Error decrypting extent; " "rc = [%d]\n", __func__, rc); goto out; } From 7f06ecd3afb0f976a324d5d8505242c67e4b4719 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Apr 2021 18:24:20 +0200 Subject: [PATCH 0496/1379] ecryptfs: remove unused helpers Remove two helpers that are unused. Cc: Amir Goldstein Cc: Tyler Hicks Cc: ecryptfs@vger.kernel.org Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner Signed-off-by: Tyler Hicks --- fs/ecryptfs/ecryptfs_kernel.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 4a62a7dcc4f5..c8687c6384f5 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -496,12 +496,6 @@ ecryptfs_set_superblock_lower(struct super_block *sb, ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb; } -static inline struct ecryptfs_dentry_info * -ecryptfs_dentry_to_private(struct dentry *dentry) -{ - return (struct ecryptfs_dentry_info *)dentry->d_fsdata; -} - static inline void ecryptfs_set_dentry_private(struct dentry *dentry, struct ecryptfs_dentry_info *dentry_info) @@ -515,12 +509,6 @@ ecryptfs_dentry_to_lower(struct dentry *dentry) return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; } -static inline struct vfsmount * -ecryptfs_dentry_to_lower_mnt(struct dentry *dentry) -{ - return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt; -} - static inline struct path * ecryptfs_dentry_to_lower_path(struct dentry *dentry) { From 9046625511ad8dfbc8c6c2de16b3532c43d68d48 Mon Sep 17 00:00:00 2001 From: Jeffrey Mitchell Date: Fri, 26 Feb 2021 15:00:23 -0600 Subject: [PATCH 0497/1379] ecryptfs: fix kernel panic with null dev_name When mounting eCryptfs, a null "dev_name" argument to ecryptfs_mount() causes a kernel panic if the parsed options are valid. The easiest way to reproduce this is to call mount() from userspace with an existing eCryptfs mount's options and a "source" argument of 0. Error out if "dev_name" is null in ecryptfs_mount() Fixes: 237fead61998 ("[PATCH] ecryptfs: fs/Makefile and fs/Kconfig") Cc: stable@vger.kernel.org Signed-off-by: Jeffrey Mitchell Signed-off-by: Tyler Hicks --- fs/ecryptfs/main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 77b96737b1ff..d66bbd2df191 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -492,6 +492,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } + if (!dev_name) { + rc = -EINVAL; + err = "Device name cannot be null"; + goto out; + } + rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; From a7b4e506dcc461c214734d03816c1d47bd88c9a3 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Mon, 19 Apr 2021 10:20:03 +0800 Subject: [PATCH 0498/1379] f2fs: remove unnecessary struct declaration struct dnode_of_data is defined at 897th line. The declaration here is unnecessary. Remove it. Signed-off-by: Wan Jiabing Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 87d734f5589d..984a2a546745 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3311,7 +3311,6 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); /* * node.c */ -struct dnode_of_data; struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); From f9b60e2209213fdfcc504ba25a404977c5d08b77 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Apr 2021 14:00:15 -0400 Subject: [PATCH 0499/1379] nfsd: hash nfs4_files by inode number The nfs4_file structure is per-filehandle, not per-inode, because the spec requires open and other state to be per filehandle. But it will turn out to be convenient for nfs4_files associated with the same inode to be hashed to the same bucket, so let's hash on the inode instead of the filehandle. Filehandle aliasing is rare, so that shouldn't have much performance impact. (If you have a ton of exported filesystems, though, and all of them have a root with inode number 2, could that get you an overlong hash chain? Perhaps this (and the v4 open file cache) should be hashed on the inode pointer instead.) Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 27 ++++++++++++--------------- fs/nfsd/state.h | 1 - 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 403c69efdbea..442034633ee8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -543,14 +543,12 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername) #define FILE_HASH_BITS 8 #define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh) +static unsigned int file_hashval(struct svc_fh *fh) { - return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0); -} + struct inode *inode = d_inode(fh->fh_dentry); -static unsigned int file_hashval(struct knfsd_fh *fh) -{ - return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); + /* XXX: why not (here & in file cache) use inode? */ + return (unsigned int)hash_long(inode->i_ino, FILE_HASH_BITS); } static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; @@ -4072,7 +4070,7 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, +static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, struct nfs4_file *fp) { lockdep_assert_held(&state_lock); @@ -4082,7 +4080,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); INIT_LIST_HEAD(&fp->fi_clnt_odstate); - fh_copy_shallow(&fp->fi_fhandle, fh); + fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle); fp->fi_deleg_file = NULL; fp->fi_had_conflict = false; fp->fi_share_deny = 0; @@ -4426,13 +4424,13 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) /* search file_hashtbl[] for file */ static struct nfs4_file * -find_file_locked(struct knfsd_fh *fh, unsigned int hashval) +find_file_locked(struct svc_fh *fh, unsigned int hashval) { struct nfs4_file *fp; hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, lockdep_is_held(&state_lock)) { - if (fh_match(&fp->fi_fhandle, fh)) { + if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) { if (refcount_inc_not_zero(&fp->fi_ref)) return fp; } @@ -4440,8 +4438,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval) return NULL; } -struct nfs4_file * -find_file(struct knfsd_fh *fh) +static struct nfs4_file * find_file(struct svc_fh *fh) { struct nfs4_file *fp; unsigned int hashval = file_hashval(fh); @@ -4453,7 +4450,7 @@ find_file(struct knfsd_fh *fh) } static struct nfs4_file * -find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh) +find_or_add_file(struct nfs4_file *new, struct svc_fh *fh) { struct nfs4_file *fp; unsigned int hashval = file_hashval(fh); @@ -4485,7 +4482,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) struct nfs4_file *fp; __be32 ret = nfs_ok; - fp = find_file(¤t_fh->fh_handle); + fp = find_file(current_fh); if (!fp) return ret; /* Check for conflicting share reservations */ @@ -5166,7 +5163,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf * and check for delegations in the process of being recalled. * If not found, create the nfs4_file struct */ - fp = find_or_add_file(open->op_file, ¤t_fh->fh_handle); + fp = find_or_add_file(open->op_file, current_fh); if (fp != open->op_file) { status = nfs4_check_deleg(cl, open, &dp); if (status) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 54cab651ac1d..61a2d95d7923 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -669,7 +669,6 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); -struct nfs4_file *find_file(struct knfsd_fh *fh); void put_nfs4_file(struct nfs4_file *fi); extern void nfs4_put_copy(struct nfsd4_copy *copy); extern struct nfsd4_copy * From a0ce48375a367222989c2618fe68bf34db8c7bb7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Apr 2021 14:00:16 -0400 Subject: [PATCH 0500/1379] nfsd: track filehandle aliasing in nfs4_files It's unusual but possible for multiple filehandles to point to the same file. In that case, we may end up with multiple nfs4_files referencing the same inode. For delegation purposes it will turn out to be useful to flag those cases. Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 37 ++++++++++++++++++++++++++++--------- fs/nfsd/state.h | 2 ++ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 442034633ee8..b12f530f6449 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4086,6 +4086,8 @@ static void nfsd4_init_file(struct svc_fh *fh, unsigned int hashval, fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); + fp->fi_aliased = false; + fp->fi_inode = d_inode(fh->fh_dentry); #ifdef CONFIG_NFSD_PNFS INIT_LIST_HEAD(&fp->fi_lo_states); atomic_set(&fp->fi_lo_recalls, 0); @@ -4438,6 +4440,31 @@ find_file_locked(struct svc_fh *fh, unsigned int hashval) return NULL; } +static struct nfs4_file *insert_file(struct nfs4_file *new, struct svc_fh *fh, + unsigned int hashval) +{ + struct nfs4_file *fp; + struct nfs4_file *ret = NULL; + bool alias_found = false; + + spin_lock(&state_lock); + hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, + lockdep_is_held(&state_lock)) { + if (fh_match(&fp->fi_fhandle, &fh->fh_handle)) { + if (refcount_inc_not_zero(&fp->fi_ref)) + ret = fp; + } else if (d_inode(fh->fh_dentry) == fp->fi_inode) + fp->fi_aliased = alias_found = true; + } + if (likely(ret == NULL)) { + nfsd4_init_file(fh, hashval, new); + new->fi_aliased = alias_found; + ret = new; + } + spin_unlock(&state_lock); + return ret; +} + static struct nfs4_file * find_file(struct svc_fh *fh) { struct nfs4_file *fp; @@ -4461,15 +4488,7 @@ find_or_add_file(struct nfs4_file *new, struct svc_fh *fh) if (fp) return fp; - spin_lock(&state_lock); - fp = find_file_locked(fh, hashval); - if (likely(fp == NULL)) { - nfsd4_init_file(fh, hashval, new); - fp = new; - } - spin_unlock(&state_lock); - - return fp; + return insert_file(new, fh, hashval); } /* diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 61a2d95d7923..e73bdbb1634a 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -516,6 +516,8 @@ struct nfs4_clnt_odstate { */ struct nfs4_file { refcount_t fi_ref; + struct inode * fi_inode; + bool fi_aliased; spinlock_t fi_lock; struct hlist_node fi_hash; /* hash on fi_fhandle */ struct list_head fi_stateids; From ebd9d2c2f5a7ebaaed2d7bb4dee148755f46033d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Apr 2021 14:00:17 -0400 Subject: [PATCH 0501/1379] nfsd: reshuffle some code No change in behavior, I'm just moving some code around to avoid forward references in a following patch. (To do someday: figure out how to split up nfs4state.c. It's big and disorganized.) Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 235 ++++++++++++++++++++++---------------------- 1 file changed, 118 insertions(+), 117 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b12f530f6449..94cb5f02f4b5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -354,6 +354,124 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { .release = nfsd4_cb_notify_lock_release, }; +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. + * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. + */ +static unsigned int +bmap_to_share_mode(unsigned long bmap) +{ + int i; + unsigned int access = 0; + + for (i = 1; i < 4; i++) { + if (test_bit(i, &bmap)) + access |= i; + } + return access; +} + +/* set share access for a given stateid */ +static inline void +set_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap |= mask; +} + +/* clear share access for a given stateid */ +static inline void +clear_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); + stp->st_access_bmap &= ~mask; +} + +/* test whether a given stateid has access */ +static inline bool +test_access(u32 access, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << access; + + return (bool)(stp->st_access_bmap & mask); +} + +/* set share deny for a given stateid */ +static inline void +set_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap |= mask; +} + +/* clear share deny for a given stateid */ +static inline void +clear_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); + stp->st_deny_bmap &= ~mask; +} + +/* test whether a given stateid is denying specific access */ +static inline bool +test_deny(u32 deny, struct nfs4_ol_stateid *stp) +{ + unsigned char mask = 1 << deny; + + return (bool)(stp->st_deny_bmap & mask); +} + +static int nfs4_access_to_omode(u32 access) +{ + switch (access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_READ: + return O_RDONLY; + case NFS4_SHARE_ACCESS_WRITE: + return O_WRONLY; + case NFS4_SHARE_ACCESS_BOTH: + return O_RDWR; + } + WARN_ON_ONCE(1); + return O_RDONLY; +} + +static inline int +access_permit_read(struct nfs4_ol_stateid *stp) +{ + return test_access(NFS4_SHARE_ACCESS_READ, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp) || + test_access(NFS4_SHARE_ACCESS_WRITE, stp); +} + +static inline int +access_permit_write(struct nfs4_ol_stateid *stp) +{ + return test_access(NFS4_SHARE_ACCESS_WRITE, stp) || + test_access(NFS4_SHARE_ACCESS_BOTH, stp); +} + static inline struct nfs4_stateowner * nfs4_get_stateowner(struct nfs4_stateowner *sop) { @@ -1150,108 +1268,6 @@ static unsigned int clientstr_hashval(struct xdr_netobj name) return opaque_hashval(name.data, 8) & CLIENT_HASH_MASK; } -/* - * We store the NONE, READ, WRITE, and BOTH bits separately in the - * st_{access,deny}_bmap field of the stateid, in order to track not - * only what share bits are currently in force, but also what - * combinations of share bits previous opens have used. This allows us - * to enforce the recommendation of rfc 3530 14.2.19 that the server - * return an error if the client attempt to downgrade to a combination - * of share bits not explicable by closing some of its previous opens. - * - * XXX: This enforcement is actually incomplete, since we don't keep - * track of access/deny bit combinations; so, e.g., we allow: - * - * OPEN allow read, deny write - * OPEN allow both, deny none - * DOWNGRADE allow read, deny none - * - * which we should reject. - */ -static unsigned int -bmap_to_share_mode(unsigned long bmap) { - int i; - unsigned int access = 0; - - for (i = 1; i < 4; i++) { - if (test_bit(i, &bmap)) - access |= i; - } - return access; -} - -/* set share access for a given stateid */ -static inline void -set_access(u32 access, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << access; - - WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); - stp->st_access_bmap |= mask; -} - -/* clear share access for a given stateid */ -static inline void -clear_access(u32 access, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << access; - - WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH); - stp->st_access_bmap &= ~mask; -} - -/* test whether a given stateid has access */ -static inline bool -test_access(u32 access, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << access; - - return (bool)(stp->st_access_bmap & mask); -} - -/* set share deny for a given stateid */ -static inline void -set_deny(u32 deny, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << deny; - - WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); - stp->st_deny_bmap |= mask; -} - -/* clear share deny for a given stateid */ -static inline void -clear_deny(u32 deny, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << deny; - - WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH); - stp->st_deny_bmap &= ~mask; -} - -/* test whether a given stateid is denying specific access */ -static inline bool -test_deny(u32 deny, struct nfs4_ol_stateid *stp) -{ - unsigned char mask = 1 << deny; - - return (bool)(stp->st_deny_bmap & mask); -} - -static int nfs4_access_to_omode(u32 access) -{ - switch (access & NFS4_SHARE_ACCESS_BOTH) { - case NFS4_SHARE_ACCESS_READ: - return O_RDONLY; - case NFS4_SHARE_ACCESS_WRITE: - return O_WRONLY; - case NFS4_SHARE_ACCESS_BOTH: - return O_RDWR; - } - WARN_ON_ONCE(1); - return O_RDONLY; -} - /* * A stateid that had a deny mode associated with it is being released * or downgraded. Recalculate the deny mode on the file. @@ -5523,21 +5539,6 @@ static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) return nfs_ok; } -static inline int -access_permit_read(struct nfs4_ol_stateid *stp) -{ - return test_access(NFS4_SHARE_ACCESS_READ, stp) || - test_access(NFS4_SHARE_ACCESS_BOTH, stp) || - test_access(NFS4_SHARE_ACCESS_WRITE, stp); -} - -static inline int -access_permit_write(struct nfs4_ol_stateid *stp) -{ - return test_access(NFS4_SHARE_ACCESS_WRITE, stp) || - test_access(NFS4_SHARE_ACCESS_BOTH, stp); -} - static __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags) { From aba2072f452346d56a462718bcde93d697383148 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 16 Apr 2021 14:00:18 -0400 Subject: [PATCH 0502/1379] nfsd: grant read delegations to clients holding writes It's OK to grant a read delegation to a client that holds a write, as long as it's the only client holding the write. We originally tried to do this in commit 94415b06eb8a ("nfsd4: a client's own opens needn't prevent delegations"), which had to be reverted in commit 6ee65a773096 ("Revert "nfsd4: a client's own opens needn't prevent delegations""). Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/locks.c | 3 ++ fs/nfsd/nfs4state.c | 82 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 71 insertions(+), 14 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 6125d2de39b8..bcc71c469ede 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1808,6 +1808,9 @@ check_conflicting_open(struct file *filp, const long arg, int flags) if (flags & FL_LAYOUT) return 0; + if (flags & FL_DELEG) + /* We leave these checks to the caller */ + return 0; if (arg == F_RDLCK) return inode_is_open_for_write(inode) ? -EAGAIN : 0; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 94cb5f02f4b5..7364af580d83 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4988,6 +4988,65 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, return fl; } +static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, + struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *st; + struct file *f = fp->fi_deleg_file->nf_file; + struct inode *ino = locks_inode(f); + int writes; + + writes = atomic_read(&ino->i_writecount); + if (!writes) + return 0; + /* + * There could be multiple filehandles (hence multiple + * nfs4_files) referencing this file, but that's not too + * common; let's just give up in that case rather than + * trying to go look up all the clients using that other + * nfs4_file as well: + */ + if (fp->fi_aliased) + return -EAGAIN; + /* + * If there's a close in progress, make sure that we see it + * clear any fi_fds[] entries before we see it decrement + * i_writecount: + */ + smp_mb__after_atomic(); + + if (fp->fi_fds[O_WRONLY]) + writes--; + if (fp->fi_fds[O_RDWR]) + writes--; + if (writes > 0) + return -EAGAIN; /* There may be non-NFSv4 writers */ + /* + * It's possible there are non-NFSv4 write opens in progress, + * but if they haven't incremented i_writecount yet then they + * also haven't called break lease yet; so, they'll break this + * lease soon enough. So, all that's left to check for is NFSv4 + * opens: + */ + spin_lock(&fp->fi_lock); + list_for_each_entry(st, &fp->fi_stateids, st_perfile) { + if (st->st_openstp == NULL /* it's an open */ && + access_permit_write(st) && + st->st_stid.sc_client != clp) { + spin_unlock(&fp->fi_lock); + return -EAGAIN; + } + } + spin_unlock(&fp->fi_lock); + /* + * There's a small chance that we could be racing with another + * NFSv4 open. However, any open that hasn't added itself to + * the fi_stateids list also hasn't called break_lease yet; so, + * they'll break this lease soon enough. + */ + return 0; +} + static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) @@ -5007,9 +5066,12 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, nf = find_readable_file(fp); if (!nf) { - /* We should always have a readable file here */ - WARN_ON_ONCE(1); - return ERR_PTR(-EBADF); + /* + * We probably could attempt another open and get a read + * delegation, but for now, don't bother until the + * client actually sends us one. + */ + return ERR_PTR(-EAGAIN); } spin_lock(&state_lock); spin_lock(&fp->fi_lock); @@ -5044,6 +5106,9 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, locks_free_lock(fl); if (status) goto out_clnt_odstate; + status = nfsd4_check_conflicting_opens(clp, fp); + if (status) + goto out_unlock; spin_lock(&state_lock); spin_lock(&fp->fi_lock); @@ -5125,17 +5190,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; - /* - * Also, if the file was opened for write or - * create, there's a good chance the client's - * about to write to it, resulting in an - * immediate recall (since we don't support - * write delegations): - */ - if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - goto out_no_deleg; - if (open->op_create == NFS4_OPEN_CREATE) - goto out_no_deleg; break; default: goto out_no_deleg; From 4f192ac00a1ba11e5137b7d901cc9384fadf2bf9 Mon Sep 17 00:00:00 2001 From: Zhang Yunkai Date: Wed, 3 Mar 2021 00:04:10 -0800 Subject: [PATCH 0503/1379] csky: Remove duplicate include in arch/csky/kernel/entry.S 'asm/setup.h' included in 'arch/csky/kernel/entry.S' is duplicated. Signed-off-by: Zhang Yunkai Signed-off-by: Guo Ren --- arch/csky/kernel/entry.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/csky/kernel/entry.S b/arch/csky/kernel/entry.S index c1bd7a6b4ab6..00e3c8ebf9b8 100644 --- a/arch/csky/kernel/entry.S +++ b/arch/csky/kernel/entry.S @@ -9,7 +9,6 @@ #include #include #include -#include #include #include From 0b1f557a1fa02174a982f557581e348d91987ec6 Mon Sep 17 00:00:00 2001 From: Junlin Yang Date: Sat, 6 Mar 2021 11:23:14 +0800 Subject: [PATCH 0504/1379] csky: Fixup typos fixes three typos found by codespell. Signed-off-by: Junlin Yang Signed-off-by: Guo Ren --- arch/csky/include/asm/asid.h | 2 +- arch/csky/include/asm/barrier.h | 2 +- arch/csky/include/asm/vdso.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/csky/include/asm/asid.h b/arch/csky/include/asm/asid.h index ac08b0ffbe1f..6ff205a97a14 100644 --- a/arch/csky/include/asm/asid.h +++ b/arch/csky/include/asm/asid.h @@ -37,7 +37,7 @@ void asid_new_context(struct asid_info *info, atomic64_t *pasid, * Check the ASID is still valid for the context. If not generate a new ASID. * * @pasid: Pointer to the current ASID batch - * @cpu: current CPU ID. Must have been acquired throught get_cpu() + * @cpu: current CPU ID. Must have been acquired through get_cpu() */ static inline void asid_check_context(struct asid_info *info, atomic64_t *pasid, unsigned int cpu, diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h index 84fc600c8b45..f4045dd53e17 100644 --- a/arch/csky/include/asm/barrier.h +++ b/arch/csky/include/asm/barrier.h @@ -64,7 +64,7 @@ /* * sync: completion barrier, all sync.xx instructions - * guarantee the last response recieved by bus transaction + * guarantee the last response received by bus transaction * made by ld/st instructions before sync.s * sync.s: inherit from sync, but also shareable to other cores * sync.i: inherit from sync, but also flush cpu pipeline diff --git a/arch/csky/include/asm/vdso.h b/arch/csky/include/asm/vdso.h index eb5142f9c564..bdce581b5fcb 100644 --- a/arch/csky/include/asm/vdso.h +++ b/arch/csky/include/asm/vdso.h @@ -16,7 +16,7 @@ struct vdso_data { * offset of 0, but since the linker must support setting weak undefined * symbols to the absolute address 0 it also happens to support other low * addresses even when the code model suggests those low addresses would not - * otherwise be availiable. + * otherwise be available. */ #define VDSO_SYMBOL(base, name) \ ({ \ From a2149ab815fce21d0d83082818116519e44f87be Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 19 Mar 2021 23:08:02 +0100 Subject: [PATCH 0505/1379] thermal/drivers/qcom/tsens-v0_1: Add support for MDM9607 MDM9607 TSENS IP is very similar to the one of MSM8916, with minor adjustments to various tuning values. Signed-off-by: Konrad Dybcio Acked-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210319220802.198215-2-konrad.dybcio@somainline.org --- drivers/thermal/qcom/tsens-v0_1.c | 98 ++++++++++++++++++++++++++++++- drivers/thermal/qcom/tsens.c | 3 + drivers/thermal/qcom/tsens.h | 2 +- 3 files changed, 101 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/qcom/tsens-v0_1.c b/drivers/thermal/qcom/tsens-v0_1.c index 4ffa2e2c0145..f136cb350238 100644 --- a/drivers/thermal/qcom/tsens-v0_1.c +++ b/drivers/thermal/qcom/tsens-v0_1.c @@ -190,6 +190,39 @@ #define BIT_APPEND 0x3 +/* eeprom layout data for mdm9607 */ +#define MDM9607_BASE0_MASK 0x000000ff +#define MDM9607_BASE1_MASK 0x000ff000 +#define MDM9607_BASE0_SHIFT 0 +#define MDM9607_BASE1_SHIFT 12 + +#define MDM9607_S0_P1_MASK 0x00003f00 +#define MDM9607_S1_P1_MASK 0x03f00000 +#define MDM9607_S2_P1_MASK 0x0000003f +#define MDM9607_S3_P1_MASK 0x0003f000 +#define MDM9607_S4_P1_MASK 0x0000003f + +#define MDM9607_S0_P2_MASK 0x000fc000 +#define MDM9607_S1_P2_MASK 0xfc000000 +#define MDM9607_S2_P2_MASK 0x00000fc0 +#define MDM9607_S3_P2_MASK 0x00fc0000 +#define MDM9607_S4_P2_MASK 0x00000fc0 + +#define MDM9607_S0_P1_SHIFT 8 +#define MDM9607_S1_P1_SHIFT 20 +#define MDM9607_S2_P1_SHIFT 0 +#define MDM9607_S3_P1_SHIFT 12 +#define MDM9607_S4_P1_SHIFT 0 + +#define MDM9607_S0_P2_SHIFT 14 +#define MDM9607_S1_P2_SHIFT 26 +#define MDM9607_S2_P2_SHIFT 6 +#define MDM9607_S3_P2_SHIFT 18 +#define MDM9607_S4_P2_SHIFT 6 + +#define MDM9607_CAL_SEL_MASK 0x00700000 +#define MDM9607_CAL_SEL_SHIFT 20 + static int calibrate_8916(struct tsens_priv *priv) { int base0 = 0, base1 = 0, i; @@ -452,7 +485,56 @@ static int calibrate_8974(struct tsens_priv *priv) return 0; } -/* v0.1: 8916, 8939, 8974 */ +static int calibrate_9607(struct tsens_priv *priv) +{ + int base, i; + u32 p1[5], p2[5]; + int mode = 0; + u32 *qfprom_cdata; + + qfprom_cdata = (u32 *)qfprom_read(priv->dev, "calib"); + if (IS_ERR(qfprom_cdata)) + return PTR_ERR(qfprom_cdata); + + mode = (qfprom_cdata[2] & MDM9607_CAL_SEL_MASK) >> MDM9607_CAL_SEL_SHIFT; + dev_dbg(priv->dev, "calibration mode is %d\n", mode); + + switch (mode) { + case TWO_PT_CALIB: + base = (qfprom_cdata[2] & MDM9607_BASE1_MASK) >> MDM9607_BASE1_SHIFT; + p2[0] = (qfprom_cdata[0] & MDM9607_S0_P2_MASK) >> MDM9607_S0_P2_SHIFT; + p2[1] = (qfprom_cdata[0] & MDM9607_S1_P2_MASK) >> MDM9607_S1_P2_SHIFT; + p2[2] = (qfprom_cdata[1] & MDM9607_S2_P2_MASK) >> MDM9607_S2_P2_SHIFT; + p2[3] = (qfprom_cdata[1] & MDM9607_S3_P2_MASK) >> MDM9607_S3_P2_SHIFT; + p2[4] = (qfprom_cdata[2] & MDM9607_S4_P2_MASK) >> MDM9607_S4_P2_SHIFT; + for (i = 0; i < priv->num_sensors; i++) + p2[i] = ((base + p2[i]) << 2); + fallthrough; + case ONE_PT_CALIB2: + base = (qfprom_cdata[0] & MDM9607_BASE0_MASK); + p1[0] = (qfprom_cdata[0] & MDM9607_S0_P1_MASK) >> MDM9607_S0_P1_SHIFT; + p1[1] = (qfprom_cdata[0] & MDM9607_S1_P1_MASK) >> MDM9607_S1_P1_SHIFT; + p1[2] = (qfprom_cdata[1] & MDM9607_S2_P1_MASK) >> MDM9607_S2_P1_SHIFT; + p1[3] = (qfprom_cdata[1] & MDM9607_S3_P1_MASK) >> MDM9607_S3_P1_SHIFT; + p1[4] = (qfprom_cdata[2] & MDM9607_S4_P1_MASK) >> MDM9607_S4_P1_SHIFT; + for (i = 0; i < priv->num_sensors; i++) + p1[i] = ((base + p1[i]) << 2); + break; + default: + for (i = 0; i < priv->num_sensors; i++) { + p1[i] = 500; + p2[i] = 780; + } + break; + } + + compute_intercept_slope(priv, p1, p2, mode); + kfree(qfprom_cdata); + + return 0; +} + +/* v0.1: 8916, 8939, 8974, 9607 */ static struct tsens_features tsens_v0_1_feat = { .ver_major = VER_0_1, @@ -540,3 +622,17 @@ struct tsens_plat_data data_8974 = { .feat = &tsens_v0_1_feat, .fields = tsens_v0_1_regfields, }; + +static const struct tsens_ops ops_9607 = { + .init = init_common, + .calibrate = calibrate_9607, + .get_temp = get_temp_common, +}; + +struct tsens_plat_data data_9607 = { + .num_sensors = 5, + .ops = &ops_9607, + .hw_ids = (unsigned int []){ 0, 1, 2, 3, 4 }, + .feat = &tsens_v0_1_feat, + .fields = tsens_v0_1_regfields, +}; diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 3c4c0516e58a..a5da6a74a465 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -897,6 +897,9 @@ static SIMPLE_DEV_PM_OPS(tsens_pm_ops, tsens_suspend, tsens_resume); static const struct of_device_id tsens_table[] = { { + .compatible = "qcom,mdm9607-tsens", + .data = &data_9607, + }, { .compatible = "qcom,msm8916-tsens", .data = &data_8916, }, { diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index f40b625f897e..cba64c33b4f9 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -585,7 +585,7 @@ int get_temp_common(const struct tsens_sensor *s, int *temp); extern struct tsens_plat_data data_8960; /* TSENS v0.1 targets */ -extern struct tsens_plat_data data_8916, data_8939, data_8974; +extern struct tsens_plat_data data_8916, data_8939, data_8974, data_9607; /* TSENS v1 targets */ extern struct tsens_plat_data data_tsens_v1, data_8976; From 8cd7ab2a1a393f37f2e2f4b3ff595d98c245b854 Mon Sep 17 00:00:00 2001 From: Ruiqi Gong Date: Thu, 8 Apr 2021 06:01:44 -0400 Subject: [PATCH 0506/1379] thermal/drivers/thermal_mmio: Remove redundant dev_err call in thermal_mmio_probe() There is a error message within devm_ioremap_resource already, so remove the dev_err call to avoid redundant error message. Reported-by: Hulk Robot Signed-off-by: Ruiqi Gong Acked-by: Talel Shenhar Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210408100144.7494-1-gongruiqi1@huawei.com --- drivers/thermal/thermal_mmio.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/thermal/thermal_mmio.c b/drivers/thermal/thermal_mmio.c index d0bdf1ea3331..ded1dd0d4ef7 100644 --- a/drivers/thermal/thermal_mmio.c +++ b/drivers/thermal/thermal_mmio.c @@ -54,11 +54,8 @@ static int thermal_mmio_probe(struct platform_device *pdev) resource = platform_get_resource(pdev, IORESOURCE_MEM, 0); sensor->mmio_base = devm_ioremap_resource(&pdev->dev, resource); - if (IS_ERR(sensor->mmio_base)) { - dev_err(&pdev->dev, "failed to ioremap memory (%ld)\n", - PTR_ERR(sensor->mmio_base)); + if (IS_ERR(sensor->mmio_base)) return PTR_ERR(sensor->mmio_base); - } sensor_init_func = device_get_match_data(&pdev->dev); if (sensor_init_func) { From d1ab7c3a33d27e7b63fd6207d88852561072ae36 Mon Sep 17 00:00:00 2001 From: Ruiqi Gong Date: Thu, 8 Apr 2021 06:03:29 -0400 Subject: [PATCH 0507/1379] thermal/drivers/bcm2835: Remove redundant dev_err call in bcm2835_thermal_probe() There is a error message within devm_ioremap_resource already, so remove the dev_err call to avoid redundant error message. Reported-by: Hulk Robot Signed-off-by: Ruiqi Gong Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210408100329.7585-1-gongruiqi1@huawei.com --- drivers/thermal/broadcom/bcm2835_thermal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/thermal/broadcom/bcm2835_thermal.c b/drivers/thermal/broadcom/bcm2835_thermal.c index 3199977f1e73..c8e4344d5a3d 100644 --- a/drivers/thermal/broadcom/bcm2835_thermal.c +++ b/drivers/thermal/broadcom/bcm2835_thermal.c @@ -184,7 +184,6 @@ static int bcm2835_thermal_probe(struct platform_device *pdev) data->regs = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(data->regs)) { err = PTR_ERR(data->regs); - dev_err(&pdev->dev, "Could not get registers: %d\n", err); return err; } From 2eb87d75f980bcc7c2bd370661f8fcc4ec273ea5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 12 Apr 2021 20:59:01 +0800 Subject: [PATCH 0508/1379] thermal/drivers/intel: Introduce tcc cooling driver On Intel processors, the core frequency can be reduced below OS request, when the current temperature reaches the TCC (Thermal Control Circuit) activation temperature. The default TCC activation temperature is specified by MSR_IA32_TEMPERATURE_TARGET. However, it can be adjusted by specifying an offset in degrees C, using the TCC Offset bits in the same MSR register. This patch introduces a cooling devices driver that utilizes the TCC Offset feature. The bigger the current cooling state is, the lower the effective TCC activation temperature is, so that the processors can be throttled earlier before system critical overheats. Note that, on different platforms, the behavior might be different on how fast the setting takes effect, and how much the CPU frequency is reduced. This patch has been tested on a KabyLake mobile platform from me, and also on a CometLake platform from Doug. Signed-off-by: Zhang Rui Tested by: Doug Smythies Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210412125901.12549-1-rui.zhang@intel.com --- drivers/thermal/intel/Kconfig | 11 ++ drivers/thermal/intel/Makefile | 1 + drivers/thermal/intel/intel_tcc_cooling.c | 129 ++++++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 drivers/thermal/intel/intel_tcc_cooling.c diff --git a/drivers/thermal/intel/Kconfig b/drivers/thermal/intel/Kconfig index ce4f59213c7a..e4299ca3423c 100644 --- a/drivers/thermal/intel/Kconfig +++ b/drivers/thermal/intel/Kconfig @@ -79,3 +79,14 @@ config INTEL_PCH_THERMAL Enable this to support thermal reporting on certain intel PCHs. Thermal reporting device will provide temperature reading, programmable trip points and other information. + +config INTEL_TCC_COOLING + tristate "Intel TCC offset cooling Driver" + depends on X86 + help + Enable this to support system cooling by adjusting the effective TCC + activation temperature via the TCC Offset register, which is widely + supported on modern Intel platforms. + Note that, on different platforms, the behavior might be different + on how fast the setting takes effect, and how much the CPU frequency + is reduced. diff --git a/drivers/thermal/intel/Makefile b/drivers/thermal/intel/Makefile index ff2ad30ef397..5ff2afa388f7 100644 --- a/drivers/thermal/intel/Makefile +++ b/drivers/thermal/intel/Makefile @@ -10,4 +10,5 @@ obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL) += intel_quark_dts_thermal.o obj-$(CONFIG_INT340X_THERMAL) += int340x_thermal/ obj-$(CONFIG_INTEL_BXT_PMIC_THERMAL) += intel_bxt_pmic_thermal.o obj-$(CONFIG_INTEL_PCH_THERMAL) += intel_pch_thermal.o +obj-$(CONFIG_INTEL_TCC_COOLING) += intel_tcc_cooling.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o diff --git a/drivers/thermal/intel/intel_tcc_cooling.c b/drivers/thermal/intel/intel_tcc_cooling.c new file mode 100644 index 000000000000..8ec10d55d421 --- /dev/null +++ b/drivers/thermal/intel/intel_tcc_cooling.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * cooling device driver that activates the processor throttling by + * programming the TCC Offset register. + * Copyright (c) 2021, Intel Corporation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +#define TCC_SHIFT 24 +#define TCC_MASK (0x3fULL<<24) +#define TCC_PROGRAMMABLE BIT(30) + +static struct thermal_cooling_device *tcc_cdev; + +static int tcc_get_max_state(struct thermal_cooling_device *cdev, unsigned long + *state) +{ + *state = TCC_MASK >> TCC_SHIFT; + return 0; +} + +static int tcc_offset_update(int tcc) +{ + u64 val; + int err; + + err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val); + if (err) + return err; + + val &= ~TCC_MASK; + val |= tcc << TCC_SHIFT; + + err = wrmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, val); + if (err) + return err; + + return 0; +} + +static int tcc_get_cur_state(struct thermal_cooling_device *cdev, unsigned long + *state) +{ + u64 val; + int err; + + err = rdmsrl_safe(MSR_IA32_TEMPERATURE_TARGET, &val); + if (err) + return err; + + *state = (val & TCC_MASK) >> TCC_SHIFT; + return 0; +} + +static int tcc_set_cur_state(struct thermal_cooling_device *cdev, unsigned long + state) +{ + return tcc_offset_update(state); +} + +static const struct thermal_cooling_device_ops tcc_cooling_ops = { + .get_max_state = tcc_get_max_state, + .get_cur_state = tcc_get_cur_state, + .set_cur_state = tcc_set_cur_state, +}; + +static const struct x86_cpu_id tcc_ids[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, NULL), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, NULL), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, NULL), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, NULL), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, NULL), + {} +}; + +MODULE_DEVICE_TABLE(x86cpu, tcc_ids); + +static int __init tcc_cooling_init(void) +{ + int ret; + u64 val; + const struct x86_cpu_id *id; + + int err; + + id = x86_match_cpu(tcc_ids); + if (!id) + return -ENODEV; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, &val); + if (err) + return err; + + if (!(val & TCC_PROGRAMMABLE)) + return -ENODEV; + + pr_info("Programmable TCC Offset detected\n"); + + tcc_cdev = + thermal_cooling_device_register("TCC Offset", NULL, + &tcc_cooling_ops); + if (IS_ERR(tcc_cdev)) { + ret = PTR_ERR(tcc_cdev); + return ret; + } + return 0; +} + +module_init(tcc_cooling_init) + +static void __exit tcc_cooling_exit(void) +{ + thermal_cooling_device_unregister(tcc_cdev); +} + +module_exit(tcc_cooling_exit) + +MODULE_DESCRIPTION("TCC offset cooling device Driver"); +MODULE_AUTHOR("Zhang Rui "); +MODULE_LICENSE("GPL v2"); From beaa41029fdea9d3e01af3a1a800538542d30869 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 9 Apr 2021 15:52:24 +0800 Subject: [PATCH 0509/1379] thermal/drivers/hisi: Remove redundant dev_err call in hisi_thermal_probe() There is a error message within devm_ioremap_resource already, so remove the dev_err call to avoid redundant error message. Reported-by: Hulk Robot Signed-off-by: Ye Bin Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210409075224.2109503-1-yebin10@huawei.com --- drivers/thermal/hisi_thermal.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c index d902db9f267f..9a21ac0ceb11 100644 --- a/drivers/thermal/hisi_thermal.c +++ b/drivers/thermal/hisi_thermal.c @@ -572,10 +572,8 @@ static int hisi_thermal_probe(struct platform_device *pdev) res = platform_get_resource(pdev, IORESOURCE_MEM, 0); data->regs = devm_ioremap_resource(dev, res); - if (IS_ERR(data->regs)) { - dev_err(dev, "failed to get io address\n"); + if (IS_ERR(data->regs)) return PTR_ERR(data->regs); - } ret = data->ops->probe(data); if (ret) From fc88f7ad763a8ef2a20f8904bd241930b7696f86 Mon Sep 17 00:00:00 2001 From: dingsenjie Date: Wed, 14 Apr 2021 14:39:43 +0800 Subject: [PATCH 0510/1379] thermal/drivers/tegra: Use devm_platform_ioremap_resource_byname Use the devm_platform_ioremap_resource_byname() helper instead of calling platform_get_resource_byname() and devm_ioremap_resource() separately. Signed-off-by: dingsenjie Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210414063943.96244-1-dingsenjie@163.com --- drivers/thermal/tegra/soctherm.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c index 8b8fbd49679b..8e303e9d1dc0 100644 --- a/drivers/thermal/tegra/soctherm.c +++ b/drivers/thermal/tegra/soctherm.c @@ -2118,7 +2118,6 @@ static int tegra_soctherm_probe(struct platform_device *pdev) struct tegra_soctherm *tegra; struct thermal_zone_device *z; struct tsensor_shared_calib shared_calib; - struct resource *res; struct tegra_soctherm_soc *soc; unsigned int i; int err; @@ -2140,26 +2139,20 @@ static int tegra_soctherm_probe(struct platform_device *pdev) tegra->soc = soc; - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, - "soctherm-reg"); - tegra->regs = devm_ioremap_resource(&pdev->dev, res); + tegra->regs = devm_platform_ioremap_resource_byname(pdev, "soctherm-reg"); if (IS_ERR(tegra->regs)) { dev_err(&pdev->dev, "can't get soctherm registers"); return PTR_ERR(tegra->regs); } if (!tegra->soc->use_ccroc) { - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, - "car-reg"); - tegra->clk_regs = devm_ioremap_resource(&pdev->dev, res); + tegra->clk_regs = devm_platform_ioremap_resource_byname(pdev, "car-reg"); if (IS_ERR(tegra->clk_regs)) { dev_err(&pdev->dev, "can't get car clk registers"); return PTR_ERR(tegra->clk_regs); } } else { - res = platform_get_resource_byname(pdev, IORESOURCE_MEM, - "ccroc-reg"); - tegra->ccroc_regs = devm_ioremap_resource(&pdev->dev, res); + tegra->ccroc_regs = devm_platform_ioremap_resource_byname(pdev, "ccroc-reg"); if (IS_ERR(tegra->ccroc_regs)) { dev_err(&pdev->dev, "can't get ccroc registers"); return PTR_ERR(tegra->ccroc_regs); From 07ca255e3d49bb31f6b7398bd0b3c62c94142c19 Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Tue, 20 Apr 2021 14:17:17 +0800 Subject: [PATCH 0511/1379] dt-bindings: PCI: mediatek-gen3: Add YAML schema Add YAML schemas documentation for Gen3 PCIe controller on MediaTek SoCs. Link: https://lore.kernel.org/r/20210420061723.989-2-jianjun.wang@mediatek.com Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Ryder Lee --- .../bindings/pci/mediatek-pcie-gen3.yaml | 181 ++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml diff --git a/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml new file mode 100644 index 000000000000..e7b1f9892da4 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/mediatek-pcie-gen3.yaml @@ -0,0 +1,181 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/mediatek-pcie-gen3.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Gen3 PCIe controller on MediaTek SoCs + +maintainers: + - Jianjun Wang + +description: |+ + PCIe Gen3 MAC controller for MediaTek SoCs, it supports Gen3 speed + and compatible with Gen2, Gen1 speed. + + This PCIe controller supports up to 256 MSI vectors, the MSI hardware + block diagram is as follows: + + +-----+ + | GIC | + +-----+ + ^ + | + port->irq + | + +-+-+-+-+-+-+-+-+ + |0|1|2|3|4|5|6|7| (PCIe intc) + +-+-+-+-+-+-+-+-+ + ^ ^ ^ + | | ... | + +-------+ +------+ +-----------+ + | | | + +-+-+---+--+--+ +-+-+---+--+--+ +-+-+---+--+--+ + |0|1|...|30|31| |0|1|...|30|31| |0|1|...|30|31| (MSI sets) + +-+-+---+--+--+ +-+-+---+--+--+ +-+-+---+--+--+ + ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ + | | | | | | | | | | | | (MSI vectors) + | | | | | | | | | | | | + + (MSI SET0) (MSI SET1) ... (MSI SET7) + + With 256 MSI vectors supported, the MSI vectors are composed of 8 sets, + each set has its own address for MSI message, and supports 32 MSI vectors + to generate interrupt. + +allOf: + - $ref: /schemas/pci/pci-bus.yaml# + +properties: + compatible: + const: mediatek,mt8192-pcie + + reg: + maxItems: 1 + + reg-names: + items: + - const: pcie-mac + + interrupts: + maxItems: 1 + + ranges: + minItems: 1 + maxItems: 8 + + resets: + minItems: 1 + maxItems: 2 + + reset-names: + minItems: 1 + maxItems: 2 + items: + - const: phy + - const: mac + + clocks: + maxItems: 6 + + clock-names: + items: + - const: pl_250m + - const: tl_26m + - const: tl_96m + - const: tl_32k + - const: peri_26m + - const: top_133m + + assigned-clocks: + maxItems: 1 + + assigned-clock-parents: + maxItems: 1 + + phys: + maxItems: 1 + + '#interrupt-cells': + const: 1 + + interrupt-controller: + description: Interrupt controller node for handling legacy PCI interrupts. + type: object + properties: + '#address-cells': + const: 0 + '#interrupt-cells': + const: 1 + interrupt-controller: true + + required: + - '#address-cells' + - '#interrupt-cells' + - interrupt-controller + + additionalProperties: false + +required: + - compatible + - reg + - reg-names + - interrupts + - ranges + - clocks + - '#interrupt-cells' + - interrupt-controller + +unevaluatedProperties: false + +examples: + - | + #include + #include + + bus { + #address-cells = <2>; + #size-cells = <2>; + + pcie: pcie@11230000 { + compatible = "mediatek,mt8192-pcie"; + device_type = "pci"; + #address-cells = <3>; + #size-cells = <2>; + reg = <0x00 0x11230000 0x00 0x4000>; + reg-names = "pcie-mac"; + interrupts = ; + bus-range = <0x00 0xff>; + ranges = <0x82000000 0x00 0x12000000 0x00 + 0x12000000 0x00 0x1000000>; + clocks = <&infracfg 44>, + <&infracfg 40>, + <&infracfg 43>, + <&infracfg 97>, + <&infracfg 99>, + <&infracfg 111>; + clock-names = "pl_250m", "tl_26m", "tl_96m", + "tl_32k", "peri_26m", "top_133m"; + assigned-clocks = <&topckgen 50>; + assigned-clock-parents = <&topckgen 91>; + + phys = <&pciephy>; + phy-names = "pcie-phy"; + + resets = <&infracfg_rst 2>, + <&infracfg_rst 3>; + reset-names = "phy", "mac"; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0x7>; + interrupt-map = <0 0 0 1 &pcie_intc 0>, + <0 0 0 2 &pcie_intc 1>, + <0 0 0 3 &pcie_intc 2>, + <0 0 0 4 &pcie_intc 3>; + pcie_intc: interrupt-controller { + #address-cells = <0>; + #interrupt-cells = <1>; + interrupt-controller; + }; + }; + }; From 9cc742078c9a90cdd4cf131e9f760e6965df9048 Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Tue, 20 Apr 2021 14:17:18 +0800 Subject: [PATCH 0512/1379] PCI: Export pci_pio_to_address() for module use This interface will be used by PCI host drivers for PIO translation, export it to support compiling those drivers as kernel modules. Link: https://lore.kernel.org/r/20210420061723.989-3-jianjun.wang@mediatek.com Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 16a17215f633..09f1714e23be 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4052,6 +4052,7 @@ phys_addr_t pci_pio_to_address(unsigned long pio) return address; } +EXPORT_SYMBOL_GPL(pci_pio_to_address); unsigned long __weak pci_address_to_pio(phys_addr_t address) { From 0b0f93cf913b96f7a6f3d6b1ab5dc697d7b9d0f6 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Sat, 17 Apr 2021 09:19:50 +0300 Subject: [PATCH 0513/1379] dt-bindings: dmaengine: qcom: gpi: add compatible for sm8150 No functional changes, just adding a new compatible for a diferent SoC. Signed-off-by: Felipe Balbi Link: https://lore.kernel.org/r/20210417061951.2105530-2-balbi@kernel.org Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/qcom,gpi.yaml | 1 + drivers/dma/qcom/gpi.c | 1 + 2 files changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml index f8142adf9aea..902e2e523386 100644 --- a/Documentation/devicetree/bindings/dma/qcom,gpi.yaml +++ b/Documentation/devicetree/bindings/dma/qcom,gpi.yaml @@ -20,6 +20,7 @@ properties: compatible: enum: - qcom,sdm845-gpi-dma + - qcom,sm8150-gpi-dma reg: maxItems: 1 diff --git a/drivers/dma/qcom/gpi.c b/drivers/dma/qcom/gpi.c index 57f5ee4235c7..43ac3ab23d4c 100644 --- a/drivers/dma/qcom/gpi.c +++ b/drivers/dma/qcom/gpi.c @@ -2281,6 +2281,7 @@ static int gpi_probe(struct platform_device *pdev) static const struct of_device_id gpi_of_match[] = { { .compatible = "qcom,sdm845-gpi-dma" }, + { .compatible = "qcom,sm8150-gpi-dma" }, { }, }; MODULE_DEVICE_TABLE(of, gpi_of_match); From 28ac8e03c43dfc6a703aa420d18222540b801120 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 15 Apr 2021 12:06:54 +0100 Subject: [PATCH 0514/1379] dmaengine: idxd: Fix potential null dereference on pointer status There are calls to idxd_cmd_exec that pass a null status pointer however a recent commit has added an assignment to *status that can end up with a null pointer dereference. The function expects a null status pointer sometimes as there is a later assignment to *status where status is first null checked. Fix the issue by null checking status before making the assignment. Addresses-Coverity: ("Explicit null dereferenced") Fixes: 89e3becd8f82 ("dmaengine: idxd: check device state before issue command") Signed-off-by: Colin Ian King Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20210415110654.1941580-1-colin.king@canonical.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 84a6ea60ecf0..90b875b15c13 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -445,7 +445,8 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, if (idxd_device_is_halted(idxd)) { dev_warn(&idxd->pdev->dev, "Device is HALTED!\n"); - *status = IDXD_CMDSTS_HW_ERR; + if (status) + *status = IDXD_CMDSTS_HW_ERR; return; } From 361e5fc7420ab56e4a940b8a500207a91830cd32 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 7 Apr 2021 21:25:43 +0800 Subject: [PATCH 0515/1379] dmaengine: at_xdmac: Remove unused inline function at_xdmac_csize() commit 765c37d87669 ("dmaengine: at_xdmac: rework slave configuration part") left behind this, so can remove it. Signed-off-by: YueHaibing Reviewed-by: Tudor Ambarus Acked-by: Ludovic Desroches Link: https://lore.kernel.org/r/20210407132543.23652-1-yuehaibing@huawei.com Signed-off-by: Vinod Koul --- drivers/dma/at_xdmac.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/dma/at_xdmac.c b/drivers/dma/at_xdmac.c index fe45ad5d06c4..64a52bf4d737 100644 --- a/drivers/dma/at_xdmac.c +++ b/drivers/dma/at_xdmac.c @@ -344,17 +344,6 @@ static inline int at_xdmac_chan_is_paused(struct at_xdmac_chan *atchan) return test_bit(AT_XDMAC_CHAN_IS_PAUSED, &atchan->status); } -static inline int at_xdmac_csize(u32 maxburst) -{ - int csize; - - csize = ffs(maxburst) - 1; - if (csize > 4) - csize = -EINVAL; - - return csize; -}; - static inline bool at_xdmac_chan_is_peripheral_xfer(u32 cfg) { return cfg & AT_XDMAC_CC_TYPE_PER_TRAN; From 397862855619271296e46d10f7dfa7bafe71eb81 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:10 -0700 Subject: [PATCH 0516/1379] dmaengine: idxd: fix dma device lifetime The devm managed lifetime is incompatible with 'struct device' objects that resides in idxd context. This is one of the series that clean up the idxd driver 'struct device' lifetime. Remove embedding of dma_device and dma_chan in idxd since it's not the only interface that idxd will use. The freeing of the dma_device will be managed by the ->release() function. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161852983001.2203940.14817017492384561719.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 2 - drivers/dma/idxd/dma.c | 77 ++++++++++++++++++++++++++++++++------- drivers/dma/idxd/idxd.h | 18 +++++++-- 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 78d2dc5e9bd8..d255bb016c4d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -186,8 +186,6 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) desc->id = i; desc->wq = wq; desc->cpu = -1; - dma_async_tx_descriptor_init(&desc->txd, &wq->dma_chan); - desc->txd.tx_submit = idxd_dma_tx_submit; } return 0; diff --git a/drivers/dma/idxd/dma.c b/drivers/dma/idxd/dma.c index a15e50126434..77439b645044 100644 --- a/drivers/dma/idxd/dma.c +++ b/drivers/dma/idxd/dma.c @@ -14,7 +14,10 @@ static inline struct idxd_wq *to_idxd_wq(struct dma_chan *c) { - return container_of(c, struct idxd_wq, dma_chan); + struct idxd_dma_chan *idxd_chan; + + idxd_chan = container_of(c, struct idxd_dma_chan, chan); + return idxd_chan->wq; } void idxd_dma_complete_txd(struct idxd_desc *desc, @@ -135,7 +138,7 @@ static void idxd_dma_issue_pending(struct dma_chan *dma_chan) { } -dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) +static dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) { struct dma_chan *c = tx->chan; struct idxd_wq *wq = to_idxd_wq(c); @@ -156,14 +159,25 @@ dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx) static void idxd_dma_release(struct dma_device *device) { + struct idxd_dma_dev *idxd_dma = container_of(device, struct idxd_dma_dev, dma); + + kfree(idxd_dma); } int idxd_register_dma_device(struct idxd_device *idxd) { - struct dma_device *dma = &idxd->dma_dev; + struct idxd_dma_dev *idxd_dma; + struct dma_device *dma; + struct device *dev = &idxd->pdev->dev; + int rc; + idxd_dma = kzalloc_node(sizeof(*idxd_dma), GFP_KERNEL, dev_to_node(dev)); + if (!idxd_dma) + return -ENOMEM; + + dma = &idxd_dma->dma; INIT_LIST_HEAD(&dma->channels); - dma->dev = &idxd->pdev->dev; + dma->dev = dev; dma_cap_set(DMA_PRIVATE, dma->cap_mask); dma_cap_set(DMA_COMPLETION_NO_ORDER, dma->cap_mask); @@ -179,35 +193,72 @@ int idxd_register_dma_device(struct idxd_device *idxd) dma->device_alloc_chan_resources = idxd_dma_alloc_chan_resources; dma->device_free_chan_resources = idxd_dma_free_chan_resources; - return dma_async_device_register(&idxd->dma_dev); + rc = dma_async_device_register(dma); + if (rc < 0) { + kfree(idxd_dma); + return rc; + } + + idxd_dma->idxd = idxd; + /* + * This pointer is protected by the refs taken by the dma_chan. It will remain valid + * as long as there are outstanding channels. + */ + idxd->idxd_dma = idxd_dma; + return 0; } void idxd_unregister_dma_device(struct idxd_device *idxd) { - dma_async_device_unregister(&idxd->dma_dev); + dma_async_device_unregister(&idxd->idxd_dma->dma); } int idxd_register_dma_channel(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; - struct dma_device *dma = &idxd->dma_dev; - struct dma_chan *chan = &wq->dma_chan; - int rc; + struct dma_device *dma = &idxd->idxd_dma->dma; + struct device *dev = &idxd->pdev->dev; + struct idxd_dma_chan *idxd_chan; + struct dma_chan *chan; + int rc, i; - memset(&wq->dma_chan, 0, sizeof(struct dma_chan)); + idxd_chan = kzalloc_node(sizeof(*idxd_chan), GFP_KERNEL, dev_to_node(dev)); + if (!idxd_chan) + return -ENOMEM; + + chan = &idxd_chan->chan; chan->device = dma; list_add_tail(&chan->device_node, &dma->channels); + + for (i = 0; i < wq->num_descs; i++) { + struct idxd_desc *desc = wq->descs[i]; + + dma_async_tx_descriptor_init(&desc->txd, chan); + desc->txd.tx_submit = idxd_dma_tx_submit; + } + rc = dma_async_device_channel_register(dma, chan); - if (rc < 0) + if (rc < 0) { + kfree(idxd_chan); return rc; + } + + wq->idxd_chan = idxd_chan; + idxd_chan->wq = wq; + get_device(&wq->conf_dev); return 0; } void idxd_unregister_dma_channel(struct idxd_wq *wq) { - struct dma_chan *chan = &wq->dma_chan; + struct idxd_dma_chan *idxd_chan = wq->idxd_chan; + struct dma_chan *chan = &idxd_chan->chan; + struct idxd_dma_dev *idxd_dma = wq->idxd->idxd_dma; - dma_async_device_channel_unregister(&wq->idxd->dma_dev, chan); + dma_async_device_channel_unregister(&idxd_dma->dma, chan); list_del(&chan->device_node); + kfree(wq->idxd_chan); + wq->idxd_chan = NULL; + put_device(&wq->conf_dev); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 76014c14f473..80e534680c9a 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -14,6 +14,9 @@ extern struct kmem_cache *idxd_desc_pool; +struct idxd_device; +struct idxd_wq; + #define IDXD_REG_TIMEOUT 50 #define IDXD_DRAIN_TIMEOUT 5000 @@ -96,6 +99,11 @@ enum idxd_complete_type { IDXD_COMPLETE_DEV_FAIL, }; +struct idxd_dma_chan { + struct dma_chan chan; + struct idxd_wq *wq; +}; + struct idxd_wq { void __iomem *portal; struct device conf_dev; @@ -125,7 +133,7 @@ struct idxd_wq { int compls_size; struct idxd_desc **descs; struct sbitmap_queue sbq; - struct dma_chan dma_chan; + struct idxd_dma_chan *idxd_chan; char name[WQ_NAME_SIZE + 1]; u64 max_xfer_bytes; u32 max_batch_size; @@ -162,6 +170,11 @@ enum idxd_device_flag { IDXD_FLAG_PASID_ENABLED, }; +struct idxd_dma_dev { + struct idxd_device *idxd; + struct dma_device dma; +}; + struct idxd_device { enum idxd_type type; struct device conf_dev; @@ -210,7 +223,7 @@ struct idxd_device { int num_wq_irqs; struct idxd_irq_entry *irq_entries; - struct dma_device dma_dev; + struct idxd_dma_dev *idxd_dma; struct workqueue_struct *wq; struct work_struct work; }; @@ -363,7 +376,6 @@ void idxd_unregister_dma_channel(struct idxd_wq *wq); void idxd_parse_completion_status(u8 status, enum dmaengine_tx_result *res); void idxd_dma_complete_txd(struct idxd_desc *desc, enum idxd_complete_type comp_type); -dma_cookie_t idxd_dma_tx_submit(struct dma_async_tx_descriptor *tx); /* cdev */ int idxd_cdev_register(void); From 5fc8e85ff12ce0530ac658686902a0ee64600f56 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:15 -0700 Subject: [PATCH 0517/1379] dmaengine: idxd: cleanup pci interrupt vector allocation management The devm managed lifetime is incompatible with 'struct device' objects that resides in idxd context. This is one of the series that clean up the idxd driver 'struct device' lifetime. Remove devm managed pci interrupt vectors and replace with unmanged allocators. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161852983563.2203940.8116028229124776669.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 4 +-- drivers/dma/idxd/idxd.h | 2 +- drivers/dma/idxd/init.c | 64 +++++++++++++++++---------------------- 3 files changed, 30 insertions(+), 40 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index d255bb016c4d..3f696abd74ac 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -19,7 +19,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, /* Interrupt control bits */ void idxd_mask_msix_vector(struct idxd_device *idxd, int vec_id) { - struct irq_data *data = irq_get_irq_data(idxd->msix_entries[vec_id].vector); + struct irq_data *data = irq_get_irq_data(idxd->irq_entries[vec_id].vector); pci_msi_mask_irq(data); } @@ -36,7 +36,7 @@ void idxd_mask_msix_vectors(struct idxd_device *idxd) void idxd_unmask_msix_vector(struct idxd_device *idxd, int vec_id) { - struct irq_data *data = irq_get_irq_data(idxd->msix_entries[vec_id].vector); + struct irq_data *data = irq_get_irq_data(idxd->irq_entries[vec_id].vector); pci_msi_unmask_irq(data); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 80e534680c9a..401b035e42b1 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -36,6 +36,7 @@ struct idxd_device_driver { struct idxd_irq_entry { struct idxd_device *idxd; int id; + int vector; struct llist_head pending_llist; struct list_head work_list; /* @@ -219,7 +220,6 @@ struct idxd_device { union sw_err_reg sw_err; wait_queue_head_t cmd_waitq; - struct msix_entry *msix_entries; int num_wq_irqs; struct idxd_irq_entry *irq_entries; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 6584b0ec07d5..5cf1bf095ae1 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -61,7 +61,6 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; struct device *dev = &pdev->dev; - struct msix_entry *msix; struct idxd_irq_entry *irq_entry; int i, msixcnt; int rc = 0; @@ -69,23 +68,13 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) msixcnt = pci_msix_vec_count(pdev); if (msixcnt < 0) { dev_err(dev, "Not MSI-X interrupt capable.\n"); - goto err_no_irq; + return -ENOSPC; } - idxd->msix_entries = devm_kzalloc(dev, sizeof(struct msix_entry) * - msixcnt, GFP_KERNEL); - if (!idxd->msix_entries) { - rc = -ENOMEM; - goto err_no_irq; - } - - for (i = 0; i < msixcnt; i++) - idxd->msix_entries[i].entry = i; - - rc = pci_enable_msix_exact(pdev, idxd->msix_entries, msixcnt); - if (rc) { - dev_err(dev, "Failed enabling %d MSIX entries.\n", msixcnt); - goto err_no_irq; + rc = pci_alloc_irq_vectors(pdev, msixcnt, msixcnt, PCI_IRQ_MSIX); + if (rc != msixcnt) { + dev_err(dev, "Failed enabling %d MSIX entries: %d\n", msixcnt, rc); + return -ENOSPC; } dev_dbg(dev, "Enabled %d msix vectors\n", msixcnt); @@ -98,58 +87,57 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) GFP_KERNEL); if (!idxd->irq_entries) { rc = -ENOMEM; - goto err_no_irq; + goto err_irq_entries; } for (i = 0; i < msixcnt; i++) { idxd->irq_entries[i].id = i; idxd->irq_entries[i].idxd = idxd; + idxd->irq_entries[i].vector = pci_irq_vector(pdev, i); spin_lock_init(&idxd->irq_entries[i].list_lock); } - msix = &idxd->msix_entries[0]; irq_entry = &idxd->irq_entries[0]; - rc = devm_request_threaded_irq(dev, msix->vector, idxd_irq_handler, - idxd_misc_thread, 0, "idxd-misc", - irq_entry); + rc = request_threaded_irq(irq_entry->vector, idxd_irq_handler, idxd_misc_thread, + 0, "idxd-misc", irq_entry); if (rc < 0) { dev_err(dev, "Failed to allocate misc interrupt.\n"); - goto err_no_irq; + goto err_misc_irq; } - dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n", - msix->vector); + dev_dbg(dev, "Allocated idxd-misc handler on msix vector %d\n", irq_entry->vector); /* first MSI-X entry is not for wq interrupts */ idxd->num_wq_irqs = msixcnt - 1; for (i = 1; i < msixcnt; i++) { - msix = &idxd->msix_entries[i]; irq_entry = &idxd->irq_entries[i]; init_llist_head(&idxd->irq_entries[i].pending_llist); INIT_LIST_HEAD(&idxd->irq_entries[i].work_list); - rc = devm_request_threaded_irq(dev, msix->vector, - idxd_irq_handler, - idxd_wq_thread, 0, - "idxd-portal", irq_entry); + rc = request_threaded_irq(irq_entry->vector, idxd_irq_handler, + idxd_wq_thread, 0, "idxd-portal", irq_entry); if (rc < 0) { - dev_err(dev, "Failed to allocate irq %d.\n", - msix->vector); - goto err_no_irq; + dev_err(dev, "Failed to allocate irq %d.\n", irq_entry->vector); + goto err_wq_irqs; } - dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", - i, msix->vector); + dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", i, irq_entry->vector); } idxd_unmask_error_interrupts(idxd); idxd_msix_perm_setup(idxd); return 0; - err_no_irq: + err_wq_irqs: + while (--i >= 0) { + irq_entry = &idxd->irq_entries[i]; + free_irq(irq_entry->vector, irq_entry); + } + err_misc_irq: /* Disable error interrupt generation */ idxd_mask_error_interrupts(idxd); - pci_disable_msix(pdev); + err_irq_entries: + pci_free_irq_vectors(pdev); dev_err(dev, "No usable interrupts\n"); return rc; } @@ -495,7 +483,8 @@ static void idxd_shutdown(struct pci_dev *pdev) for (i = 0; i < msixcnt; i++) { irq_entry = &idxd->irq_entries[i]; - synchronize_irq(idxd->msix_entries[i].vector); + synchronize_irq(irq_entry->vector); + free_irq(irq_entry->vector, irq_entry); if (i == 0) continue; idxd_flush_pending_llist(irq_entry); @@ -503,6 +492,7 @@ static void idxd_shutdown(struct pci_dev *pdev) } idxd_msix_perm_clear(idxd); + pci_free_irq_vectors(pdev); destroy_workqueue(idxd->wq); } From a39c7cd0438ee2f0b859ee1eb86cdc52217d2223 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:21 -0700 Subject: [PATCH 0518/1379] dmaengine: idxd: removal of pcim managed mmio mapping The devm managed lifetime is incompatible with 'struct device' objects that resides in idxd context. This is one of the series that clean up the idxd driver 'struct device' lifetime. Remove pcim_* management of the PCI device and the ioremap of MMIO BAR and replace with unmanaged versions. This is for consistency of removing all the pcim/devm based calls. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161852984150.2203940.8043988289748519056.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 5cf1bf095ae1..11a2e14b5b80 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -384,32 +384,36 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) struct idxd_device *idxd; int rc; - rc = pcim_enable_device(pdev); + rc = pci_enable_device(pdev); if (rc) return rc; dev_dbg(dev, "Alloc IDXD context\n"); idxd = idxd_alloc(pdev); - if (!idxd) - return -ENOMEM; + if (!idxd) { + rc = -ENOMEM; + goto err_idxd_alloc; + } dev_dbg(dev, "Mapping BARs\n"); - idxd->reg_base = pcim_iomap(pdev, IDXD_MMIO_BAR, 0); - if (!idxd->reg_base) - return -ENOMEM; + idxd->reg_base = pci_iomap(pdev, IDXD_MMIO_BAR, 0); + if (!idxd->reg_base) { + rc = -ENOMEM; + goto err_iomap; + } dev_dbg(dev, "Set DMA masks\n"); rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); if (rc) rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) - return rc; + goto err; rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); if (rc) rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (rc) - return rc; + goto err; idxd_set_type(idxd); @@ -423,13 +427,13 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) rc = idxd_probe(idxd); if (rc) { dev_err(dev, "Intel(R) IDXD DMA Engine init failed\n"); - return -ENODEV; + goto err; } rc = idxd_setup_sysfs(idxd); if (rc) { dev_err(dev, "IDXD sysfs setup failed\n"); - return -ENODEV; + goto err; } idxd->state = IDXD_DEV_CONF_READY; @@ -438,6 +442,13 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) idxd->hw.version); return 0; + + err: + pci_iounmap(pdev, idxd->reg_base); + err_iomap: + err_idxd_alloc: + pci_disable_device(pdev); + return rc; } static void idxd_flush_pending_llist(struct idxd_irq_entry *ie) @@ -493,6 +504,8 @@ static void idxd_shutdown(struct pci_dev *pdev) idxd_msix_perm_clear(idxd); pci_free_irq_vectors(pdev); + pci_iounmap(pdev, idxd->reg_base); + pci_disable_device(pdev); destroy_workqueue(idxd->wq); } From f7f7739847bd68b3c3103fd1b50d943038bd14c7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:27 -0700 Subject: [PATCH 0519/1379] dmaengine: idxd: use ida for device instance enumeration The idr is only used for an device id, never to lookup context from that id. Switch to plain ida. Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Reported-by: Jason Gunthorpe Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852984730.2203940.15032482460902003819.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 11a2e14b5b80..a4f0489515b4 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -34,8 +34,7 @@ MODULE_PARM_DESC(sva, "Toggle SVA support on/off"); bool support_enqcmd; -static struct idr idxd_idrs[IDXD_TYPE_MAX]; -static DEFINE_MUTEX(idxd_idr_lock); +static struct ida idxd_idas[IDXD_TYPE_MAX]; static struct pci_device_id idxd_pci_tbl[] = { /* DSA ver 1.0 platforms */ @@ -348,12 +347,10 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD interrupt setup complete.\n"); - mutex_lock(&idxd_idr_lock); - idxd->id = idr_alloc(&idxd_idrs[idxd->type], idxd, 0, 0, GFP_KERNEL); - mutex_unlock(&idxd_idr_lock); + idxd->id = ida_alloc(&idxd_idas[idxd->type], GFP_KERNEL); if (idxd->id < 0) { rc = -ENOMEM; - goto err_idr_fail; + goto err_ida_fail; } idxd->major = idxd_cdev_get_major(idxd); @@ -361,7 +358,7 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; - err_idr_fail: + err_ida_fail: idxd_mask_error_interrupts(idxd); idxd_mask_msix_vectors(idxd); err_setup: @@ -518,9 +515,7 @@ static void idxd_remove(struct pci_dev *pdev) idxd_shutdown(pdev); if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); - mutex_lock(&idxd_idr_lock); - idr_remove(&idxd_idrs[idxd->type], idxd->id); - mutex_unlock(&idxd_idr_lock); + ida_free(&idxd_idas[idxd->type], idxd->id); } static struct pci_driver idxd_pci_driver = { @@ -550,7 +545,7 @@ static int __init idxd_init_module(void) support_enqcmd = true; for (i = 0; i < IDXD_TYPE_MAX; i++) - idr_init(&idxd_idrs[i]); + ida_init(&idxd_idas[i]); err = idxd_register_bus_type(); if (err < 0) From 47c16ac27d4cb664cee53ee0b9b7e2f907923fb3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:33 -0700 Subject: [PATCH 0520/1379] dmaengine: idxd: fix idxd conf_dev 'struct device' lifetime The devm managed lifetime is incompatible with 'struct device' objects that resides in idxd context. This is one of the series that clean up the idxd driver 'struct device' lifetime. Fix idxd->conf_dev 'struct device' lifetime. Address issues flagged by CONFIG_DEBUG_KOBJECT_RELEASE. Add release functions in order to free the allocated memory at the appropriate time. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852985319.2203940.4650791514462735368.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/idxd.h | 36 ++++++++++------ drivers/dma/idxd/init.c | 56 ++++++++++++++++-------- drivers/dma/idxd/sysfs.c | 92 ++++++++++++++-------------------------- 3 files changed, 94 insertions(+), 90 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 401b035e42b1..bb3a580732af 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "registers.h" #define IDXD_DRIVER_VERSION "1.00" @@ -255,6 +256,23 @@ extern struct bus_type dsa_bus_type; extern struct bus_type iax_bus_type; extern bool support_enqcmd; +extern struct device_type dsa_device_type; +extern struct device_type iax_device_type; + +static inline bool is_dsa_dev(struct device *dev) +{ + return dev->type == &dsa_device_type; +} + +static inline bool is_iax_dev(struct device *dev) +{ + return dev->type == &iax_device_type; +} + +static inline bool is_idxd_dev(struct device *dev) +{ + return is_dsa_dev(dev) || is_iax_dev(dev); +} static inline bool wq_dedicated(struct idxd_wq *wq) { @@ -292,18 +310,6 @@ static inline int idxd_get_wq_portal_full_offset(int wq_id, return ((wq_id * 4) << PAGE_SHIFT) + idxd_get_wq_portal_offset(prot); } -static inline void idxd_set_type(struct idxd_device *idxd) -{ - struct pci_dev *pdev = idxd->pdev; - - if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0) - idxd->type = IDXD_TYPE_DSA; - else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0) - idxd->type = IDXD_TYPE_IAX; - else - idxd->type = IDXD_TYPE_UNKNOWN; -} - static inline void idxd_wq_get(struct idxd_wq *wq) { wq->client_count++; @@ -319,14 +325,16 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) return wq->client_count; }; +struct ida *idxd_ida(struct idxd_device *idxd); const char *idxd_get_dev_name(struct idxd_device *idxd); int idxd_register_bus_type(void); void idxd_unregister_bus_type(void); -int idxd_setup_sysfs(struct idxd_device *idxd); -void idxd_cleanup_sysfs(struct idxd_device *idxd); +int idxd_register_devices(struct idxd_device *idxd); +void idxd_unregister_devices(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); struct bus_type *idxd_get_bus_type(struct idxd_device *idxd); +struct device_type *idxd_get_device_type(struct idxd_device *idxd); /* device interrupt control */ void idxd_msix_perm_setup(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index a4f0489515b4..17d3b36610a9 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -51,6 +51,11 @@ static char *idxd_name[] = { "iax" }; +struct ida *idxd_ida(struct idxd_device *idxd) +{ + return &idxd_idas[idxd->type]; +} + const char *idxd_get_dev_name(struct idxd_device *idxd) { return idxd_name[idxd->type]; @@ -81,9 +86,8 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) * We implement 1 completion list per MSI-X entry except for * entry 0, which is for errors and others. */ - idxd->irq_entries = devm_kcalloc(dev, msixcnt, - sizeof(struct idxd_irq_entry), - GFP_KERNEL); + idxd->irq_entries = kcalloc_node(msixcnt, sizeof(struct idxd_irq_entry), + GFP_KERNEL, dev_to_node(dev)); if (!idxd->irq_entries) { rc = -ENOMEM; goto err_irq_entries; @@ -262,16 +266,44 @@ static void idxd_read_caps(struct idxd_device *idxd) } } +static inline void idxd_set_type(struct idxd_device *idxd) +{ + struct pci_dev *pdev = idxd->pdev; + + if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0) + idxd->type = IDXD_TYPE_DSA; + else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0) + idxd->type = IDXD_TYPE_IAX; + else + idxd->type = IDXD_TYPE_UNKNOWN; +} + static struct idxd_device *idxd_alloc(struct pci_dev *pdev) { struct device *dev = &pdev->dev; struct idxd_device *idxd; + int rc; - idxd = devm_kzalloc(dev, sizeof(struct idxd_device), GFP_KERNEL); + idxd = kzalloc_node(sizeof(*idxd), GFP_KERNEL, dev_to_node(dev)); if (!idxd) return NULL; idxd->pdev = pdev; + idxd_set_type(idxd); + idxd->id = ida_alloc(idxd_ida(idxd), GFP_KERNEL); + if (idxd->id < 0) + return NULL; + + device_initialize(&idxd->conf_dev); + idxd->conf_dev.parent = dev; + idxd->conf_dev.bus = idxd_get_bus_type(idxd); + idxd->conf_dev.type = idxd_get_device_type(idxd); + rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd_get_dev_name(idxd), idxd->id); + if (rc < 0) { + put_device(&idxd->conf_dev); + return NULL; + } + spin_lock_init(&idxd->dev_lock); return idxd; @@ -347,20 +379,11 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD interrupt setup complete.\n"); - idxd->id = ida_alloc(&idxd_idas[idxd->type], GFP_KERNEL); - if (idxd->id < 0) { - rc = -ENOMEM; - goto err_ida_fail; - } - idxd->major = idxd_cdev_get_major(idxd); dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; - err_ida_fail: - idxd_mask_error_interrupts(idxd); - idxd_mask_msix_vectors(idxd); err_setup: if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); @@ -412,7 +435,6 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) goto err; - idxd_set_type(idxd); idxd_type_init(idxd); @@ -427,7 +449,7 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err; } - rc = idxd_setup_sysfs(idxd); + rc = idxd_register_devices(idxd); if (rc) { dev_err(dev, "IDXD sysfs setup failed\n"); goto err; @@ -443,6 +465,7 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) err: pci_iounmap(pdev, idxd->reg_base); err_iomap: + put_device(&idxd->conf_dev); err_idxd_alloc: pci_disable_device(pdev); return rc; @@ -511,11 +534,10 @@ static void idxd_remove(struct pci_dev *pdev) struct idxd_device *idxd = pci_get_drvdata(pdev); dev_dbg(&pdev->dev, "%s called\n", __func__); - idxd_cleanup_sysfs(idxd); idxd_shutdown(pdev); if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); - ida_free(&idxd_idas[idxd->type], idxd->id); + idxd_unregister_devices(idxd); } static struct pci_driver idxd_pci_driver = { diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 18bf4d148989..36193e555e36 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -16,51 +16,26 @@ static char *idxd_wq_type_names[] = { [IDXD_WQT_USER] = "user", }; -static void idxd_conf_device_release(struct device *dev) +static void idxd_conf_sub_device_release(struct device *dev) { dev_dbg(dev, "%s for %s\n", __func__, dev_name(dev)); } static struct device_type idxd_group_device_type = { .name = "group", - .release = idxd_conf_device_release, + .release = idxd_conf_sub_device_release, }; static struct device_type idxd_wq_device_type = { .name = "wq", - .release = idxd_conf_device_release, + .release = idxd_conf_sub_device_release, }; static struct device_type idxd_engine_device_type = { .name = "engine", - .release = idxd_conf_device_release, + .release = idxd_conf_sub_device_release, }; -static struct device_type dsa_device_type = { - .name = "dsa", - .release = idxd_conf_device_release, -}; - -static struct device_type iax_device_type = { - .name = "iax", - .release = idxd_conf_device_release, -}; - -static inline bool is_dsa_dev(struct device *dev) -{ - return dev ? dev->type == &dsa_device_type : false; -} - -static inline bool is_iax_dev(struct device *dev) -{ - return dev ? dev->type == &iax_device_type : false; -} - -static inline bool is_idxd_dev(struct device *dev) -{ - return is_dsa_dev(dev) || is_iax_dev(dev); -} - static inline bool is_idxd_wq_dev(struct device *dev) { return dev ? dev->type == &idxd_wq_device_type : false; @@ -405,7 +380,7 @@ struct bus_type *idxd_get_bus_type(struct idxd_device *idxd) return idxd_bus_types[idxd->type]; } -static struct device_type *idxd_get_device_type(struct idxd_device *idxd) +struct device_type *idxd_get_device_type(struct idxd_device *idxd) { if (idxd->type == IDXD_TYPE_DSA) return &dsa_device_type; @@ -1644,6 +1619,30 @@ static const struct attribute_group *idxd_attribute_groups[] = { NULL, }; +static void idxd_conf_device_release(struct device *dev) +{ + struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); + + kfree(idxd->groups); + kfree(idxd->wqs); + kfree(idxd->engines); + kfree(idxd->irq_entries); + ida_free(idxd_ida(idxd), idxd->id); + kfree(idxd); +} + +struct device_type dsa_device_type = { + .name = "dsa", + .release = idxd_conf_device_release, + .groups = idxd_attribute_groups, +}; + +struct device_type iax_device_type = { + .name = "iax", + .release = idxd_conf_device_release, + .groups = idxd_attribute_groups, +}; + static int idxd_setup_engine_sysfs(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -1745,39 +1744,14 @@ cleanup: return rc; } -static int idxd_setup_device_sysfs(struct idxd_device *idxd) -{ - struct device *dev = &idxd->pdev->dev; - int rc; - char devname[IDXD_NAME_SIZE]; - - sprintf(devname, "%s%d", idxd_get_dev_name(idxd), idxd->id); - idxd->conf_dev.parent = dev; - dev_set_name(&idxd->conf_dev, "%s", devname); - idxd->conf_dev.bus = idxd_get_bus_type(idxd); - idxd->conf_dev.groups = idxd_attribute_groups; - idxd->conf_dev.type = idxd_get_device_type(idxd); - - dev_dbg(dev, "IDXD device register: %s\n", dev_name(&idxd->conf_dev)); - rc = device_register(&idxd->conf_dev); - if (rc < 0) { - put_device(&idxd->conf_dev); - return rc; - } - - return 0; -} - -int idxd_setup_sysfs(struct idxd_device *idxd) +int idxd_register_devices(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; int rc; - rc = idxd_setup_device_sysfs(idxd); - if (rc < 0) { - dev_dbg(dev, "Device sysfs registering failed: %d\n", rc); + rc = device_add(&idxd->conf_dev); + if (rc < 0) return rc; - } rc = idxd_setup_wq_sysfs(idxd); if (rc < 0) { @@ -1803,7 +1777,7 @@ int idxd_setup_sysfs(struct idxd_device *idxd) return 0; } -void idxd_cleanup_sysfs(struct idxd_device *idxd) +void idxd_unregister_devices(struct idxd_device *idxd) { int i; From 7c5dd23e57c14cf7177b8a5e0fd08916e0c60005 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:39 -0700 Subject: [PATCH 0521/1379] dmaengine: idxd: fix wq conf_dev 'struct device' lifetime Remove devm_* allocation and fix wq->conf_dev 'struct device' lifetime. Address issues flagged by CONFIG_DEBUG_KOBJECT_RELEASE. Add release functions in order to free the allocated memory for the wq context at device destruction time. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852985907.2203940.6840120734115043753.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 6 +-- drivers/dma/idxd/idxd.h | 20 +++++++- drivers/dma/idxd/init.c | 105 ++++++++++++++++++++++++++++---------- drivers/dma/idxd/irq.c | 6 +-- drivers/dma/idxd/sysfs.c | 98 ++++++++++++++++------------------- 5 files changed, 145 insertions(+), 90 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3f696abd74ac..c4183294a704 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -520,7 +520,7 @@ void idxd_device_wqs_clear_state(struct idxd_device *idxd) lockdep_assert_held(&idxd->dev_lock); for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (wq->state == IDXD_WQ_ENABLED) { idxd_wq_disable_cleanup(wq); @@ -738,7 +738,7 @@ static int idxd_wqs_config_write(struct idxd_device *idxd) int i, rc; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; rc = idxd_wq_config_write(wq); if (rc < 0) @@ -816,7 +816,7 @@ static int idxd_wqs_setup(struct idxd_device *idxd) } for (i = 0; i < idxd->max_wqs; i++) { - wq = &idxd->wqs[i]; + wq = idxd->wqs[i]; group = wq->group; if (!wq->group) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index bb3a580732af..6cade6a05314 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -194,7 +194,7 @@ struct idxd_device { spinlock_t dev_lock; /* spinlock for device */ struct completion *cmd_done; struct idxd_group *groups; - struct idxd_wq *wqs; + struct idxd_wq **wqs; struct idxd_engine *engines; struct iommu_sva *sva; @@ -258,6 +258,7 @@ extern struct bus_type iax_bus_type; extern bool support_enqcmd; extern struct device_type dsa_device_type; extern struct device_type iax_device_type; +extern struct device_type idxd_wq_device_type; static inline bool is_dsa_dev(struct device *dev) { @@ -274,6 +275,23 @@ static inline bool is_idxd_dev(struct device *dev) return is_dsa_dev(dev) || is_iax_dev(dev); } +static inline bool is_idxd_wq_dev(struct device *dev) +{ + return dev->type == &idxd_wq_device_type; +} + +static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) +{ + if (wq->type == IDXD_WQT_KERNEL && strcmp(wq->name, "dmaengine") == 0) + return true; + return false; +} + +static inline bool is_idxd_wq_cdev(struct idxd_wq *wq) +{ + return wq->type == IDXD_WQT_USER; +} + static inline bool wq_dedicated(struct idxd_wq *wq) { return test_bit(WQ_FLAG_DEDICATED, &wq->flags); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 17d3b36610a9..a2dca27aebc3 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -145,16 +145,74 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) return rc; } +static int idxd_setup_wqs(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + struct idxd_wq *wq; + int i, rc; + + idxd->wqs = kcalloc_node(idxd->max_wqs, sizeof(struct idxd_wq *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->wqs) + return -ENOMEM; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = kzalloc_node(sizeof(*wq), GFP_KERNEL, dev_to_node(dev)); + if (!wq) { + rc = -ENOMEM; + goto err; + } + + wq->id = i; + wq->idxd = idxd; + device_initialize(&wq->conf_dev); + wq->conf_dev.parent = &idxd->conf_dev; + wq->conf_dev.bus = idxd_get_bus_type(idxd); + wq->conf_dev.type = &idxd_wq_device_type; + rc = dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id); + if (rc < 0) { + put_device(&wq->conf_dev); + goto err; + } + + mutex_init(&wq->wq_lock); + wq->idxd_cdev.minor = -1; + wq->max_xfer_bytes = idxd->max_xfer_bytes; + wq->max_batch_size = idxd->max_batch_size; + wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); + if (!wq->wqcfg) { + put_device(&wq->conf_dev); + rc = -ENOMEM; + goto err; + } + idxd->wqs[i] = wq; + } + + return 0; + + err: + while (--i >= 0) + put_device(&idxd->wqs[i]->conf_dev); + return rc; +} + static int idxd_setup_internals(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; - int i; + int i, rc; init_waitqueue_head(&idxd->cmd_waitq); + + rc = idxd_setup_wqs(idxd); + if (rc < 0) + return rc; + idxd->groups = devm_kcalloc(dev, idxd->max_groups, sizeof(struct idxd_group), GFP_KERNEL); - if (!idxd->groups) - return -ENOMEM; + if (!idxd->groups) { + rc = -ENOMEM; + goto err; + } for (i = 0; i < idxd->max_groups; i++) { idxd->groups[i].idxd = idxd; @@ -163,40 +221,31 @@ static int idxd_setup_internals(struct idxd_device *idxd) idxd->groups[i].tc_b = -1; } - idxd->wqs = devm_kcalloc(dev, idxd->max_wqs, sizeof(struct idxd_wq), - GFP_KERNEL); - if (!idxd->wqs) - return -ENOMEM; - idxd->engines = devm_kcalloc(dev, idxd->max_engines, sizeof(struct idxd_engine), GFP_KERNEL); - if (!idxd->engines) - return -ENOMEM; - - for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; - - wq->id = i; - wq->idxd = idxd; - mutex_init(&wq->wq_lock); - wq->idxd_cdev.minor = -1; - wq->max_xfer_bytes = idxd->max_xfer_bytes; - wq->max_batch_size = idxd->max_batch_size; - wq->wqcfg = devm_kzalloc(dev, idxd->wqcfg_size, GFP_KERNEL); - if (!wq->wqcfg) - return -ENOMEM; + if (!idxd->engines) { + rc = -ENOMEM; + goto err; } + for (i = 0; i < idxd->max_engines; i++) { idxd->engines[i].idxd = idxd; idxd->engines[i].id = i; } idxd->wq = create_workqueue(dev_name(dev)); - if (!idxd->wq) - return -ENOMEM; + if (!idxd->wq) { + rc = -ENOMEM; + goto err; + } return 0; + + err: + for (i = 0; i < idxd->max_wqs; i++) + put_device(&idxd->wqs[i]->conf_dev); + return rc; } static void idxd_read_table_offsets(struct idxd_device *idxd) @@ -371,11 +420,11 @@ static int idxd_probe(struct idxd_device *idxd) rc = idxd_setup_internals(idxd); if (rc) - goto err_setup; + goto err; rc = idxd_setup_interrupts(idxd); if (rc) - goto err_setup; + goto err; dev_dbg(dev, "IDXD interrupt setup complete.\n"); @@ -384,7 +433,7 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; - err_setup: + err: if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); return rc; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index f1463fc58112..7b0181532f77 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -45,7 +45,7 @@ static void idxd_device_reinit(struct work_struct *work) goto out; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (wq->state == IDXD_WQ_ENABLED) { rc = idxd_wq_enable(wq); @@ -130,7 +130,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) if (idxd->sw_err.valid && idxd->sw_err.wq_idx_valid) { int id = idxd->sw_err.wq_idx; - struct idxd_wq *wq = &idxd->wqs[id]; + struct idxd_wq *wq = idxd->wqs[id]; if (wq->type == IDXD_WQT_USER) wake_up_interruptible(&wq->idxd_cdev.err_queue); @@ -138,7 +138,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) int i; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (wq->type == IDXD_WQT_USER) wake_up_interruptible(&wq->idxd_cdev.err_queue); diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 36193e555e36..409b3ce52f07 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -26,34 +26,11 @@ static struct device_type idxd_group_device_type = { .release = idxd_conf_sub_device_release, }; -static struct device_type idxd_wq_device_type = { - .name = "wq", - .release = idxd_conf_sub_device_release, -}; - static struct device_type idxd_engine_device_type = { .name = "engine", .release = idxd_conf_sub_device_release, }; -static inline bool is_idxd_wq_dev(struct device *dev) -{ - return dev ? dev->type == &idxd_wq_device_type : false; -} - -static inline bool is_idxd_wq_dmaengine(struct idxd_wq *wq) -{ - if (wq->type == IDXD_WQT_KERNEL && - strcmp(wq->name, "dmaengine") == 0) - return true; - return false; -} - -static inline bool is_idxd_wq_cdev(struct idxd_wq *wq) -{ - return wq->type == IDXD_WQT_USER; -} - static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { @@ -297,7 +274,7 @@ static int idxd_config_bus_remove(struct device *dev) dev_dbg(dev, "%s removing dev %s\n", __func__, dev_name(&idxd->conf_dev)); for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (wq->state == IDXD_WQ_DISABLED) continue; @@ -309,7 +286,7 @@ static int idxd_config_bus_remove(struct device *dev) idxd_unregister_dma_device(idxd); rc = idxd_device_disable(idxd); for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; mutex_lock(&wq->wq_lock); idxd_wq_disable_cleanup(wq); @@ -678,7 +655,7 @@ static ssize_t group_work_queues_show(struct device *dev, struct idxd_device *idxd = group->idxd; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; if (!wq->group) continue; @@ -935,7 +912,7 @@ static int total_claimed_wq_size(struct idxd_device *idxd) int wq_size = 0; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; wq_size += wq->size; } @@ -1331,6 +1308,20 @@ static const struct attribute_group *idxd_wq_attribute_groups[] = { NULL, }; +static void idxd_conf_wq_release(struct device *dev) +{ + struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + + kfree(wq->wqcfg); + kfree(wq); +} + +struct device_type idxd_wq_device_type = { + .name = "wq", + .release = idxd_conf_wq_release, + .groups = idxd_wq_attribute_groups, +}; + /* IDXD device attribs */ static ssize_t version_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1461,7 +1452,7 @@ static ssize_t clients_show(struct device *dev, spin_lock_irqsave(&idxd->dev_lock, flags); for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; count += wq->client_count; } @@ -1711,70 +1702,67 @@ cleanup: return rc; } -static int idxd_setup_wq_sysfs(struct idxd_device *idxd) +static int idxd_register_wq_devices(struct idxd_device *idxd) { - struct device *dev = &idxd->pdev->dev; - int i, rc; + int i, rc, j; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; - wq->conf_dev.parent = &idxd->conf_dev; - dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id); - wq->conf_dev.bus = idxd_get_bus_type(idxd); - wq->conf_dev.groups = idxd_wq_attribute_groups; - wq->conf_dev.type = &idxd_wq_device_type; - dev_dbg(dev, "WQ device register: %s\n", - dev_name(&wq->conf_dev)); - rc = device_register(&wq->conf_dev); - if (rc < 0) { - put_device(&wq->conf_dev); + rc = device_add(&wq->conf_dev); + if (rc < 0) goto cleanup; - } } return 0; cleanup: - while (i--) { - struct idxd_wq *wq = &idxd->wqs[i]; + j = i - 1; + for (; i < idxd->max_wqs; i++) + put_device(&idxd->wqs[i]->conf_dev); - device_unregister(&wq->conf_dev); - } + while (j--) + device_unregister(&idxd->wqs[j]->conf_dev); return rc; } int idxd_register_devices(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; - int rc; + int rc, i; rc = device_add(&idxd->conf_dev); if (rc < 0) return rc; - rc = idxd_setup_wq_sysfs(idxd); + rc = idxd_register_wq_devices(idxd); if (rc < 0) { - /* unregister conf dev */ - dev_dbg(dev, "Work Queue sysfs registering failed: %d\n", rc); - return rc; + dev_dbg(dev, "WQ devices registering failed: %d\n", rc); + goto err_wq; } rc = idxd_setup_group_sysfs(idxd); if (rc < 0) { /* unregister conf dev */ dev_dbg(dev, "Group sysfs registering failed: %d\n", rc); - return rc; + goto err; } rc = idxd_setup_engine_sysfs(idxd); if (rc < 0) { /* unregister conf dev */ dev_dbg(dev, "Engine sysfs registering failed: %d\n", rc); - return rc; + goto err; } return 0; + + err: + for (i = 0; i < idxd->max_wqs; i++) + device_unregister(&idxd->wqs[i]->conf_dev); + err_wq: + device_del(&idxd->conf_dev); + return rc; } void idxd_unregister_devices(struct idxd_device *idxd) @@ -1782,7 +1770,7 @@ void idxd_unregister_devices(struct idxd_device *idxd) int i; for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = &idxd->wqs[i]; + struct idxd_wq *wq = idxd->wqs[i]; device_unregister(&wq->conf_dev); } From 75b911309060f42ba94bbbf46f5f497d35d5cd02 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:44 -0700 Subject: [PATCH 0522/1379] dmaengine: idxd: fix engine conf_dev lifetime Remove devm_* allocation and fix engine->conf_dev 'struct device' lifetime. Address issues flagged by CONFIG_DEBUG_KOBJECT_RELEASE. Add release functions in order to free the allocated memory at the engine conf_dev destruction time. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852986460.2203940.16603218225412118431.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 2 +- drivers/dma/idxd/idxd.h | 3 +- drivers/dma/idxd/init.c | 60 +++++++++++++++++++++++++------- drivers/dma/idxd/sysfs.c | 72 +++++++++++++++++++-------------------- 4 files changed, 86 insertions(+), 51 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index c4183294a704..be1dcddfe3c4 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -786,7 +786,7 @@ static int idxd_engines_setup(struct idxd_device *idxd) } for (i = 0; i < idxd->max_engines; i++) { - eng = &idxd->engines[i]; + eng = idxd->engines[i]; group = eng->group; if (!group) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 6cade6a05314..b9b7e8e8c384 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -195,7 +195,7 @@ struct idxd_device { struct completion *cmd_done; struct idxd_group *groups; struct idxd_wq **wqs; - struct idxd_engine *engines; + struct idxd_engine **engines; struct iommu_sva *sva; unsigned int pasid; @@ -259,6 +259,7 @@ extern bool support_enqcmd; extern struct device_type dsa_device_type; extern struct device_type iax_device_type; extern struct device_type idxd_wq_device_type; +extern struct device_type idxd_engine_device_type; static inline bool is_dsa_dev(struct device *dev) { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index a2dca27aebc3..b90ef2f519eb 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -196,6 +196,46 @@ static int idxd_setup_wqs(struct idxd_device *idxd) return rc; } +static int idxd_setup_engines(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + struct device *dev = &idxd->pdev->dev; + int i, rc; + + idxd->engines = kcalloc_node(idxd->max_engines, sizeof(struct idxd_engine *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->engines) + return -ENOMEM; + + for (i = 0; i < idxd->max_engines; i++) { + engine = kzalloc_node(sizeof(*engine), GFP_KERNEL, dev_to_node(dev)); + if (!engine) { + rc = -ENOMEM; + goto err; + } + + engine->id = i; + engine->idxd = idxd; + device_initialize(&engine->conf_dev); + engine->conf_dev.parent = &idxd->conf_dev; + engine->conf_dev.type = &idxd_engine_device_type; + rc = dev_set_name(&engine->conf_dev, "engine%d.%d", idxd->id, engine->id); + if (rc < 0) { + put_device(&engine->conf_dev); + goto err; + } + + idxd->engines[i] = engine; + } + + return 0; + + err: + while (--i >= 0) + put_device(&idxd->engines[i]->conf_dev); + return rc; +} + static int idxd_setup_internals(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -207,6 +247,10 @@ static int idxd_setup_internals(struct idxd_device *idxd) if (rc < 0) return rc; + rc = idxd_setup_engines(idxd); + if (rc < 0) + goto err_engine; + idxd->groups = devm_kcalloc(dev, idxd->max_groups, sizeof(struct idxd_group), GFP_KERNEL); if (!idxd->groups) { @@ -221,19 +265,6 @@ static int idxd_setup_internals(struct idxd_device *idxd) idxd->groups[i].tc_b = -1; } - idxd->engines = devm_kcalloc(dev, idxd->max_engines, - sizeof(struct idxd_engine), GFP_KERNEL); - if (!idxd->engines) { - rc = -ENOMEM; - goto err; - } - - - for (i = 0; i < idxd->max_engines; i++) { - idxd->engines[i].idxd = idxd; - idxd->engines[i].id = i; - } - idxd->wq = create_workqueue(dev_name(dev)); if (!idxd->wq) { rc = -ENOMEM; @@ -243,6 +274,9 @@ static int idxd_setup_internals(struct idxd_device *idxd) return 0; err: + for (i = 0; i < idxd->max_engines; i++) + put_device(&idxd->engines[i]->conf_dev); + err_engine: for (i = 0; i < idxd->max_wqs; i++) put_device(&idxd->wqs[i]->conf_dev); return rc; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 409b3ce52f07..ab02e3b4d75d 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -26,11 +26,6 @@ static struct device_type idxd_group_device_type = { .release = idxd_conf_sub_device_release, }; -static struct device_type idxd_engine_device_type = { - .name = "engine", - .release = idxd_conf_sub_device_release, -}; - static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { @@ -464,6 +459,19 @@ static const struct attribute_group *idxd_engine_attribute_groups[] = { NULL, }; +static void idxd_conf_engine_release(struct device *dev) +{ + struct idxd_engine *engine = container_of(dev, struct idxd_engine, conf_dev); + + kfree(engine); +} + +struct device_type idxd_engine_device_type = { + .name = "engine", + .release = idxd_conf_engine_release, + .groups = idxd_engine_attribute_groups, +}; + /* Group attributes */ static void idxd_set_free_tokens(struct idxd_device *idxd) @@ -626,7 +634,7 @@ static ssize_t group_engines_show(struct device *dev, struct idxd_device *idxd = group->idxd; for (i = 0; i < idxd->max_engines; i++) { - struct idxd_engine *engine = &idxd->engines[i]; + struct idxd_engine *engine = idxd->engines[i]; if (!engine->group) continue; @@ -1634,37 +1642,27 @@ struct device_type iax_device_type = { .groups = idxd_attribute_groups, }; -static int idxd_setup_engine_sysfs(struct idxd_device *idxd) +static int idxd_register_engine_devices(struct idxd_device *idxd) { - struct device *dev = &idxd->pdev->dev; - int i, rc; + int i, j, rc; for (i = 0; i < idxd->max_engines; i++) { - struct idxd_engine *engine = &idxd->engines[i]; + struct idxd_engine *engine = idxd->engines[i]; - engine->conf_dev.parent = &idxd->conf_dev; - dev_set_name(&engine->conf_dev, "engine%d.%d", - idxd->id, engine->id); - engine->conf_dev.bus = idxd_get_bus_type(idxd); - engine->conf_dev.groups = idxd_engine_attribute_groups; - engine->conf_dev.type = &idxd_engine_device_type; - dev_dbg(dev, "Engine device register: %s\n", - dev_name(&engine->conf_dev)); - rc = device_register(&engine->conf_dev); - if (rc < 0) { - put_device(&engine->conf_dev); + rc = device_add(&engine->conf_dev); + if (rc < 0) goto cleanup; - } } return 0; cleanup: - while (i--) { - struct idxd_engine *engine = &idxd->engines[i]; + j = i - 1; + for (; i < idxd->max_engines; i++) + put_device(&idxd->engines[i]->conf_dev); - device_unregister(&engine->conf_dev); - } + while (j--) + device_unregister(&idxd->engines[j]->conf_dev); return rc; } @@ -1741,23 +1739,25 @@ int idxd_register_devices(struct idxd_device *idxd) goto err_wq; } + rc = idxd_register_engine_devices(idxd); + if (rc < 0) { + dev_dbg(dev, "Engine devices registering failed: %d\n", rc); + goto err_engine; + } + rc = idxd_setup_group_sysfs(idxd); if (rc < 0) { /* unregister conf dev */ dev_dbg(dev, "Group sysfs registering failed: %d\n", rc); - goto err; - } - - rc = idxd_setup_engine_sysfs(idxd); - if (rc < 0) { - /* unregister conf dev */ - dev_dbg(dev, "Engine sysfs registering failed: %d\n", rc); - goto err; + goto err_group; } return 0; - err: + err_group: + for (i = 0; i < idxd->max_engines; i++) + device_unregister(&idxd->engines[i]->conf_dev); + err_engine: for (i = 0; i < idxd->max_wqs; i++) device_unregister(&idxd->wqs[i]->conf_dev); err_wq: @@ -1776,7 +1776,7 @@ void idxd_unregister_devices(struct idxd_device *idxd) } for (i = 0; i < idxd->max_engines; i++) { - struct idxd_engine *engine = &idxd->engines[i]; + struct idxd_engine *engine = idxd->engines[i]; device_unregister(&engine->conf_dev); } From defe49f96012ca91e8e673cb95b5c30b4a3735e8 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:51 -0700 Subject: [PATCH 0523/1379] dmaengine: idxd: fix group conf_dev lifetime Remove devm_* allocation and fix group->conf_dev 'struct device' lifetime. Address issues flagged by CONFIG_DEBUG_KOBJECT_RELEASE. Add release functions in order to free the allocated memory at the group->conf_dev destruction time. Reported-by: Jason Gunthorpe Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852987144.2203940.8830315575880047.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 8 ++--- drivers/dma/idxd/idxd.h | 3 +- drivers/dma/idxd/init.c | 68 ++++++++++++++++++++++++++++++--------- drivers/dma/idxd/sysfs.c | 66 +++++++++++++++++-------------------- 4 files changed, 87 insertions(+), 58 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index be1dcddfe3c4..4fef57717049 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -659,7 +659,7 @@ static int idxd_groups_config_write(struct idxd_device *idxd) ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET)); for (i = 0; i < idxd->max_groups; i++) { - struct idxd_group *group = &idxd->groups[i]; + struct idxd_group *group = idxd->groups[i]; idxd_group_config_write(group); } @@ -754,7 +754,7 @@ static void idxd_group_flags_setup(struct idxd_device *idxd) /* TC-A 0 and TC-B 1 should be defaults */ for (i = 0; i < idxd->max_groups; i++) { - struct idxd_group *group = &idxd->groups[i]; + struct idxd_group *group = idxd->groups[i]; if (group->tc_a == -1) group->tc_a = group->grpcfg.flags.tc_a = 0; @@ -781,7 +781,7 @@ static int idxd_engines_setup(struct idxd_device *idxd) struct idxd_group *group; for (i = 0; i < idxd->max_groups; i++) { - group = &idxd->groups[i]; + group = idxd->groups[i]; group->grpcfg.engines = 0; } @@ -810,7 +810,7 @@ static int idxd_wqs_setup(struct idxd_device *idxd) struct device *dev = &idxd->pdev->dev; for (i = 0; i < idxd->max_groups; i++) { - group = &idxd->groups[i]; + group = idxd->groups[i]; for (j = 0; j < 4; j++) group->grpcfg.wqs[j] = 0; } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index b9b7e8e8c384..3c4ce7997c88 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -193,7 +193,7 @@ struct idxd_device { spinlock_t dev_lock; /* spinlock for device */ struct completion *cmd_done; - struct idxd_group *groups; + struct idxd_group **groups; struct idxd_wq **wqs; struct idxd_engine **engines; @@ -260,6 +260,7 @@ extern struct device_type dsa_device_type; extern struct device_type iax_device_type; extern struct device_type idxd_wq_device_type; extern struct device_type idxd_engine_device_type; +extern struct device_type idxd_group_device_type; static inline bool is_dsa_dev(struct device *dev) { diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index b90ef2f519eb..c20ea6bf09bf 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -236,10 +236,53 @@ static int idxd_setup_engines(struct idxd_device *idxd) return rc; } +static int idxd_setup_groups(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + struct idxd_group *group; + int i, rc; + + idxd->groups = kcalloc_node(idxd->max_groups, sizeof(struct idxd_group *), + GFP_KERNEL, dev_to_node(dev)); + if (!idxd->groups) + return -ENOMEM; + + for (i = 0; i < idxd->max_groups; i++) { + group = kzalloc_node(sizeof(*group), GFP_KERNEL, dev_to_node(dev)); + if (!group) { + rc = -ENOMEM; + goto err; + } + + group->id = i; + group->idxd = idxd; + device_initialize(&group->conf_dev); + group->conf_dev.parent = &idxd->conf_dev; + group->conf_dev.bus = idxd_get_bus_type(idxd); + group->conf_dev.type = &idxd_group_device_type; + rc = dev_set_name(&group->conf_dev, "group%d.%d", idxd->id, group->id); + if (rc < 0) { + put_device(&group->conf_dev); + goto err; + } + + idxd->groups[i] = group; + group->tc_a = -1; + group->tc_b = -1; + } + + return 0; + + err: + while (--i >= 0) + put_device(&idxd->groups[i]->conf_dev); + return rc; +} + static int idxd_setup_internals(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; - int i, rc; + int rc, i; init_waitqueue_head(&idxd->cmd_waitq); @@ -251,29 +294,22 @@ static int idxd_setup_internals(struct idxd_device *idxd) if (rc < 0) goto err_engine; - idxd->groups = devm_kcalloc(dev, idxd->max_groups, - sizeof(struct idxd_group), GFP_KERNEL); - if (!idxd->groups) { - rc = -ENOMEM; - goto err; - } - - for (i = 0; i < idxd->max_groups; i++) { - idxd->groups[i].idxd = idxd; - idxd->groups[i].id = i; - idxd->groups[i].tc_a = -1; - idxd->groups[i].tc_b = -1; - } + rc = idxd_setup_groups(idxd); + if (rc < 0) + goto err_group; idxd->wq = create_workqueue(dev_name(dev)); if (!idxd->wq) { rc = -ENOMEM; - goto err; + goto err_wkq_create; } return 0; - err: + err_wkq_create: + for (i = 0; i < idxd->max_groups; i++) + put_device(&idxd->groups[i]->conf_dev); + err_group: for (i = 0; i < idxd->max_engines; i++) put_device(&idxd->engines[i]->conf_dev); err_engine: diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index ab02e3b4d75d..f793688039c9 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -16,16 +16,6 @@ static char *idxd_wq_type_names[] = { [IDXD_WQT_USER] = "user", }; -static void idxd_conf_sub_device_release(struct device *dev) -{ - dev_dbg(dev, "%s for %s\n", __func__, dev_name(dev)); -} - -static struct device_type idxd_group_device_type = { - .name = "group", - .release = idxd_conf_sub_device_release, -}; - static int idxd_config_bus_match(struct device *dev, struct device_driver *drv) { @@ -435,7 +425,7 @@ static ssize_t engine_group_id_store(struct device *dev, if (prevg) prevg->num_engines--; - engine->group = &idxd->groups[id]; + engine->group = idxd->groups[id]; engine->group->num_engines++; return count; @@ -479,7 +469,7 @@ static void idxd_set_free_tokens(struct idxd_device *idxd) int i, tokens; for (i = 0, tokens = 0; i < idxd->max_groups; i++) { - struct idxd_group *g = &idxd->groups[i]; + struct idxd_group *g = idxd->groups[i]; tokens += g->tokens_reserved; } @@ -784,6 +774,19 @@ static const struct attribute_group *idxd_group_attribute_groups[] = { NULL, }; +static void idxd_conf_group_release(struct device *dev) +{ + struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); + + kfree(group); +} + +struct device_type idxd_group_device_type = { + .name = "group", + .release = idxd_conf_group_release, + .groups = idxd_group_attribute_groups, +}; + /* IDXD work queue attribs */ static ssize_t wq_clients_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -856,7 +859,7 @@ static ssize_t wq_group_id_store(struct device *dev, return count; } - group = &idxd->groups[id]; + group = idxd->groups[id]; prevg = wq->group; if (prevg) @@ -1666,37 +1669,27 @@ cleanup: return rc; } -static int idxd_setup_group_sysfs(struct idxd_device *idxd) +static int idxd_register_group_devices(struct idxd_device *idxd) { - struct device *dev = &idxd->pdev->dev; - int i, rc; + int i, j, rc; for (i = 0; i < idxd->max_groups; i++) { - struct idxd_group *group = &idxd->groups[i]; + struct idxd_group *group = idxd->groups[i]; - group->conf_dev.parent = &idxd->conf_dev; - dev_set_name(&group->conf_dev, "group%d.%d", - idxd->id, group->id); - group->conf_dev.bus = idxd_get_bus_type(idxd); - group->conf_dev.groups = idxd_group_attribute_groups; - group->conf_dev.type = &idxd_group_device_type; - dev_dbg(dev, "Group device register: %s\n", - dev_name(&group->conf_dev)); - rc = device_register(&group->conf_dev); - if (rc < 0) { - put_device(&group->conf_dev); + rc = device_add(&group->conf_dev); + if (rc < 0) goto cleanup; - } } return 0; cleanup: - while (i--) { - struct idxd_group *group = &idxd->groups[i]; + j = i - 1; + for (; i < idxd->max_groups; i++) + put_device(&idxd->groups[i]->conf_dev); - device_unregister(&group->conf_dev); - } + while (j--) + device_unregister(&idxd->groups[j]->conf_dev); return rc; } @@ -1745,10 +1738,9 @@ int idxd_register_devices(struct idxd_device *idxd) goto err_engine; } - rc = idxd_setup_group_sysfs(idxd); + rc = idxd_register_group_devices(idxd); if (rc < 0) { - /* unregister conf dev */ - dev_dbg(dev, "Group sysfs registering failed: %d\n", rc); + dev_dbg(dev, "Group device registering failed: %d\n", rc); goto err_group; } @@ -1782,7 +1774,7 @@ void idxd_unregister_devices(struct idxd_device *idxd) } for (i = 0; i < idxd->max_groups; i++) { - struct idxd_group *group = &idxd->groups[i]; + struct idxd_group *group = idxd->groups[i]; device_unregister(&group->conf_dev); } From 04922b7445a1950b86f130a1fe8c52cc27b3e30b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:37:57 -0700 Subject: [PATCH 0524/1379] dmaengine: idxd: fix cdev setup and free device lifetime issues The char device setup and cleanup has device lifetime issues regarding when parts are initialized and cleaned up. The initialization of struct device is done incorrectly. device_initialize() needs to be called on the 'struct device' and then additional changes can be added. The ->release() function needs to be setup via device_type before dev_set_name() to allow proper cleanup. The change re-parents the cdev under the wq->conf_dev to get natural reference inheritance. No known dependency on the old device path exists. Reported-by: Jason Gunthorpe Fixes: 42d279f9137a ("dmaengine: idxd: add char driver to expose submission portal to userland") Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161852987721.2203940.1478218825576630810.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 129 ++++++++++++++------------------------- drivers/dma/idxd/idxd.h | 7 ++- drivers/dma/idxd/init.c | 2 +- drivers/dma/idxd/irq.c | 4 +- drivers/dma/idxd/sysfs.c | 10 ++- 5 files changed, 63 insertions(+), 89 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 0db9b82ed8cf..1d8a3876b745 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -39,15 +39,15 @@ struct idxd_user_context { struct iommu_sva *sva; }; -enum idxd_cdev_cleanup { - CDEV_NORMAL = 0, - CDEV_FAILED, -}; - static void idxd_cdev_dev_release(struct device *dev) { - dev_dbg(dev, "releasing cdev device\n"); - kfree(dev); + struct idxd_cdev *idxd_cdev = container_of(dev, struct idxd_cdev, dev); + struct idxd_cdev_context *cdev_ctx; + struct idxd_wq *wq = idxd_cdev->wq; + + cdev_ctx = &ictx[wq->idxd->type]; + ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); + kfree(idxd_cdev); } static struct device_type idxd_cdev_device_type = { @@ -62,14 +62,11 @@ static inline struct idxd_cdev *inode_idxd_cdev(struct inode *inode) return container_of(cdev, struct idxd_cdev, cdev); } -static inline struct idxd_wq *idxd_cdev_wq(struct idxd_cdev *idxd_cdev) -{ - return container_of(idxd_cdev, struct idxd_wq, idxd_cdev); -} - static inline struct idxd_wq *inode_wq(struct inode *inode) { - return idxd_cdev_wq(inode_idxd_cdev(inode)); + struct idxd_cdev *idxd_cdev = inode_idxd_cdev(inode); + + return idxd_cdev->wq; } static int idxd_cdev_open(struct inode *inode, struct file *filp) @@ -220,11 +217,10 @@ static __poll_t idxd_cdev_poll(struct file *filp, struct idxd_user_context *ctx = filp->private_data; struct idxd_wq *wq = ctx->wq; struct idxd_device *idxd = wq->idxd; - struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; unsigned long flags; __poll_t out = 0; - poll_wait(filp, &idxd_cdev->err_queue, wait); + poll_wait(filp, &wq->err_queue, wait); spin_lock_irqsave(&idxd->dev_lock, flags); if (idxd->sw_err.valid) out = EPOLLIN | EPOLLRDNORM; @@ -246,98 +242,67 @@ int idxd_cdev_get_major(struct idxd_device *idxd) return MAJOR(ictx[idxd->type].devt); } -static int idxd_wq_cdev_dev_setup(struct idxd_wq *wq) +int idxd_wq_add_cdev(struct idxd_wq *wq) { struct idxd_device *idxd = wq->idxd; - struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; - struct idxd_cdev_context *cdev_ctx; + struct idxd_cdev *idxd_cdev; + struct cdev *cdev; struct device *dev; - int minor, rc; + struct idxd_cdev_context *cdev_ctx; + int rc, minor; - idxd_cdev->dev = kzalloc(sizeof(*idxd_cdev->dev), GFP_KERNEL); - if (!idxd_cdev->dev) + idxd_cdev = kzalloc(sizeof(*idxd_cdev), GFP_KERNEL); + if (!idxd_cdev) return -ENOMEM; - dev = idxd_cdev->dev; - dev->parent = &idxd->pdev->dev; - dev_set_name(dev, "%s/wq%u.%u", idxd_get_dev_name(idxd), - idxd->id, wq->id); - dev->bus = idxd_get_bus_type(idxd); - + idxd_cdev->wq = wq; + cdev = &idxd_cdev->cdev; + dev = &idxd_cdev->dev; cdev_ctx = &ictx[wq->idxd->type]; minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); if (minor < 0) { - rc = minor; - kfree(dev); - goto ida_err; - } - - dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); - dev->type = &idxd_cdev_device_type; - rc = device_register(dev); - if (rc < 0) { - dev_err(&idxd->pdev->dev, "device register failed\n"); - goto dev_reg_err; + kfree(idxd_cdev); + return minor; } idxd_cdev->minor = minor; - return 0; + device_initialize(dev); + dev->parent = &wq->conf_dev; + dev->bus = idxd_get_bus_type(idxd); + dev->type = &idxd_cdev_device_type; + dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); - dev_reg_err: - ida_simple_remove(&cdev_ctx->minor_ida, MINOR(dev->devt)); - put_device(dev); - ida_err: - idxd_cdev->dev = NULL; - return rc; -} - -static void idxd_wq_cdev_cleanup(struct idxd_wq *wq, - enum idxd_cdev_cleanup cdev_state) -{ - struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; - struct idxd_cdev_context *cdev_ctx; - - cdev_ctx = &ictx[wq->idxd->type]; - if (cdev_state == CDEV_NORMAL) - cdev_del(&idxd_cdev->cdev); - device_unregister(idxd_cdev->dev); - /* - * The device_type->release() will be called on the device and free - * the allocated struct device. We can just forget it. - */ - ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); - idxd_cdev->dev = NULL; - idxd_cdev->minor = -1; -} - -int idxd_wq_add_cdev(struct idxd_wq *wq) -{ - struct idxd_cdev *idxd_cdev = &wq->idxd_cdev; - struct cdev *cdev = &idxd_cdev->cdev; - struct device *dev; - int rc; - - rc = idxd_wq_cdev_dev_setup(wq); + rc = dev_set_name(dev, "%s/wq%u.%u", idxd_get_dev_name(idxd), + idxd->id, wq->id); if (rc < 0) - return rc; + goto err; - dev = idxd_cdev->dev; + wq->idxd_cdev = idxd_cdev; cdev_init(cdev, &idxd_cdev_fops); - cdev_set_parent(cdev, &dev->kobj); - rc = cdev_add(cdev, dev->devt, 1); + rc = cdev_device_add(cdev, dev); if (rc) { dev_dbg(&wq->idxd->pdev->dev, "cdev_add failed: %d\n", rc); - idxd_wq_cdev_cleanup(wq, CDEV_FAILED); - return rc; + goto err; } - init_waitqueue_head(&idxd_cdev->err_queue); return 0; + + err: + put_device(dev); + wq->idxd_cdev = NULL; + return rc; } void idxd_wq_del_cdev(struct idxd_wq *wq) { - idxd_wq_cdev_cleanup(wq, CDEV_NORMAL); + struct idxd_cdev *idxd_cdev; + struct idxd_cdev_context *cdev_ctx; + + cdev_ctx = &ictx[wq->idxd->type]; + idxd_cdev = wq->idxd_cdev; + wq->idxd_cdev = NULL; + cdev_device_del(&idxd_cdev->cdev, &idxd_cdev->dev); + put_device(&idxd_cdev->dev); } int idxd_cdev_register(void) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 3c4ce7997c88..89daf746d121 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -80,10 +80,10 @@ enum idxd_wq_type { }; struct idxd_cdev { + struct idxd_wq *wq; struct cdev cdev; - struct device *dev; + struct device dev; int minor; - struct wait_queue_head err_queue; }; #define IDXD_ALLOCATED_BATCH_SIZE 128U @@ -109,7 +109,8 @@ struct idxd_dma_chan { struct idxd_wq { void __iomem *portal; struct device conf_dev; - struct idxd_cdev idxd_cdev; + struct idxd_cdev *idxd_cdev; + struct wait_queue_head err_queue; struct idxd_device *idxd; int id; enum idxd_wq_type type; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index c20ea6bf09bf..07cf7977a045 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -176,7 +176,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) } mutex_init(&wq->wq_lock); - wq->idxd_cdev.minor = -1; + init_waitqueue_head(&wq->err_queue); wq->max_xfer_bytes = idxd->max_xfer_bytes; wq->max_batch_size = idxd->max_batch_size; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 7b0181532f77..fc0781e3f36d 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -133,7 +133,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) struct idxd_wq *wq = idxd->wqs[id]; if (wq->type == IDXD_WQT_USER) - wake_up_interruptible(&wq->idxd_cdev.err_queue); + wake_up_interruptible(&wq->err_queue); } else { int i; @@ -141,7 +141,7 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) struct idxd_wq *wq = idxd->wqs[i]; if (wq->type == IDXD_WQT_USER) - wake_up_interruptible(&wq->idxd_cdev.err_queue); + wake_up_interruptible(&wq->err_queue); } } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index f793688039c9..9586b55abce5 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1169,8 +1169,16 @@ static ssize_t wq_cdev_minor_show(struct device *dev, struct device_attribute *attr, char *buf) { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); + int minor = -1; - return sprintf(buf, "%d\n", wq->idxd_cdev.minor); + mutex_lock(&wq->wq_lock); + if (wq->idxd_cdev) + minor = wq->idxd_cdev->minor; + mutex_unlock(&wq->wq_lock); + + if (minor == -1) + return -ENXIO; + return sysfs_emit(buf, "%d\n", minor); } static struct device_attribute dev_attr_wq_cdev_minor = From 4b73e4ebd43ce48101a4c09bf13d439a954d61c5 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:38:03 -0700 Subject: [PATCH 0525/1379] dmaengine: idxd: iax bus removal There is no need to have an additional bus for the IAX device. The removal of IAX will change user ABI as /sys/bus/iax will no longer exist. The iax device will be moved to the dsa bus. The device id for dsa and iax will now be combined rather than unique for each device type in order to accommodate the iax devices. This is in preparation for fixing the sub-driver code for idxd. There's no hardware deployment for Sapphire Rapids platform yet, which means that users have no reason to have developed scripts against this ABI. There is some exposure to released versions of accel-config, but those are being fixed up and an accel-config upgrade is reasonable to get IAX support. As far as accel-config is concerned IAX support starts when these devices appear under /sys/bus/dsa, and old accel-config just assumes that an empty / missing /sys/bus/iax just means a lack of platform support. Fixes: f25b463883a8 ("dmaengine: idxd: add IAX configuration support in the IDXD driver") Suggested-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852988298.2203940.4529909758034944428.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 2 +- drivers/dma/idxd/idxd.h | 3 +- drivers/dma/idxd/init.c | 21 ++++-------- drivers/dma/idxd/sysfs.c | 74 +++------------------------------------- 4 files changed, 13 insertions(+), 87 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 1d8a3876b745..2d976905b2a3 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -268,7 +268,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) device_initialize(dev); dev->parent = &wq->conf_dev; - dev->bus = idxd_get_bus_type(idxd); + dev->bus = &dsa_bus_type; dev->type = &idxd_cdev_device_type; dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 89daf746d121..b17415aa42bd 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -257,6 +257,7 @@ extern struct bus_type dsa_bus_type; extern struct bus_type iax_bus_type; extern bool support_enqcmd; +extern struct ida idxd_ida; extern struct device_type dsa_device_type; extern struct device_type iax_device_type; extern struct device_type idxd_wq_device_type; @@ -346,7 +347,6 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) return wq->client_count; }; -struct ida *idxd_ida(struct idxd_device *idxd); const char *idxd_get_dev_name(struct idxd_device *idxd); int idxd_register_bus_type(void); void idxd_unregister_bus_type(void); @@ -354,7 +354,6 @@ int idxd_register_devices(struct idxd_device *idxd); void idxd_unregister_devices(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); -struct bus_type *idxd_get_bus_type(struct idxd_device *idxd); struct device_type *idxd_get_device_type(struct idxd_device *idxd); /* device interrupt control */ diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 07cf7977a045..be922a8c2784 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -33,8 +33,7 @@ MODULE_PARM_DESC(sva, "Toggle SVA support on/off"); #define DRV_NAME "idxd" bool support_enqcmd; - -static struct ida idxd_idas[IDXD_TYPE_MAX]; +DEFINE_IDA(idxd_ida); static struct pci_device_id idxd_pci_tbl[] = { /* DSA ver 1.0 platforms */ @@ -51,11 +50,6 @@ static char *idxd_name[] = { "iax" }; -struct ida *idxd_ida(struct idxd_device *idxd) -{ - return &idxd_idas[idxd->type]; -} - const char *idxd_get_dev_name(struct idxd_device *idxd) { return idxd_name[idxd->type]; @@ -167,7 +161,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) wq->idxd = idxd; device_initialize(&wq->conf_dev); wq->conf_dev.parent = &idxd->conf_dev; - wq->conf_dev.bus = idxd_get_bus_type(idxd); + wq->conf_dev.bus = &dsa_bus_type; wq->conf_dev.type = &idxd_wq_device_type; rc = dev_set_name(&wq->conf_dev, "wq%d.%d", idxd->id, wq->id); if (rc < 0) { @@ -258,7 +252,7 @@ static int idxd_setup_groups(struct idxd_device *idxd) group->idxd = idxd; device_initialize(&group->conf_dev); group->conf_dev.parent = &idxd->conf_dev; - group->conf_dev.bus = idxd_get_bus_type(idxd); + group->conf_dev.bus = &dsa_bus_type; group->conf_dev.type = &idxd_group_device_type; rc = dev_set_name(&group->conf_dev, "group%d.%d", idxd->id, group->id); if (rc < 0) { @@ -409,13 +403,13 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev) idxd->pdev = pdev; idxd_set_type(idxd); - idxd->id = ida_alloc(idxd_ida(idxd), GFP_KERNEL); + idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL); if (idxd->id < 0) return NULL; device_initialize(&idxd->conf_dev); idxd->conf_dev.parent = dev; - idxd->conf_dev.bus = idxd_get_bus_type(idxd); + idxd->conf_dev.bus = &dsa_bus_type; idxd->conf_dev.type = idxd_get_device_type(idxd); rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd_get_dev_name(idxd), idxd->id); if (rc < 0) { @@ -669,7 +663,7 @@ static struct pci_driver idxd_pci_driver = { static int __init idxd_init_module(void) { - int err, i; + int err; /* * If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in @@ -685,9 +679,6 @@ static int __init idxd_init_module(void) else support_enqcmd = true; - for (i = 0; i < IDXD_TYPE_MAX; i++) - ida_init(&idxd_idas[i]); - err = idxd_register_bus_type(); if (err < 0) return err; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 9586b55abce5..b97a0a817dfb 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -301,19 +301,6 @@ struct bus_type dsa_bus_type = { .shutdown = idxd_config_bus_shutdown, }; -struct bus_type iax_bus_type = { - .name = "iax", - .match = idxd_config_bus_match, - .probe = idxd_config_bus_probe, - .remove = idxd_config_bus_remove, - .shutdown = idxd_config_bus_shutdown, -}; - -static struct bus_type *idxd_bus_types[] = { - &dsa_bus_type, - &iax_bus_type -}; - static struct idxd_device_driver dsa_drv = { .drv = { .name = "dsa", @@ -323,25 +310,6 @@ static struct idxd_device_driver dsa_drv = { }, }; -static struct idxd_device_driver iax_drv = { - .drv = { - .name = "iax", - .bus = &iax_bus_type, - .owner = THIS_MODULE, - .mod_name = KBUILD_MODNAME, - }, -}; - -static struct idxd_device_driver *idxd_drvs[] = { - &dsa_drv, - &iax_drv -}; - -struct bus_type *idxd_get_bus_type(struct idxd_device *idxd) -{ - return idxd_bus_types[idxd->type]; -} - struct device_type *idxd_get_device_type(struct idxd_device *idxd) { if (idxd->type == IDXD_TYPE_DSA) @@ -355,28 +323,12 @@ struct device_type *idxd_get_device_type(struct idxd_device *idxd) /* IDXD generic driver setup */ int idxd_register_driver(void) { - int i, rc; - - for (i = 0; i < IDXD_TYPE_MAX; i++) { - rc = driver_register(&idxd_drvs[i]->drv); - if (rc < 0) - goto drv_fail; - } - - return 0; - -drv_fail: - while (--i >= 0) - driver_unregister(&idxd_drvs[i]->drv); - return rc; + return driver_register(&dsa_drv.drv); } void idxd_unregister_driver(void) { - int i; - - for (i = 0; i < IDXD_TYPE_MAX; i++) - driver_unregister(&idxd_drvs[i]->drv); + driver_unregister(&dsa_drv.drv); } /* IDXD engine attributes */ @@ -1637,7 +1589,7 @@ static void idxd_conf_device_release(struct device *dev) kfree(idxd->wqs); kfree(idxd->engines); kfree(idxd->irq_entries); - ida_free(idxd_ida(idxd), idxd->id); + ida_free(&idxd_ida, idxd->id); kfree(idxd); } @@ -1792,26 +1744,10 @@ void idxd_unregister_devices(struct idxd_device *idxd) int idxd_register_bus_type(void) { - int i, rc; - - for (i = 0; i < IDXD_TYPE_MAX; i++) { - rc = bus_register(idxd_bus_types[i]); - if (rc < 0) - goto bus_err; - } - - return 0; - -bus_err: - while (--i >= 0) - bus_unregister(idxd_bus_types[i]); - return rc; + return bus_register(&dsa_bus_type); } void idxd_unregister_bus_type(void) { - int i; - - for (i = 0; i < IDXD_TYPE_MAX; i++) - bus_unregister(idxd_bus_types[i]); + bus_unregister(&dsa_bus_type); } From 435b512dbc0dac42b34348393049b386bb1a19bd Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 15 Apr 2021 16:38:09 -0700 Subject: [PATCH 0526/1379] dmaengine: idxd: remove detection of device type Move all static data type for per device type to an idxd_driver_data data structure. The data can be attached to the pci_device_id and provided by the pci probe function. This removes a lot of unnecessary type detection and setup code. Suggested-by: Dan Williams Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161852988924.2203940.2787590808682466398.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/cdev.c | 11 +++---- drivers/dma/idxd/device.c | 16 +++------- drivers/dma/idxd/idxd.h | 13 +++++--- drivers/dma/idxd/init.c | 65 +++++++++++++++------------------------ drivers/dma/idxd/submit.c | 2 +- drivers/dma/idxd/sysfs.c | 16 ++-------- 6 files changed, 48 insertions(+), 75 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 2d976905b2a3..302cba5ff779 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -45,7 +45,7 @@ static void idxd_cdev_dev_release(struct device *dev) struct idxd_cdev_context *cdev_ctx; struct idxd_wq *wq = idxd_cdev->wq; - cdev_ctx = &ictx[wq->idxd->type]; + cdev_ctx = &ictx[wq->idxd->data->type]; ida_simple_remove(&cdev_ctx->minor_ida, idxd_cdev->minor); kfree(idxd_cdev); } @@ -239,7 +239,7 @@ static const struct file_operations idxd_cdev_fops = { int idxd_cdev_get_major(struct idxd_device *idxd) { - return MAJOR(ictx[idxd->type].devt); + return MAJOR(ictx[idxd->data->type].devt); } int idxd_wq_add_cdev(struct idxd_wq *wq) @@ -258,7 +258,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) idxd_cdev->wq = wq; cdev = &idxd_cdev->cdev; dev = &idxd_cdev->dev; - cdev_ctx = &ictx[wq->idxd->type]; + cdev_ctx = &ictx[wq->idxd->data->type]; minor = ida_simple_get(&cdev_ctx->minor_ida, 0, MINORMASK, GFP_KERNEL); if (minor < 0) { kfree(idxd_cdev); @@ -272,8 +272,7 @@ int idxd_wq_add_cdev(struct idxd_wq *wq) dev->type = &idxd_cdev_device_type; dev->devt = MKDEV(MAJOR(cdev_ctx->devt), minor); - rc = dev_set_name(dev, "%s/wq%u.%u", idxd_get_dev_name(idxd), - idxd->id, wq->id); + rc = dev_set_name(dev, "%s/wq%u.%u", idxd->data->name_prefix, idxd->id, wq->id); if (rc < 0) goto err; @@ -298,7 +297,7 @@ void idxd_wq_del_cdev(struct idxd_wq *wq) struct idxd_cdev *idxd_cdev; struct idxd_cdev_context *cdev_ctx; - cdev_ctx = &ictx[wq->idxd->type]; + cdev_ctx = &ictx[wq->idxd->data->type]; idxd_cdev = wq->idxd_cdev; wq->idxd_cdev = NULL; cdev_device_del(&idxd_cdev->cdev, &idxd_cdev->dev); diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 4fef57717049..016df87cf5c5 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -144,14 +144,8 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) if (rc < 0) return rc; - if (idxd->type == IDXD_TYPE_DSA) - align = 32; - else if (idxd->type == IDXD_TYPE_IAX) - align = 64; - else - return -ENODEV; - - wq->compls_size = num_descs * idxd->compl_size + align; + align = idxd->data->align; + wq->compls_size = num_descs * idxd->data->compl_size + align; wq->compls_raw = dma_alloc_coherent(dev, wq->compls_size, &wq->compls_addr_raw, GFP_KERNEL); if (!wq->compls_raw) { @@ -178,11 +172,11 @@ int idxd_wq_alloc_resources(struct idxd_wq *wq) struct idxd_desc *desc = wq->descs[i]; desc->hw = wq->hw_descs[i]; - if (idxd->type == IDXD_TYPE_DSA) + if (idxd->data->type == IDXD_TYPE_DSA) desc->completion = &wq->compls[i]; - else if (idxd->type == IDXD_TYPE_IAX) + else if (idxd->data->type == IDXD_TYPE_IAX) desc->iax_completion = &wq->iax_compls[i]; - desc->compl_dma = wq->compls_addr + idxd->compl_size * i; + desc->compl_dma = wq->compls_addr + idxd->data->compl_size * i; desc->id = i; desc->wq = wq; desc->cpu = -1; diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index b17415aa42bd..8055e872953c 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -178,9 +178,17 @@ struct idxd_dma_dev { struct dma_device dma; }; -struct idxd_device { +struct idxd_driver_data { + const char *name_prefix; enum idxd_type type; + struct device_type *dev_type; + int compl_size; + int align; +}; + +struct idxd_device { struct device conf_dev; + struct idxd_driver_data *data; struct list_head list; struct idxd_hw hw; enum idxd_device_state state; @@ -218,7 +226,6 @@ struct idxd_device { int token_limit; int nr_tokens; /* non-reserved tokens */ unsigned int wqcfg_size; - int compl_size; union sw_err_reg sw_err; wait_queue_head_t cmd_waitq; @@ -347,14 +354,12 @@ static inline int idxd_wq_refcount(struct idxd_wq *wq) return wq->client_count; }; -const char *idxd_get_dev_name(struct idxd_device *idxd); int idxd_register_bus_type(void); void idxd_unregister_bus_type(void); int idxd_register_devices(struct idxd_device *idxd); void idxd_unregister_devices(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); -struct device_type *idxd_get_device_type(struct idxd_device *idxd); /* device interrupt control */ void idxd_msix_perm_setup(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index be922a8c2784..e8f64324bb3a 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -35,26 +35,33 @@ MODULE_PARM_DESC(sva, "Toggle SVA support on/off"); bool support_enqcmd; DEFINE_IDA(idxd_ida); +static struct idxd_driver_data idxd_driver_data[] = { + [IDXD_TYPE_DSA] = { + .name_prefix = "dsa", + .type = IDXD_TYPE_DSA, + .compl_size = sizeof(struct dsa_completion_record), + .align = 32, + .dev_type = &dsa_device_type, + }, + [IDXD_TYPE_IAX] = { + .name_prefix = "iax", + .type = IDXD_TYPE_IAX, + .compl_size = sizeof(struct iax_completion_record), + .align = 64, + .dev_type = &iax_device_type, + }, +}; + static struct pci_device_id idxd_pci_tbl[] = { /* DSA ver 1.0 platforms */ - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_DSA_SPR0) }, + { PCI_DEVICE_DATA(INTEL, DSA_SPR0, &idxd_driver_data[IDXD_TYPE_DSA]) }, /* IAX ver 1.0 platforms */ - { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IAX_SPR0) }, + { PCI_DEVICE_DATA(INTEL, IAX_SPR0, &idxd_driver_data[IDXD_TYPE_IAX]) }, { 0, } }; MODULE_DEVICE_TABLE(pci, idxd_pci_tbl); -static char *idxd_name[] = { - "dsa", - "iax" -}; - -const char *idxd_get_dev_name(struct idxd_device *idxd) -{ - return idxd_name[idxd->type]; -} - static int idxd_setup_interrupts(struct idxd_device *idxd) { struct pci_dev *pdev = idxd->pdev; @@ -379,19 +386,7 @@ static void idxd_read_caps(struct idxd_device *idxd) } } -static inline void idxd_set_type(struct idxd_device *idxd) -{ - struct pci_dev *pdev = idxd->pdev; - - if (pdev->device == PCI_DEVICE_ID_INTEL_DSA_SPR0) - idxd->type = IDXD_TYPE_DSA; - else if (pdev->device == PCI_DEVICE_ID_INTEL_IAX_SPR0) - idxd->type = IDXD_TYPE_IAX; - else - idxd->type = IDXD_TYPE_UNKNOWN; -} - -static struct idxd_device *idxd_alloc(struct pci_dev *pdev) +static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_data *data) { struct device *dev = &pdev->dev; struct idxd_device *idxd; @@ -402,7 +397,7 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev) return NULL; idxd->pdev = pdev; - idxd_set_type(idxd); + idxd->data = data; idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL); if (idxd->id < 0) return NULL; @@ -410,8 +405,8 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev) device_initialize(&idxd->conf_dev); idxd->conf_dev.parent = dev; idxd->conf_dev.bus = &dsa_bus_type; - idxd->conf_dev.type = idxd_get_device_type(idxd); - rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd_get_dev_name(idxd), idxd->id); + idxd->conf_dev.type = idxd->data->dev_type; + rc = dev_set_name(&idxd->conf_dev, "%s%d", idxd->data->name_prefix, idxd->id); if (rc < 0) { put_device(&idxd->conf_dev); return NULL; @@ -503,18 +498,11 @@ static int idxd_probe(struct idxd_device *idxd) return rc; } -static void idxd_type_init(struct idxd_device *idxd) -{ - if (idxd->type == IDXD_TYPE_DSA) - idxd->compl_size = sizeof(struct dsa_completion_record); - else if (idxd->type == IDXD_TYPE_IAX) - idxd->compl_size = sizeof(struct iax_completion_record); -} - static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct device *dev = &pdev->dev; struct idxd_device *idxd; + struct idxd_driver_data *data = (struct idxd_driver_data *)id->driver_data; int rc; rc = pci_enable_device(pdev); @@ -522,7 +510,7 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) return rc; dev_dbg(dev, "Alloc IDXD context\n"); - idxd = idxd_alloc(pdev); + idxd = idxd_alloc(pdev, data); if (!idxd) { rc = -ENOMEM; goto err_idxd_alloc; @@ -548,9 +536,6 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) goto err; - - idxd_type_init(idxd); - dev_dbg(dev, "Set PCI master\n"); pci_set_master(pdev); pci_set_drvdata(pdev, idxd); diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index a7a61bcc17d5..dfc8900d5de3 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -15,7 +15,7 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) desc = wq->descs[idx]; memset(desc->hw, 0, sizeof(struct dsa_hw_desc)); - memset(desc->completion, 0, idxd->compl_size); + memset(desc->completion, 0, idxd->data->compl_size); desc->cpu = cpu; if (device_pasid_enabled(idxd)) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index b97a0a817dfb..581ce56ae4f5 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -310,16 +310,6 @@ static struct idxd_device_driver dsa_drv = { }, }; -struct device_type *idxd_get_device_type(struct idxd_device *idxd) -{ - if (idxd->type == IDXD_TYPE_DSA) - return &dsa_device_type; - else if (idxd->type == IDXD_TYPE_IAX) - return &iax_device_type; - else - return NULL; -} - /* IDXD generic driver setup */ int idxd_register_driver(void) { @@ -453,7 +443,7 @@ static ssize_t group_tokens_reserved_store(struct device *dev, if (rc < 0) return -EINVAL; - if (idxd->type == IDXD_TYPE_IAX) + if (idxd->data->type == IDXD_TYPE_IAX) return -EOPNOTSUPP; if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) @@ -501,7 +491,7 @@ static ssize_t group_tokens_allowed_store(struct device *dev, if (rc < 0) return -EINVAL; - if (idxd->type == IDXD_TYPE_IAX) + if (idxd->data->type == IDXD_TYPE_IAX) return -EOPNOTSUPP; if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) @@ -546,7 +536,7 @@ static ssize_t group_use_token_limit_store(struct device *dev, if (rc < 0) return -EINVAL; - if (idxd->type == IDXD_TYPE_IAX) + if (idxd->data->type == IDXD_TYPE_IAX) return -EOPNOTSUPP; if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) From 2c99e55f795593c5f029f65c4b4ab2a72bb076a3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:32 +0100 Subject: [PATCH 0527/1379] PCI: tegra: Convert to MSI domains In anticipation of the removal of the msi_controller structure, convert the Tegra host controller driver to MSI domains. We end-up with the usual two domain structure, the top one being a generic PCI/MSI domain, the bottom one being Tegra-specific and handling the actual HW interrupt allocation. While at it, convert the normal interrupt handler to a chained handler, handle the controller's MSI IRQ edge triggered, support multiple MSIs per device and use the AFI_MSI_EN_VEC* registers to provide MSI masking. [treding@nvidia.com: fix, clean up and address TODOs from Marc's draft] Link: https://lore.kernel.org/r/20210330151145.997953-2-maz@kernel.org Signed-off-by: Thierry Reding Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/controller/Kconfig | 1 - drivers/pci/controller/pci-tegra.c | 363 ++++++++++++++++------------- 2 files changed, 198 insertions(+), 166 deletions(-) diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 5aa8977d7b0f..be8f9ff512a0 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -41,7 +41,6 @@ config PCI_TEGRA bool "NVIDIA Tegra PCIe controller" depends on ARCH_TEGRA || COMPILE_TEST depends on PCI_MSI_IRQ_DOMAIN - select PCI_MSI_ARCH_FALLBACKS help Say Y here if you want support for the PCIe host controller found on NVIDIA Tegra SoCs. diff --git a/drivers/pci/controller/pci-tegra.c b/drivers/pci/controller/pci-tegra.c index 8fcabed7c6a6..8069bd9232d4 100644 --- a/drivers/pci/controller/pci-tegra.c +++ b/drivers/pci/controller/pci-tegra.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -78,23 +79,8 @@ #define AFI_MSI_FPCI_BAR_ST 0x64 #define AFI_MSI_AXI_BAR_ST 0x68 -#define AFI_MSI_VEC0 0x6c -#define AFI_MSI_VEC1 0x70 -#define AFI_MSI_VEC2 0x74 -#define AFI_MSI_VEC3 0x78 -#define AFI_MSI_VEC4 0x7c -#define AFI_MSI_VEC5 0x80 -#define AFI_MSI_VEC6 0x84 -#define AFI_MSI_VEC7 0x88 - -#define AFI_MSI_EN_VEC0 0x8c -#define AFI_MSI_EN_VEC1 0x90 -#define AFI_MSI_EN_VEC2 0x94 -#define AFI_MSI_EN_VEC3 0x98 -#define AFI_MSI_EN_VEC4 0x9c -#define AFI_MSI_EN_VEC5 0xa0 -#define AFI_MSI_EN_VEC6 0xa4 -#define AFI_MSI_EN_VEC7 0xa8 +#define AFI_MSI_VEC(x) (0x6c + ((x) * 4)) +#define AFI_MSI_EN_VEC(x) (0x8c + ((x) * 4)) #define AFI_CONFIGURATION 0xac #define AFI_CONFIGURATION_EN_FPCI (1 << 0) @@ -280,10 +266,10 @@ #define LINK_RETRAIN_TIMEOUT 100000 /* in usec */ struct tegra_msi { - struct msi_controller chip; DECLARE_BITMAP(used, INT_PCI_MSI_NR); struct irq_domain *domain; - struct mutex lock; + struct mutex map_lock; + spinlock_t mask_lock; void *virt; dma_addr_t phys; int irq; @@ -333,11 +319,6 @@ struct tegra_pcie_soc { } ectl; }; -static inline struct tegra_msi *to_tegra_msi(struct msi_controller *chip) -{ - return container_of(chip, struct tegra_msi, chip); -} - struct tegra_pcie { struct device *dev; @@ -372,6 +353,11 @@ struct tegra_pcie { struct dentry *debugfs; }; +static inline struct tegra_pcie *msi_to_pcie(struct tegra_msi *msi) +{ + return container_of(msi, struct tegra_pcie, msi); +} + struct tegra_pcie_port { struct tegra_pcie *pcie; struct device_node *np; @@ -1432,7 +1418,6 @@ static void tegra_pcie_phys_put(struct tegra_pcie *pcie) } } - static int tegra_pcie_get_resources(struct tegra_pcie *pcie) { struct device *dev = pcie->dev; @@ -1509,6 +1494,7 @@ static int tegra_pcie_get_resources(struct tegra_pcie *pcie) phys_put: if (soc->program_uphy) tegra_pcie_phys_put(pcie); + return err; } @@ -1551,161 +1537,227 @@ static void tegra_pcie_pme_turnoff(struct tegra_pcie_port *port) afi_writel(pcie, val, AFI_PCIE_PME); } -static int tegra_msi_alloc(struct tegra_msi *chip) +static void tegra_pcie_msi_irq(struct irq_desc *desc) { - int msi; - - mutex_lock(&chip->lock); - - msi = find_first_zero_bit(chip->used, INT_PCI_MSI_NR); - if (msi < INT_PCI_MSI_NR) - set_bit(msi, chip->used); - else - msi = -ENOSPC; - - mutex_unlock(&chip->lock); - - return msi; -} - -static void tegra_msi_free(struct tegra_msi *chip, unsigned long irq) -{ - struct device *dev = chip->chip.dev; - - mutex_lock(&chip->lock); - - if (!test_bit(irq, chip->used)) - dev_err(dev, "trying to free unused MSI#%lu\n", irq); - else - clear_bit(irq, chip->used); - - mutex_unlock(&chip->lock); -} - -static irqreturn_t tegra_pcie_msi_irq(int irq, void *data) -{ - struct tegra_pcie *pcie = data; - struct device *dev = pcie->dev; + struct tegra_pcie *pcie = irq_desc_get_handler_data(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); struct tegra_msi *msi = &pcie->msi; - unsigned int i, processed = 0; + struct device *dev = pcie->dev; + unsigned int i; + + chained_irq_enter(chip, desc); for (i = 0; i < 8; i++) { - unsigned long reg = afi_readl(pcie, AFI_MSI_VEC0 + i * 4); + unsigned long reg = afi_readl(pcie, AFI_MSI_VEC(i)); while (reg) { unsigned int offset = find_first_bit(®, 32); unsigned int index = i * 32 + offset; unsigned int irq; - /* clear the interrupt */ - afi_writel(pcie, 1 << offset, AFI_MSI_VEC0 + i * 4); - - irq = irq_find_mapping(msi->domain, index); + irq = irq_find_mapping(msi->domain->parent, index); if (irq) { - if (test_bit(index, msi->used)) - generic_handle_irq(irq); - else - dev_info(dev, "unhandled MSI\n"); + generic_handle_irq(irq); } else { /* * that's weird who triggered this? * just clear it */ dev_info(dev, "unexpected MSI\n"); + afi_writel(pcie, BIT(index % 32), AFI_MSI_VEC(index)); } /* see if there's any more pending in this vector */ - reg = afi_readl(pcie, AFI_MSI_VEC0 + i * 4); - - processed++; + reg = afi_readl(pcie, AFI_MSI_VEC(i)); } } - return processed > 0 ? IRQ_HANDLED : IRQ_NONE; + chained_irq_exit(chip, desc); } -static int tegra_msi_setup_irq(struct msi_controller *chip, - struct pci_dev *pdev, struct msi_desc *desc) +static void tegra_msi_top_irq_ack(struct irq_data *d) { - struct tegra_msi *msi = to_tegra_msi(chip); - struct msi_msg msg; - unsigned int irq; - int hwirq; - - hwirq = tegra_msi_alloc(msi); - if (hwirq < 0) - return hwirq; - - irq = irq_create_mapping(msi->domain, hwirq); - if (!irq) { - tegra_msi_free(msi, hwirq); - return -EINVAL; - } - - irq_set_msi_desc(irq, desc); - - msg.address_lo = lower_32_bits(msi->phys); - msg.address_hi = upper_32_bits(msi->phys); - msg.data = hwirq; - - pci_write_msi_msg(irq, &msg); - - return 0; + irq_chip_ack_parent(d); } -static void tegra_msi_teardown_irq(struct msi_controller *chip, - unsigned int irq) +static void tegra_msi_top_irq_mask(struct irq_data *d) { - struct tegra_msi *msi = to_tegra_msi(chip); - struct irq_data *d = irq_get_irq_data(irq); - irq_hw_number_t hwirq = irqd_to_hwirq(d); - - irq_dispose_mapping(irq); - tegra_msi_free(msi, hwirq); + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); } -static struct irq_chip tegra_msi_irq_chip = { - .name = "Tegra PCIe MSI", - .irq_enable = pci_msi_unmask_irq, - .irq_disable = pci_msi_mask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_unmask = pci_msi_unmask_irq, +static void tegra_msi_top_irq_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip tegra_msi_top_chip = { + .name = "Tegra PCIe MSI", + .irq_ack = tegra_msi_top_irq_ack, + .irq_mask = tegra_msi_top_irq_mask, + .irq_unmask = tegra_msi_top_irq_unmask, }; -static int tegra_msi_map(struct irq_domain *domain, unsigned int irq, - irq_hw_number_t hwirq) +static void tegra_msi_irq_ack(struct irq_data *d) { - irq_set_chip_and_handler(irq, &tegra_msi_irq_chip, handle_simple_irq); - irq_set_chip_data(irq, domain->host_data); + struct tegra_msi *msi = irq_data_get_irq_chip_data(d); + struct tegra_pcie *pcie = msi_to_pcie(msi); + unsigned int index = d->hwirq / 32; + + /* clear the interrupt */ + afi_writel(pcie, BIT(d->hwirq % 32), AFI_MSI_VEC(index)); +} + +static void tegra_msi_irq_mask(struct irq_data *d) +{ + struct tegra_msi *msi = irq_data_get_irq_chip_data(d); + struct tegra_pcie *pcie = msi_to_pcie(msi); + unsigned int index = d->hwirq / 32; + unsigned long flags; + u32 value; + + spin_lock_irqsave(&msi->mask_lock, flags); + value = afi_readl(pcie, AFI_MSI_EN_VEC(index)); + value &= ~BIT(d->hwirq % 32); + afi_writel(pcie, value, AFI_MSI_EN_VEC(index)); + spin_unlock_irqrestore(&msi->mask_lock, flags); +} + +static void tegra_msi_irq_unmask(struct irq_data *d) +{ + struct tegra_msi *msi = irq_data_get_irq_chip_data(d); + struct tegra_pcie *pcie = msi_to_pcie(msi); + unsigned int index = d->hwirq / 32; + unsigned long flags; + u32 value; + + spin_lock_irqsave(&msi->mask_lock, flags); + value = afi_readl(pcie, AFI_MSI_EN_VEC(index)); + value |= BIT(d->hwirq % 32); + afi_writel(pcie, value, AFI_MSI_EN_VEC(index)); + spin_unlock_irqrestore(&msi->mask_lock, flags); +} + +static int tegra_msi_set_affinity(struct irq_data *d, const struct cpumask *mask, bool force) +{ + return -EINVAL; +} + +static void tegra_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct tegra_msi *msi = irq_data_get_irq_chip_data(data); + + msg->address_lo = lower_32_bits(msi->phys); + msg->address_hi = upper_32_bits(msi->phys); + msg->data = data->hwirq; +} + +static struct irq_chip tegra_msi_bottom_chip = { + .name = "Tegra MSI", + .irq_ack = tegra_msi_irq_ack, + .irq_mask = tegra_msi_irq_mask, + .irq_unmask = tegra_msi_irq_unmask, + .irq_set_affinity = tegra_msi_set_affinity, + .irq_compose_msi_msg = tegra_compose_msi_msg, +}; + +static int tegra_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct tegra_msi *msi = domain->host_data; + unsigned int i; + int hwirq; + + mutex_lock(&msi->map_lock); + + hwirq = bitmap_find_free_region(msi->used, INT_PCI_MSI_NR, order_base_2(nr_irqs)); + + mutex_unlock(&msi->map_lock); + + if (hwirq < 0) + return -ENOSPC; + + for (i = 0; i < nr_irqs; i++) + irq_domain_set_info(domain, virq + i, hwirq + i, + &tegra_msi_bottom_chip, domain->host_data, + handle_edge_irq, NULL, NULL); tegra_cpuidle_pcie_irqs_in_use(); return 0; } -static const struct irq_domain_ops msi_domain_ops = { - .map = tegra_msi_map, +static void tegra_msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct tegra_msi *msi = domain->host_data; + + mutex_lock(&msi->map_lock); + + bitmap_release_region(msi->used, d->hwirq, order_base_2(nr_irqs)); + + mutex_unlock(&msi->map_lock); +} + +static const struct irq_domain_ops tegra_msi_domain_ops = { + .alloc = tegra_msi_domain_alloc, + .free = tegra_msi_domain_free, }; +static struct msi_domain_info tegra_msi_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX), + .chip = &tegra_msi_top_chip, +}; + +static int tegra_allocate_domains(struct tegra_msi *msi) +{ + struct tegra_pcie *pcie = msi_to_pcie(msi); + struct fwnode_handle *fwnode = dev_fwnode(pcie->dev); + struct irq_domain *parent; + + parent = irq_domain_create_linear(fwnode, INT_PCI_MSI_NR, + &tegra_msi_domain_ops, msi); + if (!parent) { + dev_err(pcie->dev, "failed to create IRQ domain\n"); + return -ENOMEM; + } + irq_domain_update_bus_token(parent, DOMAIN_BUS_NEXUS); + + msi->domain = pci_msi_create_irq_domain(fwnode, &tegra_msi_info, parent); + if (!msi->domain) { + dev_err(pcie->dev, "failed to create MSI domain\n"); + irq_domain_remove(parent); + return -ENOMEM; + } + + return 0; +} + +static void tegra_free_domains(struct tegra_msi *msi) +{ + struct irq_domain *parent = msi->domain->parent; + + irq_domain_remove(msi->domain); + irq_domain_remove(parent); +} + static int tegra_pcie_msi_setup(struct tegra_pcie *pcie) { - struct pci_host_bridge *host = pci_host_bridge_from_priv(pcie); struct platform_device *pdev = to_platform_device(pcie->dev); struct tegra_msi *msi = &pcie->msi; struct device *dev = pcie->dev; int err; - mutex_init(&msi->lock); + mutex_init(&msi->map_lock); + spin_lock_init(&msi->mask_lock); - msi->chip.dev = dev; - msi->chip.setup_irq = tegra_msi_setup_irq; - msi->chip.teardown_irq = tegra_msi_teardown_irq; - - msi->domain = irq_domain_add_linear(dev->of_node, INT_PCI_MSI_NR, - &msi_domain_ops, &msi->chip); - if (!msi->domain) { - dev_err(dev, "failed to create IRQ domain\n"); - return -ENOMEM; + if (IS_ENABLED(CONFIG_PCI_MSI)) { + err = tegra_allocate_domains(msi); + if (err) + return err; } err = platform_get_irq_byname(pdev, "msi"); @@ -1714,12 +1766,7 @@ static int tegra_pcie_msi_setup(struct tegra_pcie *pcie) msi->irq = err; - err = request_irq(msi->irq, tegra_pcie_msi_irq, IRQF_NO_THREAD, - tegra_msi_irq_chip.name, pcie); - if (err < 0) { - dev_err(dev, "failed to request IRQ: %d\n", err); - goto free_irq_domain; - } + irq_set_chained_handler_and_data(msi->irq, tegra_pcie_msi_irq, pcie); /* Though the PCIe controller can address >32-bit address space, to * facilitate endpoints that support only 32-bit MSI target address, @@ -1740,14 +1787,14 @@ static int tegra_pcie_msi_setup(struct tegra_pcie *pcie) goto free_irq; } - host->msi = &msi->chip; - return 0; free_irq: - free_irq(msi->irq, pcie); + irq_set_chained_handler_and_data(msi->irq, NULL, NULL); free_irq_domain: - irq_domain_remove(msi->domain); + if (IS_ENABLED(CONFIG_PCI_MSI)) + tegra_free_domains(msi); + return err; } @@ -1755,22 +1802,18 @@ static void tegra_pcie_enable_msi(struct tegra_pcie *pcie) { const struct tegra_pcie_soc *soc = pcie->soc; struct tegra_msi *msi = &pcie->msi; - u32 reg; + u32 reg, msi_state[INT_PCI_MSI_NR / 32]; + int i; afi_writel(pcie, msi->phys >> soc->msi_base_shift, AFI_MSI_FPCI_BAR_ST); afi_writel(pcie, msi->phys, AFI_MSI_AXI_BAR_ST); /* this register is in 4K increments */ afi_writel(pcie, 1, AFI_MSI_BAR_SZ); - /* enable all MSI vectors */ - afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC0); - afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC1); - afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC2); - afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC3); - afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC4); - afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC5); - afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC6); - afi_writel(pcie, 0xffffffff, AFI_MSI_EN_VEC7); + /* Restore the MSI allocation state */ + bitmap_to_arr32(msi_state, msi->used, INT_PCI_MSI_NR); + for (i = 0; i < ARRAY_SIZE(msi_state); i++) + afi_writel(pcie, msi_state[i], AFI_MSI_EN_VEC(i)); /* and unmask the MSI interrupt */ reg = afi_readl(pcie, AFI_INTR_MASK); @@ -1786,16 +1829,16 @@ static void tegra_pcie_msi_teardown(struct tegra_pcie *pcie) dma_free_attrs(pcie->dev, PAGE_SIZE, msi->virt, msi->phys, DMA_ATTR_NO_KERNEL_MAPPING); - if (msi->irq > 0) - free_irq(msi->irq, pcie); - for (i = 0; i < INT_PCI_MSI_NR; i++) { irq = irq_find_mapping(msi->domain, i); if (irq > 0) - irq_dispose_mapping(irq); + irq_domain_free_irqs(irq, 1); } - irq_domain_remove(msi->domain); + irq_set_chained_handler_and_data(msi->irq, NULL, NULL); + + if (IS_ENABLED(CONFIG_PCI_MSI)) + tegra_free_domains(msi); } static int tegra_pcie_disable_msi(struct tegra_pcie *pcie) @@ -1807,16 +1850,6 @@ static int tegra_pcie_disable_msi(struct tegra_pcie *pcie) value &= ~AFI_INTR_MASK_MSI_MASK; afi_writel(pcie, value, AFI_INTR_MASK); - /* disable all MSI vectors */ - afi_writel(pcie, 0, AFI_MSI_EN_VEC0); - afi_writel(pcie, 0, AFI_MSI_EN_VEC1); - afi_writel(pcie, 0, AFI_MSI_EN_VEC2); - afi_writel(pcie, 0, AFI_MSI_EN_VEC3); - afi_writel(pcie, 0, AFI_MSI_EN_VEC4); - afi_writel(pcie, 0, AFI_MSI_EN_VEC5); - afi_writel(pcie, 0, AFI_MSI_EN_VEC6); - afi_writel(pcie, 0, AFI_MSI_EN_VEC7); - return 0; } From 93cd1bb4862d71298ad5ec86991eac0a119d024d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:33 +0100 Subject: [PATCH 0528/1379] PCI: rcar: Don't allocate extra memory for the MSI capture address A long cargo-culted behaviour of PCI drivers is to allocate memory to obtain an address that is fed to the controller as the MSI capture address (i.e. the MSI doorbell). But there is no actual requirement for this address to be RAM. All it needs to be is a suitable aligned address that will *not* be DMA'd to. Since the rcar platform already has a requirement that this address should be in the first 4GB of the physical address space, use the controller's own base address as the capture address. Link: https://lore.kernel.org/r/20210330151145.997953-3-maz@kernel.org Tested-by: Yoshihiro Shimoda Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Reviewed-by: Yoshihiro Shimoda --- drivers/pci/controller/pcie-rcar-host.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/pci/controller/pcie-rcar-host.c b/drivers/pci/controller/pcie-rcar-host.c index a728e8f9ad3c..ce952403e22c 100644 --- a/drivers/pci/controller/pcie-rcar-host.c +++ b/drivers/pci/controller/pcie-rcar-host.c @@ -36,7 +36,6 @@ struct rcar_msi { DECLARE_BITMAP(used, INT_PCI_MSI_NR); struct irq_domain *domain; struct msi_controller chip; - unsigned long pages; struct mutex lock; int irq1; int irq2; @@ -680,14 +679,15 @@ static void rcar_pcie_unmap_msi(struct rcar_pcie_host *host) static void rcar_pcie_hw_enable_msi(struct rcar_pcie_host *host) { struct rcar_pcie *pcie = &host->pcie; - struct rcar_msi *msi = &host->msi; - unsigned long base; + struct device *dev = pcie->dev; + struct resource res; + + if (WARN_ON(of_address_to_resource(dev->of_node, 0, &res))) + return; /* setup MSI data target */ - base = virt_to_phys((void *)msi->pages); - - rcar_pci_write_reg(pcie, lower_32_bits(base) | MSIFE, PCIEMSIALR); - rcar_pci_write_reg(pcie, upper_32_bits(base), PCIEMSIAUR); + rcar_pci_write_reg(pcie, lower_32_bits(res.start) | MSIFE, PCIEMSIALR); + rcar_pci_write_reg(pcie, upper_32_bits(res.start), PCIEMSIAUR); /* enable all MSI interrupts */ rcar_pci_write_reg(pcie, 0xffffffff, PCIEMSIIER); @@ -735,7 +735,6 @@ static int rcar_pcie_enable_msi(struct rcar_pcie_host *host) } /* setup MSI data target */ - msi->pages = __get_free_pages(GFP_KERNEL | GFP_DMA32, 0); rcar_pcie_hw_enable_msi(host); return 0; @@ -748,7 +747,6 @@ err: static void rcar_pcie_teardown_msi(struct rcar_pcie_host *host) { struct rcar_pcie *pcie = &host->pcie; - struct rcar_msi *msi = &host->msi; /* Disable all MSI interrupts */ rcar_pci_write_reg(pcie, 0, PCIEMSIIER); @@ -756,8 +754,6 @@ static void rcar_pcie_teardown_msi(struct rcar_pcie_host *host) /* Disable address decoding of the MSI interrupt, MSIFE */ rcar_pci_write_reg(pcie, 0, PCIEMSIALR); - free_pages(msi->pages, 0); - rcar_pcie_unmap_msi(host); } From 83ed8d4fa656d37d17bb83203485e3f7a2360e7a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:34 +0100 Subject: [PATCH 0529/1379] PCI: rcar: Convert to MSI domains In anticipation of the removal of the msi_controller structure, convert the Rcar host controller driver to MSI domains. We end-up with the usual two domain structure, the top one being a generic PCI/MSI domain, the bottom one being Rcar-specific and handling the actual HW interrupt allocation. Link: https://lore.kernel.org/r/20210330151145.997953-4-maz@kernel.org Tested-by: Marek Vasut Signed-off-by: Marc Zyngier [lorenzo.pieralisi@arm.com: merged fix https://lore.kernel.org/linux-pci/87y2e2p9wk.wl-maz@kernel.org] Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/controller/Kconfig | 1 - drivers/pci/controller/pcie-rcar-host.c | 369 ++++++++++++------------ 2 files changed, 178 insertions(+), 192 deletions(-) diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index be8f9ff512a0..5cc07d28a3a0 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -58,7 +58,6 @@ config PCIE_RCAR_HOST bool "Renesas R-Car PCIe host controller" depends on ARCH_RENESAS || COMPILE_TEST depends on PCI_MSI_IRQ_DOMAIN - select PCI_MSI_ARCH_FALLBACKS help Say Y here if you want PCIe controller support on R-Car SoCs in host mode. diff --git a/drivers/pci/controller/pcie-rcar-host.c b/drivers/pci/controller/pcie-rcar-host.c index ce952403e22c..765cf2b45e24 100644 --- a/drivers/pci/controller/pcie-rcar-host.c +++ b/drivers/pci/controller/pcie-rcar-host.c @@ -35,17 +35,12 @@ struct rcar_msi { DECLARE_BITMAP(used, INT_PCI_MSI_NR); struct irq_domain *domain; - struct msi_controller chip; - struct mutex lock; + struct mutex map_lock; + spinlock_t mask_lock; int irq1; int irq2; }; -static inline struct rcar_msi *to_rcar_msi(struct msi_controller *chip) -{ - return container_of(chip, struct rcar_msi, chip); -} - /* Structure representing the PCIe interface */ struct rcar_pcie_host { struct rcar_pcie pcie; @@ -55,6 +50,11 @@ struct rcar_pcie_host { int (*phy_init_fn)(struct rcar_pcie_host *host); }; +static struct rcar_pcie_host *msi_to_host(struct rcar_msi *msi) +{ + return container_of(msi, struct rcar_pcie_host, msi); +} + static u32 rcar_read_conf(struct rcar_pcie *pcie, int where) { unsigned int shift = BITS_PER_BYTE * (where & 3); @@ -291,8 +291,6 @@ static int rcar_pcie_enable(struct rcar_pcie_host *host) bridge->sysdata = host; bridge->ops = &rcar_pcie_ops; - if (IS_ENABLED(CONFIG_PCI_MSI)) - bridge->msi = &host->msi.chip; return pci_host_probe(bridge); } @@ -472,42 +470,6 @@ static int rcar_pcie_phy_init_gen3(struct rcar_pcie_host *host) return err; } -static int rcar_msi_alloc(struct rcar_msi *chip) -{ - int msi; - - mutex_lock(&chip->lock); - - msi = find_first_zero_bit(chip->used, INT_PCI_MSI_NR); - if (msi < INT_PCI_MSI_NR) - set_bit(msi, chip->used); - else - msi = -ENOSPC; - - mutex_unlock(&chip->lock); - - return msi; -} - -static int rcar_msi_alloc_region(struct rcar_msi *chip, int no_irqs) -{ - int msi; - - mutex_lock(&chip->lock); - msi = bitmap_find_free_region(chip->used, INT_PCI_MSI_NR, - order_base_2(no_irqs)); - mutex_unlock(&chip->lock); - - return msi; -} - -static void rcar_msi_free(struct rcar_msi *chip, unsigned long irq) -{ - mutex_lock(&chip->lock); - clear_bit(irq, chip->used); - mutex_unlock(&chip->lock); -} - static irqreturn_t rcar_pcie_msi_irq(int irq, void *data) { struct rcar_pcie_host *host = data; @@ -526,18 +488,13 @@ static irqreturn_t rcar_pcie_msi_irq(int irq, void *data) unsigned int index = find_first_bit(®, 32); unsigned int msi_irq; - /* clear the interrupt */ - rcar_pci_write_reg(pcie, 1 << index, PCIEMSIFR); - - msi_irq = irq_find_mapping(msi->domain, index); + msi_irq = irq_find_mapping(msi->domain->parent, index); if (msi_irq) { - if (test_bit(index, msi->used)) - generic_handle_irq(msi_irq); - else - dev_info(dev, "unhandled MSI\n"); + generic_handle_irq(msi_irq); } else { /* Unknown MSI, just clear it */ dev_dbg(dev, "unexpected MSI\n"); + rcar_pci_write_reg(pcie, BIT(index), PCIEMSIFR); } /* see if there's any more pending in this vector */ @@ -547,150 +504,169 @@ static irqreturn_t rcar_pcie_msi_irq(int irq, void *data) return IRQ_HANDLED; } -static int rcar_msi_setup_irq(struct msi_controller *chip, struct pci_dev *pdev, - struct msi_desc *desc) +static void rcar_msi_top_irq_ack(struct irq_data *d) { - struct rcar_msi *msi = to_rcar_msi(chip); - struct rcar_pcie_host *host = container_of(chip, struct rcar_pcie_host, - msi.chip); - struct rcar_pcie *pcie = &host->pcie; - struct msi_msg msg; - unsigned int irq; - int hwirq; - - hwirq = rcar_msi_alloc(msi); - if (hwirq < 0) - return hwirq; - - irq = irq_find_mapping(msi->domain, hwirq); - if (!irq) { - rcar_msi_free(msi, hwirq); - return -EINVAL; - } - - irq_set_msi_desc(irq, desc); - - msg.address_lo = rcar_pci_read_reg(pcie, PCIEMSIALR) & ~MSIFE; - msg.address_hi = rcar_pci_read_reg(pcie, PCIEMSIAUR); - msg.data = hwirq; - - pci_write_msi_msg(irq, &msg); - - return 0; + irq_chip_ack_parent(d); } -static int rcar_msi_setup_irqs(struct msi_controller *chip, - struct pci_dev *pdev, int nvec, int type) +static void rcar_msi_top_irq_mask(struct irq_data *d) { - struct rcar_msi *msi = to_rcar_msi(chip); - struct rcar_pcie_host *host = container_of(chip, struct rcar_pcie_host, - msi.chip); - struct rcar_pcie *pcie = &host->pcie; - struct msi_desc *desc; - struct msi_msg msg; - unsigned int irq; + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void rcar_msi_top_irq_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip rcar_msi_top_chip = { + .name = "PCIe MSI", + .irq_ack = rcar_msi_top_irq_ack, + .irq_mask = rcar_msi_top_irq_mask, + .irq_unmask = rcar_msi_top_irq_unmask, +}; + +static void rcar_msi_irq_ack(struct irq_data *d) +{ + struct rcar_msi *msi = irq_data_get_irq_chip_data(d); + struct rcar_pcie *pcie = &msi_to_host(msi)->pcie; + + /* clear the interrupt */ + rcar_pci_write_reg(pcie, BIT(d->hwirq), PCIEMSIFR); +} + +static void rcar_msi_irq_mask(struct irq_data *d) +{ + struct rcar_msi *msi = irq_data_get_irq_chip_data(d); + struct rcar_pcie *pcie = &msi_to_host(msi)->pcie; + unsigned long flags; + u32 value; + + spin_lock_irqsave(&msi->mask_lock, flags); + value = rcar_pci_read_reg(pcie, PCIEMSIIER); + value &= ~BIT(d->hwirq); + rcar_pci_write_reg(pcie, value, PCIEMSIIER); + spin_unlock_irqrestore(&msi->mask_lock, flags); +} + +static void rcar_msi_irq_unmask(struct irq_data *d) +{ + struct rcar_msi *msi = irq_data_get_irq_chip_data(d); + struct rcar_pcie *pcie = &msi_to_host(msi)->pcie; + unsigned long flags; + u32 value; + + spin_lock_irqsave(&msi->mask_lock, flags); + value = rcar_pci_read_reg(pcie, PCIEMSIIER); + value |= BIT(d->hwirq); + rcar_pci_write_reg(pcie, value, PCIEMSIIER); + spin_unlock_irqrestore(&msi->mask_lock, flags); +} + +static int rcar_msi_set_affinity(struct irq_data *d, const struct cpumask *mask, bool force) +{ + return -EINVAL; +} + +static void rcar_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct rcar_msi *msi = irq_data_get_irq_chip_data(data); + struct rcar_pcie *pcie = &msi_to_host(msi)->pcie; + + msg->address_lo = rcar_pci_read_reg(pcie, PCIEMSIALR) & ~MSIFE; + msg->address_hi = rcar_pci_read_reg(pcie, PCIEMSIAUR); + msg->data = data->hwirq; +} + +static struct irq_chip rcar_msi_bottom_chip = { + .name = "Rcar MSI", + .irq_ack = rcar_msi_irq_ack, + .irq_mask = rcar_msi_irq_mask, + .irq_unmask = rcar_msi_irq_unmask, + .irq_set_affinity = rcar_msi_set_affinity, + .irq_compose_msi_msg = rcar_compose_msi_msg, +}; + +static int rcar_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct rcar_msi *msi = domain->host_data; + unsigned int i; int hwirq; - int i; - /* MSI-X interrupts are not supported */ - if (type == PCI_CAP_ID_MSIX) - return -EINVAL; + mutex_lock(&msi->map_lock); - WARN_ON(!list_is_singular(&pdev->dev.msi_list)); - desc = list_entry(pdev->dev.msi_list.next, struct msi_desc, list); + hwirq = bitmap_find_free_region(msi->used, INT_PCI_MSI_NR, order_base_2(nr_irqs)); + + mutex_unlock(&msi->map_lock); - hwirq = rcar_msi_alloc_region(msi, nvec); if (hwirq < 0) return -ENOSPC; - irq = irq_find_mapping(msi->domain, hwirq); - if (!irq) - return -ENOSPC; - - for (i = 0; i < nvec; i++) { - /* - * irq_create_mapping() called from rcar_pcie_probe() pre- - * allocates descs, so there is no need to allocate descs here. - * We can therefore assume that if irq_find_mapping() above - * returns non-zero, then the descs are also successfully - * allocated. - */ - if (irq_set_msi_desc_off(irq, i, desc)) { - /* TODO: clear */ - return -EINVAL; - } - } - - desc->nvec_used = nvec; - desc->msi_attrib.multiple = order_base_2(nvec); - - msg.address_lo = rcar_pci_read_reg(pcie, PCIEMSIALR) & ~MSIFE; - msg.address_hi = rcar_pci_read_reg(pcie, PCIEMSIAUR); - msg.data = hwirq; - - pci_write_msi_msg(irq, &msg); + for (i = 0; i < nr_irqs; i++) + irq_domain_set_info(domain, virq + i, hwirq + i, + &rcar_msi_bottom_chip, domain->host_data, + handle_edge_irq, NULL, NULL); return 0; } -static void rcar_msi_teardown_irq(struct msi_controller *chip, unsigned int irq) +static void rcar_msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - struct rcar_msi *msi = to_rcar_msi(chip); - struct irq_data *d = irq_get_irq_data(irq); + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct rcar_msi *msi = domain->host_data; - rcar_msi_free(msi, d->hwirq); + mutex_lock(&msi->map_lock); + + bitmap_release_region(msi->used, d->hwirq, order_base_2(nr_irqs)); + + mutex_unlock(&msi->map_lock); } -static struct irq_chip rcar_msi_irq_chip = { - .name = "R-Car PCIe MSI", - .irq_enable = pci_msi_unmask_irq, - .irq_disable = pci_msi_mask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_unmask = pci_msi_unmask_irq, +static const struct irq_domain_ops rcar_msi_domain_ops = { + .alloc = rcar_msi_domain_alloc, + .free = rcar_msi_domain_free, }; -static int rcar_msi_map(struct irq_domain *domain, unsigned int irq, - irq_hw_number_t hwirq) +static struct msi_domain_info rcar_msi_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI), + .chip = &rcar_msi_top_chip, +}; + +static int rcar_allocate_domains(struct rcar_msi *msi) { - irq_set_chip_and_handler(irq, &rcar_msi_irq_chip, handle_simple_irq); - irq_set_chip_data(irq, domain->host_data); + struct rcar_pcie *pcie = &msi_to_host(msi)->pcie; + struct fwnode_handle *fwnode = dev_fwnode(pcie->dev); + struct irq_domain *parent; + + parent = irq_domain_create_linear(fwnode, INT_PCI_MSI_NR, + &rcar_msi_domain_ops, msi); + if (!parent) { + dev_err(pcie->dev, "failed to create IRQ domain\n"); + return -ENOMEM; + } + irq_domain_update_bus_token(parent, DOMAIN_BUS_NEXUS); + + msi->domain = pci_msi_create_irq_domain(fwnode, &rcar_msi_info, parent); + if (!msi->domain) { + dev_err(pcie->dev, "failed to create MSI domain\n"); + irq_domain_remove(parent); + return -ENOMEM; + } return 0; } -static const struct irq_domain_ops msi_domain_ops = { - .map = rcar_msi_map, -}; - -static void rcar_pcie_unmap_msi(struct rcar_pcie_host *host) +static void rcar_free_domains(struct rcar_msi *msi) { - struct rcar_msi *msi = &host->msi; - int i, irq; - - for (i = 0; i < INT_PCI_MSI_NR; i++) { - irq = irq_find_mapping(msi->domain, i); - if (irq > 0) - irq_dispose_mapping(irq); - } + struct irq_domain *parent = msi->domain->parent; irq_domain_remove(msi->domain); -} - -static void rcar_pcie_hw_enable_msi(struct rcar_pcie_host *host) -{ - struct rcar_pcie *pcie = &host->pcie; - struct device *dev = pcie->dev; - struct resource res; - - if (WARN_ON(of_address_to_resource(dev->of_node, 0, &res))) - return; - - /* setup MSI data target */ - rcar_pci_write_reg(pcie, lower_32_bits(res.start) | MSIFE, PCIEMSIALR); - rcar_pci_write_reg(pcie, upper_32_bits(res.start), PCIEMSIAUR); - - /* enable all MSI interrupts */ - rcar_pci_write_reg(pcie, 0xffffffff, PCIEMSIIER); + irq_domain_remove(parent); } static int rcar_pcie_enable_msi(struct rcar_pcie_host *host) @@ -698,29 +674,24 @@ static int rcar_pcie_enable_msi(struct rcar_pcie_host *host) struct rcar_pcie *pcie = &host->pcie; struct device *dev = pcie->dev; struct rcar_msi *msi = &host->msi; - int err, i; + struct resource res; + int err; - mutex_init(&msi->lock); + mutex_init(&msi->map_lock); + spin_lock_init(&msi->mask_lock); - msi->chip.dev = dev; - msi->chip.setup_irq = rcar_msi_setup_irq; - msi->chip.setup_irqs = rcar_msi_setup_irqs; - msi->chip.teardown_irq = rcar_msi_teardown_irq; + err = of_address_to_resource(dev->of_node, 0, &res); + if (err) + return err; - msi->domain = irq_domain_add_linear(dev->of_node, INT_PCI_MSI_NR, - &msi_domain_ops, &msi->chip); - if (!msi->domain) { - dev_err(dev, "failed to create IRQ domain\n"); - return -ENOMEM; - } - - for (i = 0; i < INT_PCI_MSI_NR; i++) - irq_create_mapping(msi->domain, i); + err = rcar_allocate_domains(msi); + if (err) + return err; /* Two irqs are for MSI, but they are also used for non-MSI irqs */ err = devm_request_irq(dev, msi->irq1, rcar_pcie_msi_irq, IRQF_SHARED | IRQF_NO_THREAD, - rcar_msi_irq_chip.name, host); + rcar_msi_bottom_chip.name, host); if (err < 0) { dev_err(dev, "failed to request IRQ: %d\n", err); goto err; @@ -728,19 +699,26 @@ static int rcar_pcie_enable_msi(struct rcar_pcie_host *host) err = devm_request_irq(dev, msi->irq2, rcar_pcie_msi_irq, IRQF_SHARED | IRQF_NO_THREAD, - rcar_msi_irq_chip.name, host); + rcar_msi_bottom_chip.name, host); if (err < 0) { dev_err(dev, "failed to request IRQ: %d\n", err); goto err; } - /* setup MSI data target */ - rcar_pcie_hw_enable_msi(host); + /* disable all MSIs */ + rcar_pci_write_reg(pcie, 0, PCIEMSIIER); + + /* + * Setup MSI data target using RC base address address, which + * is guaranteed to be in the low 32bit range on any RCar HW. + */ + rcar_pci_write_reg(pcie, lower_32_bits(res.start) | MSIFE, PCIEMSIALR); + rcar_pci_write_reg(pcie, upper_32_bits(res.start), PCIEMSIAUR); return 0; err: - rcar_pcie_unmap_msi(host); + rcar_free_domains(msi); return err; } @@ -754,7 +732,7 @@ static void rcar_pcie_teardown_msi(struct rcar_pcie_host *host) /* Disable address decoding of the MSI interrupt, MSIFE */ rcar_pci_write_reg(pcie, 0, PCIEMSIALR); - rcar_pcie_unmap_msi(host); + rcar_free_domains(&host->msi); } static int rcar_pcie_get_resources(struct rcar_pcie_host *host) @@ -1007,8 +985,17 @@ static int __maybe_unused rcar_pcie_resume(struct device *dev) dev_info(dev, "PCIe x%d: link up\n", (data >> 20) & 0x3f); /* Enable MSI */ - if (IS_ENABLED(CONFIG_PCI_MSI)) - rcar_pcie_hw_enable_msi(host); + if (IS_ENABLED(CONFIG_PCI_MSI)) { + struct resource res; + u32 val; + + of_address_to_resource(dev->of_node, 0, &res); + rcar_pci_write_reg(pcie, upper_32_bits(res.start), PCIEMSIAUR); + rcar_pci_write_reg(pcie, lower_32_bits(res.start) | MSIFE, PCIEMSIALR); + + bitmap_to_arr32(&val, host->msi.used, INT_PCI_MSI_NR); + rcar_pci_write_reg(pcie, val, PCIEMSIIER); + } rcar_pcie_hw_enable(host); From 161260e7f7bc58d6a0972eb41a6072e82d0b58a5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:35 +0100 Subject: [PATCH 0530/1379] PCI: xilinx: Don't allocate extra memory for the MSI capture address A long cargo-culted behaviour of PCI drivers is to allocate memory to obtain an address that is fed to the controller as the MSI capture address (i.e. the MSI doorbell). But there is no actual requirement for this address to be RAM. All it needs to be is a suitable aligned address that will *not* be DMA'd to. Use the physical address of the 'port' data structure as the MSI capture address, aligned on a 4K boundary. Link: https://lore.kernel.org/r/20210330151145.997953-5-maz@kernel.org Tested-by: Bharat Kumar Gogada Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/pcie-xilinx.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/pci/controller/pcie-xilinx.c b/drivers/pci/controller/pcie-xilinx.c index fa5baeb82653..e127a7b6e535 100644 --- a/drivers/pci/controller/pcie-xilinx.c +++ b/drivers/pci/controller/pcie-xilinx.c @@ -94,7 +94,6 @@ * struct xilinx_pcie_port - PCIe port information * @reg_base: IO Mapped Register Base * @irq: Interrupt number - * @msi_pages: MSI pages * @dev: Device pointer * @msi_domain: MSI IRQ domain pointer * @leg_domain: Legacy IRQ domain pointer @@ -103,7 +102,6 @@ struct xilinx_pcie_port { void __iomem *reg_base; u32 irq; - unsigned long msi_pages; struct device *dev; struct irq_domain *msi_domain; struct irq_domain *leg_domain; @@ -274,10 +272,10 @@ static int xilinx_pcie_msi_setup_irq(struct msi_controller *chip, irq_set_msi_desc(irq, desc); - msg_addr = virt_to_phys((void *)port->msi_pages); + msg_addr = ALIGN_DOWN(virt_to_phys(port), SZ_4K); - msg.address_hi = 0; - msg.address_lo = msg_addr; + msg.address_hi = upper_32_bits(msg_addr); + msg.address_lo = lower_32_bits(msg_addr); msg.data = irq; pci_write_msi_msg(irq, &msg); @@ -330,13 +328,9 @@ static int xilinx_pcie_enable_msi(struct xilinx_pcie_port *port) { phys_addr_t msg_addr; - port->msi_pages = __get_free_pages(GFP_KERNEL, 0); - if (!port->msi_pages) - return -ENOMEM; - - msg_addr = virt_to_phys((void *)port->msi_pages); - pcie_write(port, 0x0, XILINX_PCIE_REG_MSIBASE1); - pcie_write(port, msg_addr, XILINX_PCIE_REG_MSIBASE2); + msg_addr = ALIGN_DOWN(virt_to_phys(port), SZ_4K); + pcie_write(port, upper_32_bits(msg_addr), XILINX_PCIE_REG_MSIBASE1); + pcie_write(port, lower_32_bits(msg_addr), XILINX_PCIE_REG_MSIBASE2); return 0; } From 313b64c3ae52bc8e953319077204cf1d286a8a99 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:36 +0100 Subject: [PATCH 0531/1379] PCI: xilinx: Convert to MSI domains In anticipation of the removal of the msi_controller structure, convert the ancient xilinx host controller driver to MSI domains. We end-up with the usual two domain structure, the top one being a generic PCI/MSI domain, the bottom one being xilinx-specific and handling the actual HW interrupt allocation. This allows us to fix some of the most appaling MSI programming, where the message programmed in the device is the virtual IRQ number instead of the allocated vector number. The allocator is also made safe with a mutex. This should allow support for MultiMSI, but I decided not to even try, since I cannot test it. Link: https://lore.kernel.org/r/20210330151145.997953-6-maz@kernel.org Tested-by: Bharat Kumar Gogada Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/controller/Kconfig | 2 +- drivers/pci/controller/pcie-xilinx.c | 270 ++++++++++++--------------- 2 files changed, 120 insertions(+), 152 deletions(-) diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 5cc07d28a3a0..60045f7aafc5 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -86,7 +86,7 @@ config PCI_HOST_GENERIC config PCIE_XILINX bool "Xilinx AXI PCIe host bridge support" depends on OF || COMPILE_TEST - select PCI_MSI_ARCH_FALLBACKS + depends on PCI_MSI_IRQ_DOMAIN help Say 'Y' here if you want kernel to support the Xilinx AXI PCIe Host Bridge driver. diff --git a/drivers/pci/controller/pcie-xilinx.c b/drivers/pci/controller/pcie-xilinx.c index e127a7b6e535..14001febf59a 100644 --- a/drivers/pci/controller/pcie-xilinx.c +++ b/drivers/pci/controller/pcie-xilinx.c @@ -93,23 +93,23 @@ /** * struct xilinx_pcie_port - PCIe port information * @reg_base: IO Mapped Register Base - * @irq: Interrupt number * @dev: Device pointer + * @msi_map: Bitmap of allocated MSIs + * @map_lock: Mutex protecting the MSI allocation * @msi_domain: MSI IRQ domain pointer * @leg_domain: Legacy IRQ domain pointer * @resources: Bus Resources */ struct xilinx_pcie_port { void __iomem *reg_base; - u32 irq; struct device *dev; + unsigned long msi_map[BITS_TO_LONGS(XILINX_NUM_MSI_IRQS)]; + struct mutex map_lock; struct irq_domain *msi_domain; struct irq_domain *leg_domain; struct list_head resources; }; -static DECLARE_BITMAP(msi_irq_in_use, XILINX_NUM_MSI_IRQS); - static inline u32 pcie_read(struct xilinx_pcie_port *port, u32 reg) { return readl(port->reg_base + reg); @@ -194,145 +194,116 @@ static struct pci_ops xilinx_pcie_ops = { /* MSI functions */ -/** - * xilinx_pcie_destroy_msi - Free MSI number - * @irq: IRQ to be freed - */ -static void xilinx_pcie_destroy_msi(unsigned int irq) +static void xilinx_msi_top_irq_ack(struct irq_data *d) { - struct msi_desc *msi; - struct xilinx_pcie_port *port; - struct irq_data *d = irq_get_irq_data(irq); - irq_hw_number_t hwirq = irqd_to_hwirq(d); - - if (!test_bit(hwirq, msi_irq_in_use)) { - msi = irq_get_msi_desc(irq); - port = msi_desc_to_pci_sysdata(msi); - dev_err(port->dev, "Trying to free unused MSI#%d\n", irq); - } else { - clear_bit(hwirq, msi_irq_in_use); - } + /* + * xilinx_pcie_intr_handler() will have performed the Ack. + * Eventually, this should be fixed and the Ack be moved in + * the respective callbacks for INTx and MSI. + */ } -/** - * xilinx_pcie_assign_msi - Allocate MSI number - * - * Return: A valid IRQ on success and error value on failure. - */ -static int xilinx_pcie_assign_msi(void) -{ - int pos; +static struct irq_chip xilinx_msi_top_chip = { + .name = "PCIe MSI", + .irq_ack = xilinx_msi_top_irq_ack, +}; - pos = find_first_zero_bit(msi_irq_in_use, XILINX_NUM_MSI_IRQS); - if (pos < XILINX_NUM_MSI_IRQS) - set_bit(pos, msi_irq_in_use); - else +static int xilinx_msi_set_affinity(struct irq_data *d, const struct cpumask *mask, bool force) +{ + return -EINVAL; +} + +static void xilinx_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct xilinx_pcie_port *pcie = irq_data_get_irq_chip_data(data); + phys_addr_t pa = ALIGN_DOWN(virt_to_phys(pcie), SZ_4K); + + msg->address_lo = lower_32_bits(pa); + msg->address_hi = upper_32_bits(pa); + msg->data = data->hwirq; +} + +static struct irq_chip xilinx_msi_bottom_chip = { + .name = "Xilinx MSI", + .irq_set_affinity = xilinx_msi_set_affinity, + .irq_compose_msi_msg = xilinx_compose_msi_msg, +}; + +static int xilinx_msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *args) +{ + struct xilinx_pcie_port *port = domain->host_data; + int hwirq, i; + + mutex_lock(&port->map_lock); + + hwirq = bitmap_find_free_region(port->msi_map, XILINX_NUM_MSI_IRQS, order_base_2(nr_irqs)); + + mutex_unlock(&port->map_lock); + + if (hwirq < 0) return -ENOSPC; - return pos; -} - -/** - * xilinx_msi_teardown_irq - Destroy the MSI - * @chip: MSI Chip descriptor - * @irq: MSI IRQ to destroy - */ -static void xilinx_msi_teardown_irq(struct msi_controller *chip, - unsigned int irq) -{ - xilinx_pcie_destroy_msi(irq); - irq_dispose_mapping(irq); -} - -/** - * xilinx_pcie_msi_setup_irq - Setup MSI request - * @chip: MSI chip pointer - * @pdev: PCIe device pointer - * @desc: MSI descriptor pointer - * - * Return: '0' on success and error value on failure - */ -static int xilinx_pcie_msi_setup_irq(struct msi_controller *chip, - struct pci_dev *pdev, - struct msi_desc *desc) -{ - struct xilinx_pcie_port *port = pdev->bus->sysdata; - unsigned int irq; - int hwirq; - struct msi_msg msg; - phys_addr_t msg_addr; - - hwirq = xilinx_pcie_assign_msi(); - if (hwirq < 0) - return hwirq; - - irq = irq_create_mapping(port->msi_domain, hwirq); - if (!irq) - return -EINVAL; - - irq_set_msi_desc(irq, desc); - - msg_addr = ALIGN_DOWN(virt_to_phys(port), SZ_4K); - - msg.address_hi = upper_32_bits(msg_addr); - msg.address_lo = lower_32_bits(msg_addr); - msg.data = irq; - - pci_write_msi_msg(irq, &msg); + for (i = 0; i < nr_irqs; i++) + irq_domain_set_info(domain, virq + i, hwirq + i, + &xilinx_msi_bottom_chip, domain->host_data, + handle_edge_irq, NULL, NULL); return 0; } -/* MSI Chip Descriptor */ -static struct msi_controller xilinx_pcie_msi_chip = { - .setup_irq = xilinx_pcie_msi_setup_irq, - .teardown_irq = xilinx_msi_teardown_irq, -}; - -/* HW Interrupt Chip Descriptor */ -static struct irq_chip xilinx_msi_irq_chip = { - .name = "Xilinx PCIe MSI", - .irq_enable = pci_msi_unmask_irq, - .irq_disable = pci_msi_mask_irq, - .irq_mask = pci_msi_mask_irq, - .irq_unmask = pci_msi_unmask_irq, -}; - -/** - * xilinx_pcie_msi_map - Set the handler for the MSI and mark IRQ as valid - * @domain: IRQ domain - * @irq: Virtual IRQ number - * @hwirq: HW interrupt number - * - * Return: Always returns 0. - */ -static int xilinx_pcie_msi_map(struct irq_domain *domain, unsigned int irq, - irq_hw_number_t hwirq) +static void xilinx_msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) { - irq_set_chip_and_handler(irq, &xilinx_msi_irq_chip, handle_simple_irq); - irq_set_chip_data(irq, domain->host_data); + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct xilinx_pcie_port *port = domain->host_data; + + mutex_lock(&port->map_lock); + + bitmap_release_region(port->msi_map, d->hwirq, order_base_2(nr_irqs)); + + mutex_unlock(&port->map_lock); +} + +static const struct irq_domain_ops xilinx_msi_domain_ops = { + .alloc = xilinx_msi_domain_alloc, + .free = xilinx_msi_domain_free, +}; + +static struct msi_domain_info xilinx_msi_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS), + .chip = &xilinx_msi_top_chip, +}; + +static int xilinx_allocate_msi_domains(struct xilinx_pcie_port *pcie) +{ + struct fwnode_handle *fwnode = dev_fwnode(pcie->dev); + struct irq_domain *parent; + + parent = irq_domain_create_linear(fwnode, XILINX_NUM_MSI_IRQS, + &xilinx_msi_domain_ops, pcie); + if (!parent) { + dev_err(pcie->dev, "failed to create IRQ domain\n"); + return -ENOMEM; + } + irq_domain_update_bus_token(parent, DOMAIN_BUS_NEXUS); + + pcie->msi_domain = pci_msi_create_irq_domain(fwnode, &xilinx_msi_info, parent); + if (!pcie->msi_domain) { + dev_err(pcie->dev, "failed to create MSI domain\n"); + irq_domain_remove(parent); + return -ENOMEM; + } return 0; } -/* IRQ Domain operations */ -static const struct irq_domain_ops msi_domain_ops = { - .map = xilinx_pcie_msi_map, -}; - -/** - * xilinx_pcie_enable_msi - Enable MSI support - * @port: PCIe port information - */ -static int xilinx_pcie_enable_msi(struct xilinx_pcie_port *port) +static void xilinx_free_msi_domains(struct xilinx_pcie_port *pcie) { - phys_addr_t msg_addr; + struct irq_domain *parent = pcie->msi_domain->parent; - msg_addr = ALIGN_DOWN(virt_to_phys(port), SZ_4K); - pcie_write(port, upper_32_bits(msg_addr), XILINX_PCIE_REG_MSIBASE1); - pcie_write(port, lower_32_bits(msg_addr), XILINX_PCIE_REG_MSIBASE2); - - return 0; + irq_domain_remove(pcie->msi_domain); + irq_domain_remove(parent); } /* INTx Functions */ @@ -414,6 +385,8 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data) } if (status & (XILINX_PCIE_INTR_INTX | XILINX_PCIE_INTR_MSI)) { + unsigned int irq; + val = pcie_read(port, XILINX_PCIE_REG_RPIFR1); /* Check whether interrupt valid */ @@ -426,20 +399,19 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data) if (val & XILINX_PCIE_RPIFR1_MSI_INTR) { val = pcie_read(port, XILINX_PCIE_REG_RPIFR2) & XILINX_PCIE_RPIFR2_MSG_DATA; + irq = irq_find_mapping(port->msi_domain->parent, val); } else { val = (val & XILINX_PCIE_RPIFR1_INTR_MASK) >> XILINX_PCIE_RPIFR1_INTR_SHIFT; - val = irq_find_mapping(port->leg_domain, val); + irq = irq_find_mapping(port->leg_domain, val); } /* Clear interrupt FIFO register 1 */ pcie_write(port, XILINX_PCIE_RPIFR1_ALL_MASK, XILINX_PCIE_REG_RPIFR1); - /* Handle the interrupt */ - if (IS_ENABLED(CONFIG_PCI_MSI) || - !(val & XILINX_PCIE_RPIFR1_MSI_INTR)) - generic_handle_irq(val); + if (irq) + generic_handle_irq(irq); } if (status & XILINX_PCIE_INTR_SLV_UNSUPP) @@ -485,12 +457,11 @@ error: static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port) { struct device *dev = port->dev; - struct device_node *node = dev->of_node; struct device_node *pcie_intc_node; int ret; /* Setup INTx */ - pcie_intc_node = of_get_next_child(node, NULL); + pcie_intc_node = of_get_next_child(dev->of_node, NULL); if (!pcie_intc_node) { dev_err(dev, "No PCIe Intc node found\n"); return -ENODEV; @@ -507,18 +478,14 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port) /* Setup MSI */ if (IS_ENABLED(CONFIG_PCI_MSI)) { - port->msi_domain = irq_domain_add_linear(node, - XILINX_NUM_MSI_IRQS, - &msi_domain_ops, - &xilinx_pcie_msi_chip); - if (!port->msi_domain) { - dev_err(dev, "Failed to get a MSI IRQ domain\n"); - return -ENODEV; - } + phys_addr_t pa = ALIGN_DOWN(virt_to_phys(port), SZ_4K); - ret = xilinx_pcie_enable_msi(port); + ret = xilinx_allocate_msi_domains(port); if (ret) return ret; + + pcie_write(port, upper_32_bits(pa), XILINX_PCIE_REG_MSIBASE1); + pcie_write(port, lower_32_bits(pa), XILINX_PCIE_REG_MSIBASE2); } return 0; @@ -566,6 +533,7 @@ static int xilinx_pcie_parse_dt(struct xilinx_pcie_port *port) struct device *dev = port->dev; struct device_node *node = dev->of_node; struct resource regs; + unsigned int irq; int err; err = of_address_to_resource(node, 0, ®s); @@ -578,12 +546,12 @@ static int xilinx_pcie_parse_dt(struct xilinx_pcie_port *port) if (IS_ERR(port->reg_base)) return PTR_ERR(port->reg_base); - port->irq = irq_of_parse_and_map(node, 0); - err = devm_request_irq(dev, port->irq, xilinx_pcie_intr_handler, + irq = irq_of_parse_and_map(node, 0); + err = devm_request_irq(dev, irq, xilinx_pcie_intr_handler, IRQF_SHARED | IRQF_NO_THREAD, "xilinx-pcie", port); if (err) { - dev_err(dev, "unable to request irq %d\n", port->irq); + dev_err(dev, "unable to request irq %d\n", irq); return err; } @@ -611,7 +579,7 @@ static int xilinx_pcie_probe(struct platform_device *pdev) return -ENODEV; port = pci_host_bridge_priv(bridge); - + mutex_init(&port->map_lock); port->dev = dev; err = xilinx_pcie_parse_dt(port); @@ -631,11 +599,11 @@ static int xilinx_pcie_probe(struct platform_device *pdev) bridge->sysdata = port; bridge->ops = &xilinx_pcie_ops; -#ifdef CONFIG_PCI_MSI - xilinx_pcie_msi_chip.dev = dev; - bridge->msi = &xilinx_pcie_msi_chip; -#endif - return pci_host_probe(bridge); + err = pci_host_probe(bridge); + if (err) + xilinx_free_msi_domains(port); + + return err; } static const struct of_device_id xilinx_pcie_of_match[] = { From e0fad163b6e7482be32ae99662240268aa0e1bf1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:37 +0100 Subject: [PATCH 0532/1379] PCI: hv: Drop msi_controller structure The Hyper-V PCI driver still makes use of a msi_controller structure, but it looks more like a distant leftover than anything actually useful, since it is initialised to 0 and never used for anything. Just remove it. Link: https://lore.kernel.org/r/20210330151145.997953-7-maz@kernel.org Tested-by: Michael Kelley Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Reviewed-by: Michael Kelley Acked-by: Bjorn Helgaas --- drivers/pci/controller/pci-hyperv.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 27a17a1e4a7c..2c014ba7ed4b 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -473,7 +473,6 @@ struct hv_pcibus_device { struct list_head dr_list; struct msi_domain_info msi_info; - struct msi_controller msi_chip; struct irq_domain *irq_domain; spinlock_t retarget_msi_interrupt_lock; @@ -1866,9 +1865,6 @@ static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus) if (!hbus->pci_bus) return -ENODEV; - hbus->pci_bus->msi = &hbus->msi_chip; - hbus->pci_bus->msi->dev = &hbus->hdev->device; - pci_lock_rescan_remove(); pci_scan_child_bus(hbus->pci_bus); hv_pci_assign_numa_node(hbus); From 3a05d08f6cc75b74079290c33d6127b2857226fa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:38 +0100 Subject: [PATCH 0533/1379] PCI/MSI: Drop use of msi_controller from core code As there is no driver using msi_controller, we can now safely remove its use from the PCI probe code. Link: https://lore.kernel.org/r/20210330151145.997953-8-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/msi.c | 23 +---------------------- drivers/pci/probe.c | 2 -- include/linux/pci.h | 2 -- 3 files changed, 1 insertion(+), 26 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 3162f88fe940..79b5a995bd02 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -64,39 +64,18 @@ static void pci_msi_teardown_msi_irqs(struct pci_dev *dev) /* Arch hooks */ int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { - struct msi_controller *chip = dev->bus->msi; - int err; - - if (!chip || !chip->setup_irq) - return -EINVAL; - - err = chip->setup_irq(chip, dev, desc); - if (err < 0) - return err; - - irq_set_chip_data(desc->irq, chip); - - return 0; + return -EINVAL; } void __weak arch_teardown_msi_irq(unsigned int irq) { - struct msi_controller *chip = irq_get_chip_data(irq); - - if (!chip || !chip->teardown_irq) - return; - - chip->teardown_irq(chip, irq); } int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - struct msi_controller *chip = dev->bus->msi; struct msi_desc *entry; int ret; - if (chip && chip->setup_irqs) - return chip->setup_irqs(chip, dev, nvec, type); /* * If an architecture wants to support multiple MSI, it needs to * override arch_setup_msi_irqs() diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 953f15abc850..fb04fc81a8bd 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -895,7 +895,6 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) /* Temporarily move resources off the list */ list_splice_init(&bridge->windows, &resources); bus->sysdata = bridge->sysdata; - bus->msi = bridge->msi; bus->ops = bridge->ops; bus->number = bus->busn_res.start = bridge->busnr; #ifdef CONFIG_PCI_DOMAINS_GENERIC @@ -1053,7 +1052,6 @@ static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent, return NULL; child->parent = parent; - child->msi = parent->msi; child->sysdata = parent->sysdata; child->bus_flags = parent->bus_flags; diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..ebf557e59d87 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -540,7 +540,6 @@ struct pci_host_bridge { int (*map_irq)(const struct pci_dev *, u8, u8); void (*release_fn)(struct pci_host_bridge *); void *release_data; - struct msi_controller *msi; unsigned int ignore_reset_delay:1; /* For entire hierarchy */ unsigned int no_ext_tags:1; /* No Extended Tags */ unsigned int native_aer:1; /* OS may use PCIe AER */ @@ -621,7 +620,6 @@ struct pci_bus { struct resource busn_res; /* Bus numbers routed to this bus */ struct pci_ops *ops; /* Configuration access functions */ - struct msi_controller *msi; /* MSI controller */ void *sysdata; /* Hook for sys-specific extension */ struct proc_dir_entry *procdir; /* Directory entry in /proc/bus/pci */ From b227be0d7314d0869d4e28c199ac1fc7075cf06e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:39 +0100 Subject: [PATCH 0534/1379] PCI/MSI: Kill msi_controller structure msi_controller had a good, long life as the abstraction for a driver providing MSIs to PCI devices. But it has been replaced in all drivers by the more expressive generic MSI framework. Farewell, struct msi_controller. Link: https://lore.kernel.org/r/20210330151145.997953-9-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- include/linux/msi.h | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/include/linux/msi.h b/include/linux/msi.h index aef35fd1cf11..3f21e77b57b7 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -240,8 +240,7 @@ void pci_msi_unmask_irq(struct irq_data *data); /* * The arch hooks to setup up msi irqs. Default functions are implemented * as weak symbols so that they /can/ be overriden by architecture specific - * code if needed. These hooks must be enabled by the architecture or by - * drivers which depend on them via msi_controller based MSI handling. + * code if needed. These hooks can only be enabled by the architecture. * * If CONFIG_PCI_MSI_ARCH_FALLBACKS is not selected they are replaced by * stubs with warnings. @@ -272,19 +271,6 @@ static inline void arch_teardown_msi_irqs(struct pci_dev *dev) void arch_restore_msi_irqs(struct pci_dev *dev); void default_restore_msi_irqs(struct pci_dev *dev); -struct msi_controller { - struct module *owner; - struct device *dev; - struct device_node *of_node; - struct list_head list; - - int (*setup_irq)(struct msi_controller *chip, struct pci_dev *dev, - struct msi_desc *desc); - int (*setup_irqs)(struct msi_controller *chip, struct pci_dev *dev, - int nvec, int type); - void (*teardown_irq)(struct msi_controller *chip, unsigned int irq); -}; - #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN #include From f8bcf249d9cf292c6ceb3d9f5bd90815090f5286 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:40 +0100 Subject: [PATCH 0535/1379] PCI/MSI: Kill default_teardown_msi_irqs() It doesn't have any caller left. Link: https://lore.kernel.org/r/20210330151145.997953-10-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/msi.c | 11 +---------- include/linux/msi.h | 1 - 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 79b5a995bd02..d9c73c173c14 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -94,11 +94,7 @@ int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; } -/* - * We have a default implementation available as a separate non-weak - * function, as it is used by the Xen x86 PCI code - */ -void default_teardown_msi_irqs(struct pci_dev *dev) +void __weak arch_teardown_msi_irqs(struct pci_dev *dev) { int i; struct msi_desc *entry; @@ -108,11 +104,6 @@ void default_teardown_msi_irqs(struct pci_dev *dev) for (i = 0; i < entry->nvec_used; i++) arch_teardown_msi_irq(entry->irq + i); } - -void __weak arch_teardown_msi_irqs(struct pci_dev *dev) -{ - return default_teardown_msi_irqs(dev); -} #endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ static void default_restore_msi_irq(struct pci_dev *dev, int irq) diff --git a/include/linux/msi.h b/include/linux/msi.h index 3f21e77b57b7..6aff469e511d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -250,7 +250,6 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); -void default_teardown_msi_irqs(struct pci_dev *dev); #else static inline int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { From 94e89b145371b68fa0ea294855adebcd03e0522e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:41 +0100 Subject: [PATCH 0536/1379] PCI/MSI: Let PCI host bridges declare their reliance on MSI domains There is a whole class of host bridges that cannot know whether MSIs will be provided or not, as they rely on other blocks to provide the MSI functionnality, using MSI domains. This is the case for example on systems that use the ARM GIC architecture. Introduce a new attribute ('msi_domain') indicating that implicit dependency, and use this property to set the NO_MSI flag when no MSI domain is found at probe time. Link: https://lore.kernel.org/r/20210330151145.997953-11-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/probe.c | 2 ++ include/linux/pci.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index fb04fc81a8bd..aa6fba35f5d1 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -925,6 +925,8 @@ static int pci_register_host_bridge(struct pci_host_bridge *bridge) device_enable_async_suspend(bus->bridge); pci_set_bus_of_node(bus); pci_set_bus_msi_domain(bus); + if (bridge->msi_domain && !dev_get_msi_domain(&bus->dev)) + bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI; if (!parent) set_dev_node(bus->bridge, pcibus_to_node(bus)); diff --git a/include/linux/pci.h b/include/linux/pci.h index ebf557e59d87..ede0aef2cfd4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -550,6 +550,7 @@ struct pci_host_bridge { unsigned int native_dpc:1; /* OS may use PCIe DPC */ unsigned int preserve_config:1; /* Preserve FW resource setup */ unsigned int size_windows:1; /* Enable root bus sizing */ + unsigned int msi_domain:1; /* Bridge wants MSI domain */ /* Resource alignment requirements */ resource_size_t (*align_resource)(struct pci_dev *dev, From 9ec37efb87832b578d7972fc80b04d94f5d2bbe3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:42 +0100 Subject: [PATCH 0537/1379] PCI/MSI: Make pci_host_common_probe() declare its reliance on MSI domains The generic PCI host driver relies on MSI domains for MSIs to be provided to its end-points. Make this dependency explicit. This cures the warnings occuring on arm/arm64 VMs when booted with PCI virtio devices and no MSI controller (no GICv3 ITS, for example). It is likely that other drivers will need to express the same dependency. Link: https://lore.kernel.org/r/20210330151145.997953-12-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/controller/pci-host-common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pci-host-common.c b/drivers/pci/controller/pci-host-common.c index 6ab694f8d283..d3924a44db02 100644 --- a/drivers/pci/controller/pci-host-common.c +++ b/drivers/pci/controller/pci-host-common.c @@ -79,6 +79,7 @@ int pci_host_common_probe(struct platform_device *pdev) bridge->sysdata = cfg; bridge->ops = (struct pci_ops *)&ops->pci_ops; + bridge->msi_domain = true; return pci_host_probe(bridge); } From 645e9c38383d7fcde2784ee537fa18ec9bed54d9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Mar 2021 16:11:43 +0100 Subject: [PATCH 0538/1379] PCI: mediatek: Advertise lack of built-in MSI handling Some Mediatek host bridges cannot handle MSIs, which is sad. This also results in an ugly warning at device probe time, as the core PCI code wasn't told that MSIs were not available. Advertise this fact to the rest of the core PCI code by using the 'msi_domain' attribute, which still opens the possibility for another block to provide the MSI functionnality. [maz: commit message, switched over to msi_domain attribute] Link: https://lore.kernel.org/r/20210330151145.997953-13-maz@kernel.org Reported-by: Frank Wunderlich Signed-off-by: Thomas Gleixner Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/controller/pcie-mediatek.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/controller/pcie-mediatek.c b/drivers/pci/controller/pcie-mediatek.c index 23548b517e4b..9c10d87b6134 100644 --- a/drivers/pci/controller/pcie-mediatek.c +++ b/drivers/pci/controller/pcie-mediatek.c @@ -143,6 +143,7 @@ struct mtk_pcie_port; * struct mtk_pcie_soc - differentiate between host generations * @need_fix_class_id: whether this host's class ID needed to be fixed or not * @need_fix_device_id: whether this host's device ID needed to be fixed or not + * @no_msi: Bridge has no MSI support, and relies on an external block * @device_id: device ID which this host need to be fixed * @ops: pointer to configuration access functions * @startup: pointer to controller setting functions @@ -151,6 +152,7 @@ struct mtk_pcie_port; struct mtk_pcie_soc { bool need_fix_class_id; bool need_fix_device_id; + bool no_msi; unsigned int device_id; struct pci_ops *ops; int (*startup)(struct mtk_pcie_port *port); @@ -1087,6 +1089,7 @@ static int mtk_pcie_probe(struct platform_device *pdev) host->ops = pcie->soc->ops; host->sysdata = pcie; + host->msi_domain = pcie->soc->no_msi; err = pci_host_probe(host); if (err) @@ -1176,6 +1179,7 @@ static const struct dev_pm_ops mtk_pcie_pm_ops = { }; static const struct mtk_pcie_soc mtk_pcie_soc_v1 = { + .no_msi = true, .ops = &mtk_pcie_ops, .startup = mtk_pcie_startup_port, }; From 61af69296cbadf4c88a1076a3b3e5572827b04c8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:44 +0100 Subject: [PATCH 0539/1379] PCI/MSI: Document the various ways of ending up with NO_MSI We have now three ways of ending up with NO_MSI being set. Document them. Link: https://lore.kernel.org/r/20210330151145.997953-14-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/msi.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index d9c73c173c14..217dc9f0231f 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -871,8 +871,15 @@ static int pci_msi_supported(struct pci_dev *dev, int nvec) * Any bridge which does NOT route MSI transactions from its * secondary bus to its primary bus must set NO_MSI flag on * the secondary pci_bus. - * We expect only arch-specific PCI host bus controller driver - * or quirks for specific PCI bridges to be setting NO_MSI. + * + * The NO_MSI flag can either be set directly by: + * - arch-specific PCI host bus controller drivers (deprecated) + * - quirks for specific PCI bridges + * + * or indirectly by platform-specific PCI host bridge drivers by + * advertising the 'msi_domain' property, which results in + * the NO_MSI flag when no MSI domain is found for this bridge + * at probe time. */ for (bus = dev->bus; bus; bus = bus->parent) if (bus->bus_flags & PCI_BUS_FLAGS_NO_MSI) From 557853f4e23e60b6c5a6ec4771bbdf39bbae15d0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 30 Mar 2021 16:11:45 +0100 Subject: [PATCH 0540/1379] PCI: Refactor HT advertising of NO_MSI flag The few quirks that deal with NO_MSI tend to be copy-paste heavy. Refactor them so that the hierarchy of conditions is slightly cleaner. Link: https://lore.kernel.org/r/20210330151145.997953-15-maz@kernel.org Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas --- drivers/pci/quirks.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 653660e3ba9e..972bb0f9f994 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2585,10 +2585,8 @@ static int msi_ht_cap_enabled(struct pci_dev *dev) /* Check the HyperTransport MSI mapping to know whether MSI is enabled or not */ static void quirk_msi_ht_cap(struct pci_dev *dev) { - if (dev->subordinate && !msi_ht_cap_enabled(dev)) { - pci_warn(dev, "MSI quirk detected; subordinate MSI disabled\n"); - dev->subordinate->bus_flags |= PCI_BUS_FLAGS_NO_MSI; - } + if (!msi_ht_cap_enabled(dev)) + quirk_disable_msi(dev); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_HT2000_PCIE, quirk_msi_ht_cap); @@ -2601,9 +2599,6 @@ static void quirk_nvidia_ck804_msi_ht_cap(struct pci_dev *dev) { struct pci_dev *pdev; - if (!dev->subordinate) - return; - /* * Check HT MSI cap on this chipset and the root one. A single one * having MSI is enough to be sure that MSI is supported. @@ -2611,10 +2606,8 @@ static void quirk_nvidia_ck804_msi_ht_cap(struct pci_dev *dev) pdev = pci_get_slot(dev->bus, 0); if (!pdev) return; - if (!msi_ht_cap_enabled(dev) && !msi_ht_cap_enabled(pdev)) { - pci_warn(dev, "MSI quirk detected; subordinate MSI disabled\n"); - dev->subordinate->bus_flags |= PCI_BUS_FLAGS_NO_MSI; - } + if (!msi_ht_cap_enabled(pdev)) + quirk_msi_ht_cap(dev); pci_dev_put(pdev); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_CK804_PCIE, From 5a4a8235fee69b5a31cf1c56a9fa14b0d21a930c Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Tue, 6 Apr 2021 17:19:12 +0800 Subject: [PATCH 0541/1379] thermal/drivers/ti-soc-thermal/ti-bandgap: Rearrange all the included header files alphabetically For the sake of lisibility, reorder the header files alphabetically. Signed-off-by: Zhen Lei Reviewed-by: Keerthy Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210406091912.2583-2-thunder.leizhen@huawei.com --- drivers/thermal/ti-soc-thermal/ti-bandgap.c | 38 ++++++++++----------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/thermal/ti-soc-thermal/ti-bandgap.c b/drivers/thermal/ti-soc-thermal/ti-bandgap.c index d81af89166d2..008fbed5a118 100644 --- a/drivers/thermal/ti-soc-thermal/ti-bandgap.c +++ b/drivers/thermal/ti-soc-thermal/ti-bandgap.c @@ -9,29 +9,29 @@ * Eduardo Valentin */ -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "ti-bandgap.h" From 670160fea22c587b384d56698bbb661fa4801534 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 20 Apr 2021 10:10:09 -0700 Subject: [PATCH 0542/1379] platform/chrome: cros_ec_typec: Track port role Stash the currently reported port role in the port struct and add a check for that too while determining whether to re-configure on-board Type C switches (this deals with cases like role swaps where the mux flags don't change, but the port role does). Signed-off-by: Prashant Malani Suggested-by: Nikunj A. Dadhania Tested-by: Deepti Deshatty Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210420171008.3829549-1-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index d3df1717a5fd..1a06b8c80f80 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -58,6 +58,7 @@ struct cros_typec_port { /* Variables keeping track of switch state. */ struct typec_mux_state state; uint8_t mux_flags; + uint8_t role; /* Port alt modes. */ struct typec_altmode p_altmode[CROS_EC_ALTMODE_MAX]; @@ -995,10 +996,12 @@ static int cros_typec_port_update(struct cros_typec_data *typec, int port_num) } /* No change needs to be made, let's exit early. */ - if (typec->ports[port_num]->mux_flags == mux_resp.flags) + if (typec->ports[port_num]->mux_flags == mux_resp.flags && + typec->ports[port_num]->role == resp.role) return 0; typec->ports[port_num]->mux_flags = mux_resp.flags; + typec->ports[port_num]->role = resp.role; ret = cros_typec_configure_mux(typec, port_num, mux_resp.flags, &resp); if (ret) dev_warn(typec->dev, "Configure muxes failed, err = %d\n", ret); From 67880f1bc342ed4c94e72cad7f8ca76e5121aae3 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 20 Apr 2021 10:16:11 -0700 Subject: [PATCH 0543/1379] platform/chrome: cros_ec: Add Type C hard reset Update the EC command header to include the new event bit. This bit is included in the latest version of the Chrome EC headers[1]. [1] https://chromium.googlesource.com/chromiumos/platform/ec/+/refs/heads/main/include/ec_commands.h Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210420171617.3830902-1-pmalani@chromium.org --- include/linux/platform_data/cros_ec_commands.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/platform_data/cros_ec_commands.h b/include/linux/platform_data/cros_ec_commands.h index 5ff8597ceabd..9156078c6fc6 100644 --- a/include/linux/platform_data/cros_ec_commands.h +++ b/include/linux/platform_data/cros_ec_commands.h @@ -5678,6 +5678,7 @@ enum tcpc_cc_polarity { #define PD_STATUS_EVENT_SOP_DISC_DONE BIT(0) #define PD_STATUS_EVENT_SOP_PRIME_DISC_DONE BIT(1) +#define PD_STATUS_EVENT_HARD_RESET BIT(2) struct ec_params_typec_status { uint8_t port; From 944b3a639573796debe3cd47298a5dd79810be73 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 20 Apr 2021 10:16:13 -0700 Subject: [PATCH 0544/1379] platform/chrome: cros_ec_typec: Handle hard reset The Chrome Embedded Controller (EC) generates a hard reset type C event when a USB Power Delivery (PD) hard reset is encountered. Handle this event by unregistering the partner and cable on the associated port and clearing the event flag. Cc: Benson Leung Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210420171617.3830902-2-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 1a06b8c80f80..60c21911a6c8 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -906,6 +906,19 @@ static void cros_typec_handle_status(struct cros_typec_data *typec, int port_num return; } + /* If we got a hard reset, unregister everything and return. */ + if (resp.events & PD_STATUS_EVENT_HARD_RESET) { + cros_typec_remove_partner(typec, port_num); + cros_typec_remove_cable(typec, port_num); + + ret = cros_typec_send_clear_event(typec, port_num, + PD_STATUS_EVENT_HARD_RESET); + if (ret < 0) + dev_warn(typec->dev, + "Failed hard reset event clear, port: %d\n", port_num); + return; + } + /* Handle any events appropriately. */ if (resp.events & PD_STATUS_EVENT_SOP_DISC_DONE && !typec->ports[port_num]->sop_disc_done) { u16 sop_revision; From bd5d553653e4151030ad2a94ef39a46b40c75a9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Thu, 15 Apr 2021 13:21:21 +0200 Subject: [PATCH 0545/1379] dt-bindings: thermal: thermal-sensor: require "#thermal-sensor-cells" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This property is required for every thermal sensor as it's used when using phandles. Signed-off-by: Rafał Miłecki Acked-by: Rob Herring Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210415112121.4999-1-zajec5@gmail.com --- Documentation/devicetree/bindings/thermal/thermal-sensor.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/thermal/thermal-sensor.yaml b/Documentation/devicetree/bindings/thermal/thermal-sensor.yaml index 9f747921e851..4bd345c71eb8 100644 --- a/Documentation/devicetree/bindings/thermal/thermal-sensor.yaml +++ b/Documentation/devicetree/bindings/thermal/thermal-sensor.yaml @@ -36,6 +36,9 @@ properties: containing several internal sensors. enum: [0, 1] +required: + - "#thermal-sensor-cells" + additionalProperties: true examples: From 76c50eb70d8e1133eaada0013845619c36345fbc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 20 Nov 2020 12:26:40 -0600 Subject: [PATCH 0546/1379] nfsd: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix multiple warnings by explicitly adding a couple of break statements instead of just letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 1 + fs/nfsd/nfsctl.c | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7364af580d83..b517a8794400 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3139,6 +3139,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out_nolock; } new->cl_mach_cred = true; + break; case SP4_NONE: break; default: /* checked by xdr code */ diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 853bf50a2a9b..c2c3d9077dc5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1166,6 +1166,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) inode->i_fop = &simple_dir_operations; inode->i_op = &simple_dir_inode_operations; inc_nlink(inode); + break; default: break; } From d12b64b9764ea17554fb230784ebf91287ed807e Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 20 Apr 2021 22:40:26 -0700 Subject: [PATCH 0547/1379] MAINTAINERS: repair reference in HYCON HY46XX TOUCHSCREEN SUPPORT Commit aa2f62cf211a ("Input: add driver for the Hycon HY46XX touchpanel series") adds the file ./drivers/input/touchscreen/hycon-hy46xx.c, but the file entry in MAINTAINERS refers to ./drivers/input/touchscreen/hy46xx.c. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains: warning: no file matches F: drivers/input/touchscreen/hy46xx.c Repair the file entry by referring to the right location. Signed-off-by: Lukas Bulwahn Acked-by: Giulio Benetti Link: https://lore.kernel.org/r/20210419060023.3460-1-lukas.bulwahn@gmail.com Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e2a4d7cb38a2..3799f33650f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8249,7 +8249,7 @@ M: Giulio Benetti L: linux-input@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/input/touchscreen/hycon,hy46xx.yaml -F: drivers/input/touchscreen/hy46xx.c +F: drivers/input/touchscreen/hycon-hy46xx.c HYGON PROCESSOR SUPPORT M: Pu Wen From c5bb32f57bf3a30ed03be51f7be0840325ba8b4a Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 20 Apr 2021 21:21:09 -0700 Subject: [PATCH 0548/1379] platform/chrome: cros_ec_typec: Add DP mode check There are certain transitional situations where the dp_mode field in the PD_CONTROL response might not be populated with the right DP pin assignment value yet. Add a check for that to avoid sending an invalid value to the Type C mode switch. Signed-off-by: Prashant Malani Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210421042108.2002-1-pmalani@chromium.org --- drivers/platform/chrome/cros_ec_typec.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/platform/chrome/cros_ec_typec.c b/drivers/platform/chrome/cros_ec_typec.c index 60c21911a6c8..27c068c4c38d 100644 --- a/drivers/platform/chrome/cros_ec_typec.c +++ b/drivers/platform/chrome/cros_ec_typec.c @@ -487,6 +487,11 @@ static int cros_typec_enable_dp(struct cros_typec_data *typec, return -ENOTSUPP; } + if (!pd_ctrl->dp_mode) { + dev_err(typec->dev, "No valid DP mode provided.\n"); + return -EINVAL; + } + /* Status VDO. */ dp_data.status = DP_STATUS_ENABLED; if (port->mux_flags & USB_PD_MUX_HPD_IRQ) From 4423ee65f76818c8a8994e6f5821372661ea7f89 Mon Sep 17 00:00:00 2001 From: Pi-Hsun Shih Date: Wed, 14 Apr 2021 14:45:24 +0800 Subject: [PATCH 0549/1379] platform/chrome: cros_usbpd_notify: Listen to EC_HOST_EVENT_USB_MUX host event On system that use ACPI, cros_usbpd_notify gets notifications of USB MUX host event same as PD host events [1]. But currently on system that use DT, the driver only listen on EC_HOST_EVENT_PD_MCU. Add EC_HOST_EVENT_USB_MUX to the list of host events, so we have same behavior on all platforms. [1]: https://chromium.googlesource.com/chromiumos/third_party/coreboot/+/refs/heads/chromeos-2016.05/src/ec/google/chromeec/acpi/ec.asl#382 Signed-off-by: Pi-Hsun Shih Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210414064524.2450908-1-pihsun@chromium.org --- drivers/platform/chrome/cros_usbpd_notify.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/chrome/cros_usbpd_notify.c b/drivers/platform/chrome/cros_usbpd_notify.c index 7f36142ab12a..48a6617aa12f 100644 --- a/drivers/platform/chrome/cros_usbpd_notify.c +++ b/drivers/platform/chrome/cros_usbpd_notify.c @@ -220,7 +220,8 @@ static int cros_usbpd_notify_plat(struct notifier_block *nb, if (!host_event) return NOTIFY_DONE; - if (host_event & EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU)) { + if (host_event & (EC_HOST_EVENT_MASK(EC_HOST_EVENT_PD_MCU) | + EC_HOST_EVENT_MASK(EC_HOST_EVENT_USB_MUX))) { cros_usbpd_get_event_and_notify(pdnotify->dev, ec_dev); return NOTIFY_OK; } From d61b3f9b91be32f714b218377ab5081932e3ebc2 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Fri, 9 Apr 2021 17:51:38 +0800 Subject: [PATCH 0550/1379] platform/chrome: cros_ec_lpc: Use DEFINE_MUTEX() for mutex lock mutex lock can be initialized automatically with DEFINE_MUTEX() rather than explicitly calling mutex_init(). Reported-by: Hulk Robot Signed-off-by: Ye Bin Signed-off-by: Enric Balletbo i Serra Link: https://lore.kernel.org/r/20210409095138.2293869-1-yebin10@huawei.com --- drivers/platform/chrome/cros_ec_lpc_mec.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/platform/chrome/cros_ec_lpc_mec.c b/drivers/platform/chrome/cros_ec_lpc_mec.c index 9035b17e8c86..bbc2884f5e2f 100644 --- a/drivers/platform/chrome/cros_ec_lpc_mec.c +++ b/drivers/platform/chrome/cros_ec_lpc_mec.c @@ -14,7 +14,7 @@ * This mutex must be held while accessing the EMI unit. We can't rely on the * EC mutex because memmap data may be accessed without it being held. */ -static struct mutex io_mutex; +static DEFINE_MUTEX(io_mutex); static u16 mec_emi_base, mec_emi_end; /** @@ -142,7 +142,6 @@ EXPORT_SYMBOL(cros_ec_lpc_io_bytes_mec); void cros_ec_lpc_mec_init(unsigned int base, unsigned int end) { - mutex_init(&io_mutex); mec_emi_base = base; mec_emi_end = end; } From d473327f8f53418691cb2944a45da3e9ea51f9bf Mon Sep 17 00:00:00 2001 From: Lin Ruizhe Date: Wed, 21 Apr 2021 16:42:56 +0800 Subject: [PATCH 0551/1379] thermal/drivers/ti-soc-thermal/bandgap Remove unused variable 'val' The function ti_bandgap_restore_ctxt() restores the context at resume time. It checks if the sensor has a counter, reads the register but does nothing with the value. The block was probably omitted by the commit b87ea759a4cc. Remove the unused variable as well as the block using it as we can consider it as dead code. Reported-by: Hulk Robot Fixes: b87ea759a4cc ("staging: omap-thermal: fix context restore function") Signed-off-by: Lin Ruizhe Reviewed-by: Tony Lindgren Tested-by: Tony Lindgren Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210421084256.57591-1-linruizhe@huawei.com --- drivers/thermal/ti-soc-thermal/ti-bandgap.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/thermal/ti-soc-thermal/ti-bandgap.c b/drivers/thermal/ti-soc-thermal/ti-bandgap.c index 008fbed5a118..ebe7cb70bfb6 100644 --- a/drivers/thermal/ti-soc-thermal/ti-bandgap.c +++ b/drivers/thermal/ti-soc-thermal/ti-bandgap.c @@ -1142,14 +1142,10 @@ static int ti_bandgap_restore_ctxt(struct ti_bandgap *bgp) for (i = 0; i < bgp->conf->sensor_count; i++) { struct temp_sensor_registers *tsr; struct temp_sensor_regval *rval; - u32 val = 0; rval = &bgp->regval[i]; tsr = bgp->conf->sensors[i].registers; - if (TI_BANDGAP_HAS(bgp, COUNTER)) - val = ti_bandgap_readl(bgp, tsr->bgap_counter); - if (TI_BANDGAP_HAS(bgp, TSHUT_CONFIG)) ti_bandgap_writel(bgp, rval->tshut_threshold, tsr->tshut_threshold); From d99f2487e1de23a2e902d1a359a85a48bfd21fe7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Apr 2021 07:48:43 -0400 Subject: [PATCH 0552/1379] NFS: The 'fattr_valid' field in struct nfs_server should be unsigned int Fix up a static compiler warning: "fs/nfs/nfs4proc.c:3882 _nfs4_server_capabilities() warn: was expecting a 64 bit value instead of '(1 << 11)'" The fix is to convert the fattr_valid field to match the type of the 'valid' field in struct nfs_fattr. Reported-by: Dan Carpenter Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index d28d7a62864f..70057b2e606e 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -156,6 +156,7 @@ struct nfs_server { #define NFS_MOUNT_WRITE_EAGER 0x01000000 #define NFS_MOUNT_WRITE_WAIT 0x02000000 + unsigned int fattr_valid; /* Valid attributes */ unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ unsigned int rpages; /* read size (in pages) */ @@ -191,8 +192,6 @@ struct nfs_server { dev_t s_dev; /* superblock dev numbers */ struct nfs_auth_info auth_info; /* parsed auth flavors */ - __u64 fattr_valid; /* Valid attributes */ - #ifdef CONFIG_NFS_FSCACHE struct nfs_fscache_key *fscache_key; /* unique key for superblock */ struct fscache_cookie *fscache; /* superblock cookie */ From 08e9fdfbb2248e93bbfaeb9cde284776085466cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Tue, 20 Apr 2021 23:01:04 +0200 Subject: [PATCH 0553/1379] dt-bindings: thermal: brcm,ns-thermal: Convert to the json-schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helps validating DTS files. Signed-off-by: Rafał Miłecki Reviewed-by: Rob Herring Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420210104.10555-1-zajec5@gmail.com --- .../bindings/thermal/brcm,ns-thermal.txt | 37 ------------ .../bindings/thermal/brcm,ns-thermal.yaml | 60 +++++++++++++++++++ 2 files changed, 60 insertions(+), 37 deletions(-) delete mode 100644 Documentation/devicetree/bindings/thermal/brcm,ns-thermal.txt create mode 100644 Documentation/devicetree/bindings/thermal/brcm,ns-thermal.yaml diff --git a/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.txt b/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.txt deleted file mode 100644 index 68e047170039..000000000000 --- a/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.txt +++ /dev/null @@ -1,37 +0,0 @@ -* Broadcom Northstar Thermal - -This binding describes thermal sensor that is part of Northstar's DMU (Device -Management Unit). - -Required properties: -- compatible : Must be "brcm,ns-thermal" -- reg : iomem address range of PVTMON registers -- #thermal-sensor-cells : Should be <0> - -Example: - -thermal: thermal@1800c2c0 { - compatible = "brcm,ns-thermal"; - reg = <0x1800c2c0 0x10>; - #thermal-sensor-cells = <0>; -}; - -thermal-zones { - cpu_thermal: cpu-thermal { - polling-delay-passive = <0>; - polling-delay = <1000>; - coefficients = <(-556) 418000>; - thermal-sensors = <&thermal>; - - trips { - cpu-crit { - temperature = <125000>; - hysteresis = <0>; - type = "critical"; - }; - }; - - cooling-maps { - }; - }; -}; diff --git a/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.yaml b/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.yaml new file mode 100644 index 000000000000..fdeb333e010d --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/brcm,ns-thermal.yaml @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/brcm,ns-thermal.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Broadcom Northstar Thermal + +maintainers: + - Rafał Miłecki + +description: + Thermal sensor that is part of Northstar's DMU (Device Management Unit). + +allOf: + - $ref: thermal-sensor.yaml# + +properties: + compatible: + const: brcm,ns-thermal + + reg: + description: PVTMON registers range + maxItems: 1 + + "#thermal-sensor-cells": + const: 0 + +unevaluatedProperties: false + +required: + - reg + +examples: + - | + thermal: thermal@1800c2c0 { + compatible = "brcm,ns-thermal"; + reg = <0x1800c2c0 0x10>; + #thermal-sensor-cells = <0>; + }; + + thermal-zones { + cpu-thermal { + polling-delay-passive = <0>; + polling-delay = <1000>; + coefficients = <(-556) 418000>; + thermal-sensors = <&thermal>; + + trips { + cpu-crit { + temperature = <125000>; + hysteresis = <0>; + type = "critical"; + }; + }; + + cooling-maps { + }; + }; + }; From 85367040511f8402d7e4054d8c17b053c75e33ff Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 21 Apr 2021 23:45:26 +0800 Subject: [PATCH 0554/1379] scsi: blk-mq: Fix build warning when making htmldocs Fixes the following warning when running 'make htmldocs': include/linux/blk-mq.h:395: warning: Function parameter or member 'set_rq_budget_token' not described in 'blk_mq_ops' include/linux/blk-mq.h:395: warning: Function parameter or member 'get_rq_budget_token' not described in 'blk_mq_ops' [mkp: added warning messages] Link: https://lore.kernel.org/r/20210421154526.1954174-1-ming.lei@redhat.com Fixes: d022d18c045f ("scsi: blk-mq: Add callbacks for storing & retrieving budget token") Reported-by: Stephen Rothwell Signed-off-by: Ming Lei Signed-off-by: Martin K. Petersen --- include/linux/blk-mq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3bd3ee651143..359486940fa0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -313,12 +313,12 @@ struct blk_mq_ops { */ void (*put_budget)(struct request_queue *, int); - /* - * @set_rq_budget_toekn: store rq's budget token + /** + * @set_rq_budget_token: store rq's budget token */ void (*set_rq_budget_token)(struct request *, int); - /* - * @get_rq_budget_toekn: retrieve rq's budget token + /** + * @get_rq_budget_token: retrieve rq's budget token */ int (*get_rq_budget_token)(struct request *); From 509f1010e4fc55e2dbfc036317afd573ccd0931c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Apr 2021 09:54:55 +0800 Subject: [PATCH 0555/1379] f2fs: avoid using native allocate_segment_by_default() As we did for other cases, in fix_curseg_write_pointer(), let's use wrapped f2fs_allocate_new_section() instead of native allocate_segment_by_default(), by this way, it fixes to cover segment allocation with curseg_lock and sentry_lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/segment.c | 18 ++++++++++-------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 984a2a546745..3ebb951cc426 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3395,7 +3395,7 @@ void f2fs_get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d697c8900fa7..af7230fb9c1f 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1664,7 +1664,7 @@ next_alloc: down_write(&sbi->pin_sem); f2fs_lock_op(sbi); - f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); map.m_seg_type = CURSEG_COLD_DATA_PINNED; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6e740ecf0814..efd1e57384b9 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2933,7 +2933,7 @@ unlock: } static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, - bool new_sec) + bool new_sec, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; @@ -2941,7 +2941,7 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, if (!curseg->inited) goto alloc; - if (curseg->next_blkoff || + if (force || curseg->next_blkoff || get_valid_blocks(sbi, curseg->segno, new_sec)) goto alloc; @@ -2953,16 +2953,17 @@ alloc: locate_dirty_segment(sbi, old_segno); } -static void __allocate_new_section(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_section(struct f2fs_sb_info *sbi, + int type, bool force) { - __allocate_new_segment(sbi, type, true); + __allocate_new_segment(sbi, type, true, force); } -void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type) +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_section(sbi, type); + __allocate_new_section(sbi, type, force); up_write(&SIT_I(sbi)->sentry_lock); up_read(&SM_I(sbi)->curseg_lock); } @@ -2974,7 +2975,7 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segment(sbi, i, false); + __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); up_read(&SM_I(sbi)->curseg_lock); } @@ -4844,7 +4845,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) f2fs_notice(sbi, "Assign new section to curseg[%d]: " "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); - allocate_segment_by_default(sbi, type, true); + + f2fs_allocate_new_section(sbi, type, true); /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) From 93effd83b6927c0252bb1e35aa3e116d3e2527bb Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Thu, 21 Jan 2021 21:34:04 -0500 Subject: [PATCH 0556/1379] iwlwifi: mvm: tt: Replace thermal_notify_framework thermal_notify_framework just updates for a single trip point where as thermal_zone_device_update does other bookkeeping like updating the temperature of the thermal zone and setting the next trip point etc. Replace thermal_notify_framework with thermal_zone_device_update as the later is more thorough. Acked-by: Kalle Valo Signed-off-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210122023406.3500424-2-thara.gopinath@linaro.org --- drivers/net/wireless/intel/iwlwifi/mvm/tt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c index 2a7339b12b13..398390c59344 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/tt.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/tt.c @@ -146,8 +146,8 @@ void iwl_mvm_temp_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb) if (mvm->tz_device.tzone) { struct iwl_mvm_thermal_device *tz_dev = &mvm->tz_device; - thermal_notify_framework(tz_dev->tzone, - tz_dev->fw_trips_index[ths_crossed]); + thermal_zone_device_update(tz_dev->tzone, + THERMAL_TRIP_VIOLATED); } #endif /* CONFIG_THERMAL */ } From d60d6e7adfc3814f6de03c978ff1daab21478f87 Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Thu, 21 Jan 2021 21:34:05 -0500 Subject: [PATCH 0557/1379] thermal/core: Remove thermal_notify_framework thermal_notify_framework just updates for a single trip point where as thermal_zone_device_update does other bookkeeping like updating the temperature of the thermal zone and setting the next trip point. The only driver that was using thermal_notify_framework was updated in the previous patch to use thermal_zone_device_update instead. Since there are no users for thermal_notify_framework remove it. Signed-off-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210122023406.3500424-3-thara.gopinath@linaro.org --- drivers/thermal/thermal_core.c | 18 ------------------ include/linux/thermal.h | 5 ----- 2 files changed, 23 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 3566fd291399..d20b25f40d19 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -561,24 +561,6 @@ void thermal_zone_device_update(struct thermal_zone_device *tz, } EXPORT_SYMBOL_GPL(thermal_zone_device_update); -/** - * thermal_notify_framework - Sensor drivers use this API to notify framework - * @tz: thermal zone device - * @trip: indicates which trip point has been crossed - * - * This function handles the trip events from sensor drivers. It starts - * throttling the cooling devices according to the policy configured. - * For CRITICAL and HOT trip points, this notifies the respective drivers, - * and does actual throttling for other trip points i.e ACTIVE and PASSIVE. - * The throttling policy is based on the configured platform data; if no - * platform data is provided, this uses the step_wise throttling policy. - */ -void thermal_notify_framework(struct thermal_zone_device *tz, int trip) -{ - handle_thermal_trip(tz, trip); -} -EXPORT_SYMBOL_GPL(thermal_notify_framework); - static void thermal_zone_device_check(struct work_struct *work) { struct thermal_zone_device *tz = container_of(work, struct diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 169502164364..d296f3b88fb9 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -390,7 +390,6 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp); int thermal_zone_get_slope(struct thermal_zone_device *tz); int thermal_zone_get_offset(struct thermal_zone_device *tz); -void thermal_notify_framework(struct thermal_zone_device *, int); int thermal_zone_device_enable(struct thermal_zone_device *tz); int thermal_zone_device_disable(struct thermal_zone_device *tz); void thermal_zone_device_critical(struct thermal_zone_device *tz); @@ -436,10 +435,6 @@ static inline int thermal_zone_get_offset( struct thermal_zone_device *tz) { return -ENODEV; } -static inline void thermal_notify_framework(struct thermal_zone_device *tz, - int trip) -{ } - static inline int thermal_zone_device_enable(struct thermal_zone_device *tz) { return -ENODEV; } From a5655d90cf508598b5f5bbccd911924cf8dc9060 Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Thu, 21 Jan 2021 21:34:06 -0500 Subject: [PATCH 0558/1379] Documentation: driver-api: thermal: Remove thermal_notify_framework from documentation Since thermal_notify_framework is no longer supported/implemented remove the entry from sysfs-api.rst. Signed-off-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210122023406.3500424-4-thara.gopinath@linaro.org --- Documentation/driver-api/thermal/sysfs-api.rst | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/Documentation/driver-api/thermal/sysfs-api.rst b/Documentation/driver-api/thermal/sysfs-api.rst index 29fdd817ddb0..4b638c14bc16 100644 --- a/Documentation/driver-api/thermal/sysfs-api.rst +++ b/Documentation/driver-api/thermal/sysfs-api.rst @@ -730,17 +730,7 @@ This function returns the thermal_instance corresponding to a given {thermal_zone, cooling_device, trip_point} combination. Returns NULL if such an instance does not exist. -4.3. thermal_notify_framework ------------------------------ - -This function handles the trip events from sensor drivers. It starts -throttling the cooling devices according to the policy configured. -For CRITICAL and HOT trip points, this notifies the respective drivers, -and does actual throttling for other trip points i.e ACTIVE and PASSIVE. -The throttling policy is based on the configured platform data; if no -platform data is provided, this uses the step_wise throttling policy. - -4.4. thermal_cdev_update +4.3. thermal_cdev_update ------------------------ This function serves as an arbitrator to set the state of a cooling From 9d51769b2e75bb33c56c8f9ee933eca2d92b375b Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 20 Apr 2021 20:33:35 +0200 Subject: [PATCH 0559/1379] thermal/drivers/tsens: Don't hardcode sensor slope Function compute_intercept_slope hardcode the sensor slope to SLOPE_DEFAULT. Change this and use the default value only if a slope is not defined. This is needed for tsens VER_0 that has a hardcoded slope table. Signed-off-by: Ansuel Smith Reviewed-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420183343.2272-2-ansuelsmth@gmail.com --- drivers/thermal/qcom/tsens.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index a5da6a74a465..d6f802100641 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -85,7 +85,8 @@ void compute_intercept_slope(struct tsens_priv *priv, u32 *p1, "%s: sensor%d - data_point1:%#x data_point2:%#x\n", __func__, i, p1[i], p2[i]); - priv->sensor[i].slope = SLOPE_DEFAULT; + if (!priv->sensor[i].slope) + priv->sensor[i].slope = SLOPE_DEFAULT; if (mode == TWO_PT_CALIB) { /* * slope (m) = adc_code2 - adc_code1 (y2 - y1)/ From a0ed1411278db902a043e584c8ed320fe34346b6 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 20 Apr 2021 20:33:36 +0200 Subject: [PATCH 0560/1379] thermal/drivers/tsens: Convert msm8960 to reg_field Convert msm9860 driver to reg_field to use the init_common function. Signed-off-by: Ansuel Smith Acked-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420183343.2272-3-ansuelsmth@gmail.com --- drivers/thermal/qcom/tsens-8960.c | 73 ++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c index 2a28a5af209e..0d0c2647da86 100644 --- a/drivers/thermal/qcom/tsens-8960.c +++ b/drivers/thermal/qcom/tsens-8960.c @@ -51,11 +51,22 @@ #define MIN_LIMIT_TH 0x0 #define MAX_LIMIT_TH 0xff -#define S0_STATUS_ADDR 0x3628 #define INT_STATUS_ADDR 0x363c #define TRDY_MASK BIT(7) #define TIMEOUT_US 100 +#define S0_STATUS_OFF 0x3628 +#define S1_STATUS_OFF 0x362c +#define S2_STATUS_OFF 0x3630 +#define S3_STATUS_OFF 0x3634 +#define S4_STATUS_OFF 0x3638 +#define S5_STATUS_OFF 0x3664 /* Sensors 5-10 found on apq8064/msm8960 */ +#define S6_STATUS_OFF 0x3668 +#define S7_STATUS_OFF 0x366c +#define S8_STATUS_OFF 0x3670 +#define S9_STATUS_OFF 0x3674 +#define S10_STATUS_OFF 0x3678 + static int suspend_8960(struct tsens_priv *priv) { int ret; @@ -179,7 +190,7 @@ static int init_8960(struct tsens_priv *priv) */ for (i = 0; i < priv->num_sensors; i++) { if (i >= 5) - priv->sensor[i].status = S0_STATUS_ADDR + 40; + priv->sensor[i].status = S0_STATUS_OFF + 40; priv->sensor[i].status += i * 4; } @@ -269,6 +280,63 @@ static int get_temp_8960(const struct tsens_sensor *s, int *temp) return -ETIMEDOUT; } +static const struct reg_field tsens_8960_regfields[MAX_REGFIELDS] = { + /* ----- SROT ------ */ + /* No VERSION information */ + + /* CNTL */ + [TSENS_EN] = REG_FIELD(CNTL_ADDR, 0, 0), + [TSENS_SW_RST] = REG_FIELD(CNTL_ADDR, 1, 1), + /* 8960 has 5 sensors, 8660 has 11, we only handle 5 */ + [SENSOR_EN] = REG_FIELD(CNTL_ADDR, 3, 7), + + /* ----- TM ------ */ + /* INTERRUPT ENABLE */ + /* NO INTERRUPT ENABLE */ + + /* Single UPPER/LOWER TEMPERATURE THRESHOLD for all sensors */ + [LOW_THRESH_0] = REG_FIELD(THRESHOLD_ADDR, 0, 7), + [UP_THRESH_0] = REG_FIELD(THRESHOLD_ADDR, 8, 15), + /* MIN_THRESH_0 and MAX_THRESH_0 are not present in the regfield + * Recycle CRIT_THRESH_0 and 1 to set the required regs to hardcoded temp + * MIN_THRESH_0 -> CRIT_THRESH_1 + * MAX_THRESH_0 -> CRIT_THRESH_0 + */ + [CRIT_THRESH_1] = REG_FIELD(THRESHOLD_ADDR, 16, 23), + [CRIT_THRESH_0] = REG_FIELD(THRESHOLD_ADDR, 24, 31), + + /* UPPER/LOWER INTERRUPT [CLEAR/STATUS] */ + /* 1 == clear, 0 == normal operation */ + [LOW_INT_CLEAR_0] = REG_FIELD(CNTL_ADDR, 9, 9), + [UP_INT_CLEAR_0] = REG_FIELD(CNTL_ADDR, 10, 10), + + /* NO CRITICAL INTERRUPT SUPPORT on 8960 */ + + /* Sn_STATUS */ + [LAST_TEMP_0] = REG_FIELD(S0_STATUS_OFF, 0, 7), + [LAST_TEMP_1] = REG_FIELD(S1_STATUS_OFF, 0, 7), + [LAST_TEMP_2] = REG_FIELD(S2_STATUS_OFF, 0, 7), + [LAST_TEMP_3] = REG_FIELD(S3_STATUS_OFF, 0, 7), + [LAST_TEMP_4] = REG_FIELD(S4_STATUS_OFF, 0, 7), + [LAST_TEMP_5] = REG_FIELD(S5_STATUS_OFF, 0, 7), + [LAST_TEMP_6] = REG_FIELD(S6_STATUS_OFF, 0, 7), + [LAST_TEMP_7] = REG_FIELD(S7_STATUS_OFF, 0, 7), + [LAST_TEMP_8] = REG_FIELD(S8_STATUS_OFF, 0, 7), + [LAST_TEMP_9] = REG_FIELD(S9_STATUS_OFF, 0, 7), + [LAST_TEMP_10] = REG_FIELD(S10_STATUS_OFF, 0, 7), + + /* No VALID field on 8960 */ + /* TSENS_INT_STATUS bits: 1 == threshold violated */ + [MIN_STATUS_0] = REG_FIELD(INT_STATUS_ADDR, 0, 0), + [LOWER_STATUS_0] = REG_FIELD(INT_STATUS_ADDR, 1, 1), + [UPPER_STATUS_0] = REG_FIELD(INT_STATUS_ADDR, 2, 2), + /* No CRITICAL field on 8960 */ + [MAX_STATUS_0] = REG_FIELD(INT_STATUS_ADDR, 3, 3), + + /* TRDY: 1=ready, 0=in progress */ + [TRDY] = REG_FIELD(INT_STATUS_ADDR, 7, 7), +}; + static const struct tsens_ops ops_8960 = { .init = init_8960, .calibrate = calibrate_8960, @@ -282,4 +350,5 @@ static const struct tsens_ops ops_8960 = { struct tsens_plat_data data_8960 = { .num_sensors = 11, .ops = &ops_8960, + .fields = tsens_8960_regfields, }; From 53e2a20e4c41683b695145436b34aa4a14bbcd8c Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 20 Apr 2021 20:33:37 +0200 Subject: [PATCH 0561/1379] thermal/drivers/tsens: Add VER_0 tsens version VER_0 is used to describe device based on tsens version before v0.1. These device are devices based on msm8960 for example apq8064 or ipq806x. Add support for VER_0 in tsens.c and set the right tsens feat in tsens-8960.c file. Signed-off-by: Ansuel Smith Reviewed-by: Thara Gopinath Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420183343.2272-4-ansuelsmth@gmail.com --- drivers/thermal/qcom/tsens-8960.c | 9 ++ drivers/thermal/qcom/tsens.c | 152 ++++++++++++++++++++++++------ drivers/thermal/qcom/tsens.h | 4 +- 3 files changed, 134 insertions(+), 31 deletions(-) diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c index 0d0c2647da86..0dd15e812208 100644 --- a/drivers/thermal/qcom/tsens-8960.c +++ b/drivers/thermal/qcom/tsens-8960.c @@ -347,8 +347,17 @@ static const struct tsens_ops ops_8960 = { .resume = resume_8960, }; +static struct tsens_features tsens_8960_feat = { + .ver_major = VER_0, + .crit_int = 0, + .adc = 1, + .srot_split = 0, + .max_sensors = 11, +}; + struct tsens_plat_data data_8960 = { .num_sensors = 11, .ops = &ops_8960, + .feat = &tsens_8960_feat, .fields = tsens_8960_regfields, }; diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index d6f802100641..83e01cb370f9 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -516,6 +517,15 @@ static irqreturn_t tsens_irq_thread(int irq, void *data) dev_dbg(priv->dev, "[%u] %s: no violation: %d\n", hw_id, __func__, temp); } + + if (tsens_version(priv) < VER_0_1) { + /* Constraint: There is only 1 interrupt control register for all + * 11 temperature sensor. So monitoring more than 1 sensor based + * on interrupts will yield inconsistent result. To overcome this + * issue we will monitor only sensor 0 which is the master sensor. + */ + break; + } } return IRQ_HANDLED; @@ -531,6 +541,13 @@ static int tsens_set_trips(void *_sensor, int low, int high) int high_val, low_val, cl_high, cl_low; u32 hw_id = s->hw_id; + if (tsens_version(priv) < VER_0_1) { + /* Pre v0.1 IP had a single register for each type of interrupt + * and thresholds + */ + hw_id = 0; + } + dev_dbg(dev, "[%u] %s: proposed thresholds: (%d:%d)\n", hw_id, __func__, low, high); @@ -585,18 +602,21 @@ int get_temp_tsens_valid(const struct tsens_sensor *s, int *temp) u32 valid; int ret; - ret = regmap_field_read(priv->rf[valid_idx], &valid); - if (ret) - return ret; - while (!valid) { - /* Valid bit is 0 for 6 AHB clock cycles. - * At 19.2MHz, 1 AHB clock is ~60ns. - * We should enter this loop very, very rarely. - */ - ndelay(400); + /* VER_0 doesn't have VALID bit */ + if (tsens_version(priv) >= VER_0_1) { ret = regmap_field_read(priv->rf[valid_idx], &valid); if (ret) return ret; + while (!valid) { + /* Valid bit is 0 for 6 AHB clock cycles. + * At 19.2MHz, 1 AHB clock is ~60ns. + * We should enter this loop very, very rarely. + */ + ndelay(400); + ret = regmap_field_read(priv->rf[valid_idx], &valid); + if (ret) + return ret; + } } /* Valid bit is set, OK to read the temperature */ @@ -609,15 +629,29 @@ int get_temp_common(const struct tsens_sensor *s, int *temp) { struct tsens_priv *priv = s->priv; int hw_id = s->hw_id; - int last_temp = 0, ret; + int last_temp = 0, ret, trdy; + unsigned long timeout; - ret = regmap_field_read(priv->rf[LAST_TEMP_0 + hw_id], &last_temp); - if (ret) - return ret; + timeout = jiffies + usecs_to_jiffies(TIMEOUT_US); + do { + if (tsens_version(priv) == VER_0) { + ret = regmap_field_read(priv->rf[TRDY], &trdy); + if (ret) + return ret; + if (!trdy) + continue; + } - *temp = code_to_degc(last_temp, s) * 1000; + ret = regmap_field_read(priv->rf[LAST_TEMP_0 + hw_id], &last_temp); + if (ret) + return ret; - return 0; + *temp = code_to_degc(last_temp, s) * 1000; + + return 0; + } while (time_before(jiffies, timeout)); + + return -ETIMEDOUT; } #ifdef CONFIG_DEBUG_FS @@ -739,18 +773,33 @@ int __init init_common(struct tsens_priv *priv) priv->tm_offset = 0x1000; } - res = platform_get_resource(op, IORESOURCE_MEM, 0); - tm_base = devm_ioremap_resource(dev, res); - if (IS_ERR(tm_base)) { - ret = PTR_ERR(tm_base); + if (tsens_version(priv) >= VER_0_1) { + res = platform_get_resource(op, IORESOURCE_MEM, 0); + tm_base = devm_ioremap_resource(dev, res); + if (IS_ERR(tm_base)) { + ret = PTR_ERR(tm_base); + goto err_put_device; + } + + priv->tm_map = devm_regmap_init_mmio(dev, tm_base, &tsens_config); + } else { /* VER_0 share the same gcc regs using a syscon */ + struct device *parent = priv->dev->parent; + + if (parent) + priv->tm_map = syscon_node_to_regmap(parent->of_node); + } + + if (IS_ERR_OR_NULL(priv->tm_map)) { + if (!priv->tm_map) + ret = -ENODEV; + else + ret = PTR_ERR(priv->tm_map); goto err_put_device; } - priv->tm_map = devm_regmap_init_mmio(dev, tm_base, &tsens_config); - if (IS_ERR(priv->tm_map)) { - ret = PTR_ERR(priv->tm_map); - goto err_put_device; - } + /* VER_0 have only tm_map */ + if (!priv->srot_map) + priv->srot_map = priv->tm_map; if (tsens_version(priv) > VER_0_1) { for (i = VER_MAJOR; i <= VER_STEP; i++) { @@ -772,6 +821,10 @@ int __init init_common(struct tsens_priv *priv) ret = PTR_ERR(priv->rf[TSENS_EN]); goto err_put_device; } + /* in VER_0 TSENS need to be explicitly enabled */ + if (tsens_version(priv) == VER_0) + regmap_field_write(priv->rf[TSENS_EN], 1); + ret = regmap_field_read(priv->rf[TSENS_EN], &enabled); if (ret) goto err_put_device; @@ -794,6 +847,19 @@ int __init init_common(struct tsens_priv *priv) goto err_put_device; } + priv->rf[TSENS_SW_RST] = + devm_regmap_field_alloc(dev, priv->srot_map, priv->fields[TSENS_SW_RST]); + if (IS_ERR(priv->rf[TSENS_SW_RST])) { + ret = PTR_ERR(priv->rf[TSENS_SW_RST]); + goto err_put_device; + } + + priv->rf[TRDY] = devm_regmap_field_alloc(dev, priv->tm_map, priv->fields[TRDY]); + if (IS_ERR(priv->rf[TRDY])) { + ret = PTR_ERR(priv->rf[TRDY]); + goto err_put_device; + } + /* This loop might need changes if enum regfield_ids is reordered */ for (j = LAST_TEMP_0; j <= UP_THRESH_15; j += 16) { for (i = 0; i < priv->feat->max_sensors; i++) { @@ -809,7 +875,7 @@ int __init init_common(struct tsens_priv *priv) } } - if (priv->feat->crit_int) { + if (priv->feat->crit_int || tsens_version(priv) < VER_0_1) { /* Loop might need changes if enum regfield_ids is reordered */ for (j = CRITICAL_STATUS_0; j <= CRIT_THRESH_15; j += 16) { for (i = 0; i < priv->feat->max_sensors; i++) { @@ -847,7 +913,11 @@ int __init init_common(struct tsens_priv *priv) } spin_lock_init(&priv->ul_lock); - tsens_enable_irq(priv); + + /* VER_0 interrupt doesn't need to be enabled */ + if (tsens_version(priv) >= VER_0_1) + tsens_enable_irq(priv); + tsens_debug_init(op); err_put_device: @@ -949,10 +1019,19 @@ static int tsens_register_irq(struct tsens_priv *priv, char *irqname, if (irq == -ENXIO) ret = 0; } else { - ret = devm_request_threaded_irq(&pdev->dev, irq, - NULL, thread_fn, - IRQF_ONESHOT, - dev_name(&pdev->dev), priv); + /* VER_0 interrupt is TRIGGER_RISING, VER_0_1 and up is ONESHOT */ + if (tsens_version(priv) == VER_0) + ret = devm_request_threaded_irq(&pdev->dev, irq, + thread_fn, NULL, + IRQF_TRIGGER_RISING, + dev_name(&pdev->dev), + priv); + else + ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, + thread_fn, IRQF_ONESHOT, + dev_name(&pdev->dev), + priv); + if (ret) dev_err(&pdev->dev, "%s: failed to get irq\n", __func__); @@ -981,6 +1060,19 @@ static int tsens_register(struct tsens_priv *priv) priv->ops->enable(priv, i); } + /* VER_0 require to set MIN and MAX THRESH + * These 2 regs are set using the: + * - CRIT_THRESH_0 for MAX THRESH hardcoded to 120°C + * - CRIT_THRESH_1 for MIN THRESH hardcoded to 0°C + */ + if (tsens_version(priv) < VER_0_1) { + regmap_field_write(priv->rf[CRIT_THRESH_0], + tsens_mC_to_hw(priv->sensor, 120000)); + + regmap_field_write(priv->rf[CRIT_THRESH_1], + tsens_mC_to_hw(priv->sensor, 0)); + } + ret = tsens_register_irq(priv, "uplow", tsens_irq_thread); if (ret < 0) return ret; diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index cba64c33b4f9..1471a2c00f15 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -13,6 +13,7 @@ #define CAL_DEGC_PT2 120 #define SLOPE_FACTOR 1000 #define SLOPE_DEFAULT 3200 +#define TIMEOUT_US 100 #define THRESHOLD_MAX_ADC_CODE 0x3ff #define THRESHOLD_MIN_ADC_CODE 0x0 @@ -25,7 +26,8 @@ struct tsens_priv; /* IP version numbers in ascending order */ enum tsens_ver { - VER_0_1 = 0, + VER_0 = 0, + VER_0_1, VER_1_X, VER_2_X, }; From fdda131f8fbadee2dfc21f0787d11547b42a961e Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 20 Apr 2021 20:33:38 +0200 Subject: [PATCH 0562/1379] thermal/drivers/tsens: Use init_common for msm8960 Use init_common and drop custom init for msm8960. Signed-off-by: Ansuel Smith Reviewed-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420183343.2272-5-ansuelsmth@gmail.com --- drivers/thermal/qcom/tsens-8960.c | 52 +------------------------------ 1 file changed, 1 insertion(+), 51 deletions(-) diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c index 0dd15e812208..83746ee3fc88 100644 --- a/drivers/thermal/qcom/tsens-8960.c +++ b/drivers/thermal/qcom/tsens-8960.c @@ -173,56 +173,6 @@ static void disable_8960(struct tsens_priv *priv) regmap_write(priv->tm_map, CNTL_ADDR, reg_cntl); } -static int init_8960(struct tsens_priv *priv) -{ - int ret, i; - u32 reg_cntl; - - priv->tm_map = dev_get_regmap(priv->dev, NULL); - if (!priv->tm_map) - return -ENODEV; - - /* - * The status registers for each sensor are discontiguous - * because some SoCs have 5 sensors while others have more - * but the control registers stay in the same place, i.e - * directly after the first 5 status registers. - */ - for (i = 0; i < priv->num_sensors; i++) { - if (i >= 5) - priv->sensor[i].status = S0_STATUS_OFF + 40; - priv->sensor[i].status += i * 4; - } - - reg_cntl = SW_RST; - ret = regmap_update_bits(priv->tm_map, CNTL_ADDR, SW_RST, reg_cntl); - if (ret) - return ret; - - if (priv->num_sensors > 1) { - reg_cntl |= SLP_CLK_ENA | (MEASURE_PERIOD << 18); - reg_cntl &= ~SW_RST; - ret = regmap_update_bits(priv->tm_map, CONFIG_ADDR, - CONFIG_MASK, CONFIG); - } else { - reg_cntl |= SLP_CLK_ENA_8660 | (MEASURE_PERIOD << 16); - reg_cntl &= ~CONFIG_MASK_8660; - reg_cntl |= CONFIG_8660 << CONFIG_SHIFT_8660; - } - - reg_cntl |= GENMASK(priv->num_sensors - 1, 0) << SENSOR0_SHIFT; - ret = regmap_write(priv->tm_map, CNTL_ADDR, reg_cntl); - if (ret) - return ret; - - reg_cntl |= EN; - ret = regmap_write(priv->tm_map, CNTL_ADDR, reg_cntl); - if (ret) - return ret; - - return 0; -} - static int calibrate_8960(struct tsens_priv *priv) { int i; @@ -338,7 +288,7 @@ static const struct reg_field tsens_8960_regfields[MAX_REGFIELDS] = { }; static const struct tsens_ops ops_8960 = { - .init = init_8960, + .init = init_common, .calibrate = calibrate_8960, .get_temp = get_temp_8960, .enable = enable_8960, From 3d08f029fdbbd29c8b363ef4c8c4bfe3b8f79ad0 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 20 Apr 2021 20:33:39 +0200 Subject: [PATCH 0563/1379] thermal/drivers/tsens: Fix bug in sensor enable for msm8960 Device based on tsens VER_0 contains a hardware bug that results in some problem with sensor enablement. Sensor id 6-11 can't be enabled selectively and all of them must be enabled in one step. Signed-off-by: Ansuel Smith Acked-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420183343.2272-6-ansuelsmth@gmail.com --- drivers/thermal/qcom/tsens-8960.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c index 83746ee3fc88..a96d37c2b674 100644 --- a/drivers/thermal/qcom/tsens-8960.c +++ b/drivers/thermal/qcom/tsens-8960.c @@ -27,9 +27,9 @@ #define EN BIT(0) #define SW_RST BIT(1) #define SENSOR0_EN BIT(3) +#define MEASURE_PERIOD BIT(18) #define SLP_CLK_ENA BIT(26) #define SLP_CLK_ENA_8660 BIT(24) -#define MEASURE_PERIOD 1 #define SENSOR0_SHIFT 3 /* INT_STATUS_ADDR bitmasks */ @@ -126,17 +126,34 @@ static int resume_8960(struct tsens_priv *priv) static int enable_8960(struct tsens_priv *priv, int id) { int ret; - u32 reg, mask; + u32 reg, mask = BIT(id); ret = regmap_read(priv->tm_map, CNTL_ADDR, ®); if (ret) return ret; - mask = BIT(id + SENSOR0_SHIFT); + /* HARDWARE BUG: + * On platforms with more than 6 sensors, all remaining sensors + * must be enabled together, otherwise undefined results are expected. + * (Sensor 6-7 disabled, Sensor 3 disabled...) In the original driver, + * all the sensors are enabled in one step hence this bug is not + * triggered. + */ + if (id > 5) + mask = GENMASK(10, 6); + + mask <<= SENSOR0_SHIFT; + + /* Sensors already enabled. Skip. */ + if ((reg & mask) == mask) + return 0; + ret = regmap_write(priv->tm_map, CNTL_ADDR, reg | SW_RST); if (ret) return ret; + reg |= MEASURE_PERIOD; + if (priv->num_sensors > 1) reg |= mask | SLP_CLK_ENA | EN; else From dfc1193d4dbd6c3cb68c944413146c940bde290a Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 20 Apr 2021 20:33:40 +0200 Subject: [PATCH 0564/1379] thermal/drivers/tsens: Replace custom 8960 apis with generic apis Rework calibrate function to use common function. Derive the offset from a missing hardcoded slope table and the data from the nvmem calib efuses. Drop custom get_temp function and use generic api. Signed-off-by: Ansuel Smith Acked-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420183343.2272-7-ansuelsmth@gmail.com --- drivers/thermal/qcom/tsens-8960.c | 56 +++++++++---------------------- 1 file changed, 15 insertions(+), 41 deletions(-) diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c index a96d37c2b674..58112f0dfe77 100644 --- a/drivers/thermal/qcom/tsens-8960.c +++ b/drivers/thermal/qcom/tsens-8960.c @@ -67,6 +67,13 @@ #define S9_STATUS_OFF 0x3674 #define S10_STATUS_OFF 0x3678 +/* Original slope - 350 to compensate mC to C inaccuracy */ +static u32 tsens_msm8960_slope[] = { + 826, 826, 804, 826, + 761, 782, 782, 849, + 782, 849, 782 + }; + static int suspend_8960(struct tsens_priv *priv) { int ret; @@ -194,9 +201,7 @@ static int calibrate_8960(struct tsens_priv *priv) { int i; char *data; - - ssize_t num_read = priv->num_sensors; - struct tsens_sensor *s = priv->sensor; + u32 p1[11]; data = qfprom_read(priv->dev, "calib"); if (IS_ERR(data)) @@ -204,49 +209,18 @@ static int calibrate_8960(struct tsens_priv *priv) if (IS_ERR(data)) return PTR_ERR(data); - for (i = 0; i < num_read; i++, s++) - s->offset = data[i]; + for (i = 0; i < priv->num_sensors; i++) { + p1[i] = data[i]; + priv->sensor[i].slope = tsens_msm8960_slope[i]; + } + + compute_intercept_slope(priv, p1, NULL, ONE_PT_CALIB); kfree(data); return 0; } -/* Temperature on y axis and ADC-code on x-axis */ -static inline int code_to_mdegC(u32 adc_code, const struct tsens_sensor *s) -{ - int slope, offset; - - slope = thermal_zone_get_slope(s->tzd); - offset = CAL_MDEGC - slope * s->offset; - - return adc_code * slope + offset; -} - -static int get_temp_8960(const struct tsens_sensor *s, int *temp) -{ - int ret; - u32 code, trdy; - struct tsens_priv *priv = s->priv; - unsigned long timeout; - - timeout = jiffies + usecs_to_jiffies(TIMEOUT_US); - do { - ret = regmap_read(priv->tm_map, INT_STATUS_ADDR, &trdy); - if (ret) - return ret; - if (!(trdy & TRDY_MASK)) - continue; - ret = regmap_read(priv->tm_map, s->status, &code); - if (ret) - return ret; - *temp = code_to_mdegC(code, s); - return 0; - } while (time_before(jiffies, timeout)); - - return -ETIMEDOUT; -} - static const struct reg_field tsens_8960_regfields[MAX_REGFIELDS] = { /* ----- SROT ------ */ /* No VERSION information */ @@ -307,7 +281,7 @@ static const struct reg_field tsens_8960_regfields[MAX_REGFIELDS] = { static const struct tsens_ops ops_8960 = { .init = init_common, .calibrate = calibrate_8960, - .get_temp = get_temp_8960, + .get_temp = get_temp_common, .enable = enable_8960, .disable = disable_8960, .suspend = suspend_8960, From 2ebd0982e6ba69d9f9c02a4a0aab705a5526283e Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 20 Apr 2021 20:33:41 +0200 Subject: [PATCH 0565/1379] thermal/drivers/tsens: Drop unused define for msm8960 Drop unused define for msm8960 replaced by generic api and reg_field. Signed-off-by: Ansuel Smith Reviewed-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420183343.2272-8-ansuelsmth@gmail.com --- drivers/thermal/qcom/tsens-8960.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c index 58112f0dfe77..67c1748cdf73 100644 --- a/drivers/thermal/qcom/tsens-8960.c +++ b/drivers/thermal/qcom/tsens-8960.c @@ -10,8 +10,6 @@ #include #include "tsens.h" -#define CAL_MDEGC 30000 - #define CONFIG_ADDR 0x3640 #define CONFIG_ADDR_8660 0x3620 /* CONFIG_ADDR bitmasks */ @@ -21,39 +19,19 @@ #define CONFIG_SHIFT_8660 28 #define CONFIG_MASK_8660 (3 << CONFIG_SHIFT_8660) -#define STATUS_CNTL_ADDR_8064 0x3660 #define CNTL_ADDR 0x3620 /* CNTL_ADDR bitmasks */ #define EN BIT(0) #define SW_RST BIT(1) -#define SENSOR0_EN BIT(3) + #define MEASURE_PERIOD BIT(18) #define SLP_CLK_ENA BIT(26) #define SLP_CLK_ENA_8660 BIT(24) #define SENSOR0_SHIFT 3 -/* INT_STATUS_ADDR bitmasks */ -#define MIN_STATUS_MASK BIT(0) -#define LOWER_STATUS_CLR BIT(1) -#define UPPER_STATUS_CLR BIT(2) -#define MAX_STATUS_MASK BIT(3) - #define THRESHOLD_ADDR 0x3624 -/* THRESHOLD_ADDR bitmasks */ -#define THRESHOLD_MAX_LIMIT_SHIFT 24 -#define THRESHOLD_MIN_LIMIT_SHIFT 16 -#define THRESHOLD_UPPER_LIMIT_SHIFT 8 -#define THRESHOLD_LOWER_LIMIT_SHIFT 0 - -/* Initial temperature threshold values */ -#define LOWER_LIMIT_TH 0x50 -#define UPPER_LIMIT_TH 0xdf -#define MIN_LIMIT_TH 0x0 -#define MAX_LIMIT_TH 0xff #define INT_STATUS_ADDR 0x363c -#define TRDY_MASK BIT(7) -#define TIMEOUT_US 100 #define S0_STATUS_OFF 0x3628 #define S1_STATUS_OFF 0x362c From 6b3aeafbc12c18036809108e301efe8056249233 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 20 Apr 2021 20:33:42 +0200 Subject: [PATCH 0566/1379] thermal/drivers/tsens: Add support for ipq8064-tsens Add support for tsens present in ipq806x SoCs based on generic msm8960 tsens driver. Signed-off-by: Ansuel Smith Reviewed-by: Thara Gopinath Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420183343.2272-9-ansuelsmth@gmail.com --- drivers/thermal/qcom/tsens.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 83e01cb370f9..4c7ebd1d3f9c 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -968,6 +968,9 @@ static SIMPLE_DEV_PM_OPS(tsens_pm_ops, tsens_suspend, tsens_resume); static const struct of_device_id tsens_table[] = { { + .compatible = "qcom,ipq8064-tsens", + .data = &data_8960, + }, { .compatible = "qcom,mdm9607-tsens", .data = &data_9607, }, { From 26b2f03d2adf43d0dc9aeeb3fff54dcc9fcdb1f4 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Tue, 20 Apr 2021 20:33:43 +0200 Subject: [PATCH 0567/1379] dt-bindings: thermal: tsens: Document ipq8064 bindings Document the use of bindings used for msm8960 tsens based devices. msm8960 use the same gcc regs and is set as a child of the qcom gcc. Signed-off-by: Ansuel Smith Reviewed-by: Rob Herring Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210420183343.2272-10-ansuelsmth@gmail.com --- .../bindings/thermal/qcom-tsens.yaml | 56 ++++++++++++++++--- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml index fbd03cd3eb9b..0242fd91b622 100644 --- a/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml +++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.yaml @@ -19,6 +19,11 @@ description: | properties: compatible: oneOf: + - description: msm9860 TSENS based + items: + - enum: + - qcom,ipq8064-tsens + - description: v0.1 of TSENS items: - enum: @@ -75,7 +80,9 @@ properties: maxItems: 2 items: - const: calib - - const: calib_sel + - enum: + - calib_backup + - calib_sel "#qcom,sensors": description: @@ -90,12 +97,20 @@ properties: Number of cells required to uniquely identify the thermal sensors. Since we have multiple sensors this is set to 1 +required: + - compatible + - interrupts + - interrupt-names + - "#thermal-sensor-cells" + - "#qcom,sensors" + allOf: - if: properties: compatible: contains: enum: + - qcom,ipq8064-tsens - qcom,mdm9607-tsens - qcom,msm8916-tsens - qcom,msm8974-tsens @@ -117,17 +132,42 @@ allOf: interrupt-names: minItems: 2 -required: - - compatible - - reg - - "#qcom,sensors" - - interrupts - - interrupt-names - - "#thermal-sensor-cells" + - if: + properties: + compatible: + contains: + enum: + - qcom,tsens-v0_1 + - qcom,tsens-v1 + - qcom,tsens-v2 + + then: + required: + - reg additionalProperties: false examples: + - | + #include + // Example msm9860 based SoC (ipq8064): + gcc: clock-controller { + + /* ... */ + + tsens: thermal-sensor { + compatible = "qcom,ipq8064-tsens"; + + nvmem-cells = <&tsens_calib>, <&tsens_calib_backup>; + nvmem-cell-names = "calib", "calib_backup"; + interrupts = ; + interrupt-names = "uplow"; + + #qcom,sensors = <11>; + #thermal-sensor-cells = <1>; + }; + }; + - | #include // Example 1 (legacy: for pre v1 IP): From b70dbf40eb075c596d86c42d93b86ff502290fc5 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 22 Apr 2021 12:43:06 +0100 Subject: [PATCH 0568/1379] thermal/core: Create a helper __thermal_cdev_update() without a lock There is a need to have a helper function which updates cooling device state from the governors code. With this change governor can use lock and unlock while calling helper function. This avoid unnecessary second time lock/unlock which was in previous solution present in governor implementation. This new helper function must be called with mutex 'cdev->lock' hold. The changed been discussed and part of code presented in thread: https://lore.kernel.org/linux-pm/20210419084536.25000-1-lukasz.luba@arm.com/ Co-developed-by: Daniel Lezcano Signed-off-by: Daniel Lezcano Signed-off-by: Lukasz Luba Link: https://lore.kernel.org/r/20210422114308.29684-2-lukasz.luba@arm.com --- drivers/thermal/thermal_core.h | 1 + drivers/thermal/thermal_helpers.c | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 86b8cef7310e..726e327b4205 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -66,6 +66,7 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) } void thermal_cdev_update(struct thermal_cooling_device *); +void __thermal_cdev_update(struct thermal_cooling_device *cdev); /** * struct thermal_trip - representation of a point in temperature domain diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c index 7f50f412e02a..3edd047e144f 100644 --- a/drivers/thermal/thermal_helpers.c +++ b/drivers/thermal/thermal_helpers.c @@ -192,18 +192,11 @@ static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev, thermal_cooling_device_stats_update(cdev, target); } -void thermal_cdev_update(struct thermal_cooling_device *cdev) +void __thermal_cdev_update(struct thermal_cooling_device *cdev) { struct thermal_instance *instance; unsigned long target = 0; - mutex_lock(&cdev->lock); - /* cooling device is updated*/ - if (cdev->updated) { - mutex_unlock(&cdev->lock); - return; - } - /* Make sure cdev enters the deepest cooling state */ list_for_each_entry(instance, &cdev->thermal_instances, cdev_node) { dev_dbg(&cdev->device, "zone%d->target=%lu\n", @@ -216,11 +209,25 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev) thermal_cdev_set_cur_state(cdev, target); - cdev->updated = true; - mutex_unlock(&cdev->lock); trace_cdev_update(cdev, target); dev_dbg(&cdev->device, "set to state %lu\n", target); } + +/** + * thermal_cdev_update - update cooling device state if needed + * @cdev: pointer to struct thermal_cooling_device + * + * Update the cooling device state if there is a need. + */ +void thermal_cdev_update(struct thermal_cooling_device *cdev) +{ + mutex_lock(&cdev->lock); + if (!cdev->updated) { + __thermal_cdev_update(cdev); + cdev->updated = true; + } + mutex_unlock(&cdev->lock); +} EXPORT_SYMBOL(thermal_cdev_update); /** From cb579086536f6564f5846f89808ec394ef8b8621 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 22 Apr 2021 12:14:37 +0300 Subject: [PATCH 0569/1379] SUNRPC: fix ternary sign expansion bug in tracing This code is supposed to pass negative "err" values for tracing but it passes positive values instead. The problem is that the trace_svcsock_tcp_send() function takes a long but "err" is an int and "sent" is a u32. The negative is first type promoted to u32 so it becomes a high positive then it is promoted to long and it stays positive. Fix this by casting "err" directly to long. Fixes: 998024dee197 ("SUNRPC: Add more svcsock tracepoints") Signed-off-by: Dan Carpenter Signed-off-by: Chuck Lever --- net/sunrpc/svcsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 9eb5b6b89077..478f857cdaed 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1174,7 +1174,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) tcp_sock_set_cork(svsk->sk_sk, true); err = svc_tcp_sendmsg(svsk->sk_sock, xdr, marker, &sent); xdr_free_bvec(xdr); - trace_svcsock_tcp_send(xprt, err < 0 ? err : sent); + trace_svcsock_tcp_send(xprt, err < 0 ? (long)err : sent); if (err < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; if (atomic_dec_and_test(&svsk->sk_sendqlen)) From d3b60ed9b135e2c652115db691a87dc28b324bea Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 22 Apr 2021 12:43:07 +0100 Subject: [PATCH 0570/1379] thermal/core/power_allocator: Maintain the device statistics from going stale When the temperature is below the first activation trip point the cooling devices are not checked, so they cannot maintain fresh statistics. It leads into the situation, when temperature crosses first trip point, the statistics are stale and show state for very long period. This has impact on IPA algorithm calculation and wrong decisions. Thus, check the cooling devices even when the temperature is low, to refresh these statistics. Signed-off-by: Lukasz Luba Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210422114308.29684-3-lukasz.luba@arm.com --- drivers/thermal/gov_power_allocator.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 2802a0e13c88..d393409fb786 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -575,15 +575,25 @@ static void allow_maximum_power(struct thermal_zone_device *tz) { struct thermal_instance *instance; struct power_allocator_params *params = tz->governor_data; + u32 req_power; mutex_lock(&tz->lock); list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + struct thermal_cooling_device *cdev = instance->cdev; + if ((instance->trip != params->trip_max_desired_temperature) || (!cdev_is_power_actor(instance->cdev))) continue; instance->target = 0; mutex_lock(&instance->cdev->lock); + /* + * Call for updating the cooling devices local stats and avoid + * periods of dozen of seconds when those have not been + * maintained. + */ + cdev->ops->get_requested_power(cdev, &req_power); + instance->cdev->updated = false; mutex_unlock(&instance->cdev->lock); thermal_cdev_update(instance->cdev); From 0952177f2a1f63ba87a1940fac21768f402c0b94 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 22 Apr 2021 12:43:08 +0100 Subject: [PATCH 0571/1379] thermal/core/power_allocator: Update once cooling devices when temp is low The cooling device state change generates an event, also when there is no need, because temperature is low and device is not throttled. Avoid to unnecessary update the cooling device which means also not sending event. The cooling device state has not changed because the temperature is still below the first activation trip point value, so we can do this. Add a tracking mechanism to make sure it updates cooling devices only once - when the temperature dropps below first trip point. Reported-by: Daniel Lezcano Signed-off-by: Lukasz Luba Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210422114308.29684-4-lukasz.luba@arm.com --- drivers/thermal/gov_power_allocator.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index d393409fb786..a6cdb2e892da 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -571,7 +571,7 @@ static void reset_pid_controller(struct power_allocator_params *params) params->prev_err = 0; } -static void allow_maximum_power(struct thermal_zone_device *tz) +static void allow_maximum_power(struct thermal_zone_device *tz, bool update) { struct thermal_instance *instance; struct power_allocator_params *params = tz->governor_data; @@ -594,9 +594,10 @@ static void allow_maximum_power(struct thermal_zone_device *tz) */ cdev->ops->get_requested_power(cdev, &req_power); - instance->cdev->updated = false; + if (update) + __thermal_cdev_update(instance->cdev); + mutex_unlock(&instance->cdev->lock); - thermal_cdev_update(instance->cdev); } mutex_unlock(&tz->lock); } @@ -710,6 +711,7 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip) int ret; int switch_on_temp, control_temp; struct power_allocator_params *params = tz->governor_data; + bool update; /* * We get called for every trip point but we only need to do @@ -721,9 +723,10 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip) ret = tz->ops->get_trip_temp(tz, params->trip_switch_on, &switch_on_temp); if (!ret && (tz->temperature < switch_on_temp)) { + update = (tz->last_temperature >= switch_on_temp); tz->passive = 0; reset_pid_controller(params); - allow_maximum_power(tz); + allow_maximum_power(tz, update); return 0; } From fef05776eb02238dcad8d5514e666a42572c3f32 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 22 Apr 2021 16:36:22 +0100 Subject: [PATCH 0572/1379] thermal/core/fair share: Lock the thermal zone while looping over instances The tz->lock must be hold during the looping over the instances in that thermal zone. This lock was missing in the governor code since the beginning, so it's hard to point into a particular commit. CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Lukasz Luba Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210422153624.6074-2-lukasz.luba@arm.com --- drivers/thermal/gov_fair_share.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c index aaa07180ab48..645432ce6365 100644 --- a/drivers/thermal/gov_fair_share.c +++ b/drivers/thermal/gov_fair_share.c @@ -82,6 +82,8 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip) int total_instance = 0; int cur_trip_level = get_trip_level(tz); + mutex_lock(&tz->lock); + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { if (instance->trip != trip) continue; @@ -110,6 +112,8 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip) mutex_unlock(&instance->cdev->lock); thermal_cdev_update(cdev); } + + mutex_unlock(&tz->lock); return 0; } From 1a93369810660905f5e89b527cd709fa6832f7c8 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 22 Apr 2021 16:36:23 +0100 Subject: [PATCH 0573/1379] thermal/core/fair share: Use the lockless __thermal_cdev_update() function Use the new helper function and avoid unnecessery second lock/unlock, which was present in old approach with thermal_cdev_update(). Signed-off-by: Lukasz Luba Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210422153624.6074-3-lukasz.luba@arm.com --- drivers/thermal/gov_fair_share.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c index 645432ce6365..1e5abf4822be 100644 --- a/drivers/thermal/gov_fair_share.c +++ b/drivers/thermal/gov_fair_share.c @@ -107,10 +107,9 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip) instance->target = get_target_state(tz, cdev, percentage, cur_trip_level); - mutex_lock(&instance->cdev->lock); - instance->cdev->updated = false; - mutex_unlock(&instance->cdev->lock); - thermal_cdev_update(cdev); + mutex_lock(&cdev->lock); + __thermal_cdev_update(cdev); + mutex_unlock(&cdev->lock); } mutex_unlock(&tz->lock); From ab39c8853737158604e154ad3b03639e74082bd6 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 22 Apr 2021 16:36:24 +0100 Subject: [PATCH 0574/1379] thermal/core/power allocator: Use the lockless __thermal_cdev_update() function Use the new helper function and avoid unnecessery second lock/unlock, which was present in old approach with thermal_cdev_update(). Signed-off-by: Lukasz Luba Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210422153624.6074-4-lukasz.luba@arm.com --- drivers/thermal/gov_power_allocator.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index a6cdb2e892da..13e375751d22 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -301,9 +301,8 @@ power_actor_set_power(struct thermal_cooling_device *cdev, instance->target = clamp_val(state, instance->lower, instance->upper); mutex_lock(&cdev->lock); - cdev->updated = false; + __thermal_cdev_update(cdev); mutex_unlock(&cdev->lock); - thermal_cdev_update(cdev); return 0; } From c310e546164d5cca4c12faf9582b75989b030b68 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 22 Apr 2021 13:04:12 +0100 Subject: [PATCH 0575/1379] thermal/drivers/mtk_thermal: Remove redundant initializations of several variables Several variables are being initialized with values that is never read and being updated later with a new value. The initializations are redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20210422120412.246291-1-colin.king@canonical.com --- drivers/thermal/mtk_thermal.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/thermal/mtk_thermal.c b/drivers/thermal/mtk_thermal.c index 149c6d7fd5a0..97e8678ccf0e 100644 --- a/drivers/thermal/mtk_thermal.c +++ b/drivers/thermal/mtk_thermal.c @@ -573,12 +573,12 @@ static int raw_to_mcelsius_v1(struct mtk_thermal *mt, int sensno, s32 raw) static int raw_to_mcelsius_v2(struct mtk_thermal *mt, int sensno, s32 raw) { - s32 format_1 = 0; - s32 format_2 = 0; - s32 g_oe = 1; - s32 g_gain = 1; - s32 g_x_roomt = 0; - s32 tmp = 0; + s32 format_1; + s32 format_2; + s32 g_oe; + s32 g_gain; + s32 g_x_roomt; + s32 tmp; if (raw == 0) return 0; From 6fc277c7c935c7e1fdee23e82da988d9d3cb6bef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Apr 2021 13:48:27 -0700 Subject: [PATCH 0576/1379] xfs: rename xfs_ictimestamp_t Rename xfs_ictimestamp_t to xfs_log_timestamp_t as it is a type used for logging timestamps with no relationship to the in-core inode. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_log_format.h | 10 +++++----- fs/xfs/xfs_inode_item.c | 4 ++-- fs/xfs/xfs_inode_item_recover.c | 2 +- fs/xfs/xfs_ondisk.h | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 8bd00da6d2a4..5900772d678a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -368,7 +368,7 @@ static inline int xfs_ilog_fdata(int w) * directly mirrors the xfs_dinode structure as it must contain all the same * information. */ -typedef uint64_t xfs_ictimestamp_t; +typedef uint64_t xfs_log_timestamp_t; /* Legacy timestamp encoding format. */ struct xfs_legacy_ictimestamp { @@ -393,9 +393,9 @@ struct xfs_log_dinode { uint16_t di_projid_hi; /* higher part of owner's project id */ uint8_t di_pad[6]; /* unused, zeroed space */ uint16_t di_flushiter; /* incremented on flush */ - xfs_ictimestamp_t di_atime; /* time last accessed */ - xfs_ictimestamp_t di_mtime; /* time last modified */ - xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_log_timestamp_t di_atime; /* time last accessed */ + xfs_log_timestamp_t di_mtime; /* time last modified */ + xfs_log_timestamp_t di_ctime; /* time created/inode modified */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ @@ -420,7 +420,7 @@ struct xfs_log_dinode { uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ - xfs_ictimestamp_t di_crtime; /* time created */ + xfs_log_timestamp_t di_crtime; /* time created */ xfs_ino_t di_ino; /* inode number */ uuid_t di_uuid; /* UUID of the filesystem */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c1b32680f71c..6cc4ca15209c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -299,13 +299,13 @@ xfs_inode_item_format_attr_fork( * Convert an incore timestamp to a log timestamp. Note that the log format * specifies host endian format! */ -static inline xfs_ictimestamp_t +static inline xfs_log_timestamp_t xfs_inode_to_log_dinode_ts( struct xfs_inode *ip, const struct timespec64 tv) { struct xfs_legacy_ictimestamp *lits; - xfs_ictimestamp_t its; + xfs_log_timestamp_t its; if (xfs_inode_has_bigtime(ip)) return xfs_inode_encode_bigtime(tv); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index cb44f7653f03..9b877de2ce5e 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -125,7 +125,7 @@ static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) static inline xfs_timestamp_t xfs_log_dinode_to_disk_ts( struct xfs_log_dinode *from, - const xfs_ictimestamp_t its) + const xfs_log_timestamp_t its) { struct xfs_legacy_timestamp *lts; struct xfs_legacy_ictimestamp *lits; diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 0aa87c210104..66b541b7bb64 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -126,7 +126,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_extent_64, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); - XFS_CHECK_STRUCT_SIZE(xfs_ictimestamp_t, 8); + XFS_CHECK_STRUCT_SIZE(xfs_log_timestamp_t, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); From 732de7dbdbd30df40a6d260a8da6fc5262039439 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Apr 2021 13:48:27 -0700 Subject: [PATCH 0577/1379] xfs: rename struct xfs_legacy_ictimestamp Rename struct xfs_legacy_ictimestamp to struct xfs_log_legacy_timestamp as it is a type used for logging timestamps with no relationship to the in-core inode. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_log_format.h | 2 +- fs/xfs/xfs_inode_item.c | 4 ++-- fs/xfs/xfs_inode_item_recover.c | 4 ++-- fs/xfs/xfs_ondisk.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 5900772d678a..3e15ea29fb8d 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -371,7 +371,7 @@ static inline int xfs_ilog_fdata(int w) typedef uint64_t xfs_log_timestamp_t; /* Legacy timestamp encoding format. */ -struct xfs_legacy_ictimestamp { +struct xfs_log_legacy_timestamp { int32_t t_sec; /* timestamp seconds */ int32_t t_nsec; /* timestamp nanoseconds */ }; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 6cc4ca15209c..6764d12342da 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -304,13 +304,13 @@ xfs_inode_to_log_dinode_ts( struct xfs_inode *ip, const struct timespec64 tv) { - struct xfs_legacy_ictimestamp *lits; + struct xfs_log_legacy_timestamp *lits; xfs_log_timestamp_t its; if (xfs_inode_has_bigtime(ip)) return xfs_inode_encode_bigtime(tv); - lits = (struct xfs_legacy_ictimestamp *)&its; + lits = (struct xfs_log_legacy_timestamp *)&its; lits->t_sec = tv.tv_sec; lits->t_nsec = tv.tv_nsec; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 9b877de2ce5e..7b79518b6c20 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -128,14 +128,14 @@ xfs_log_dinode_to_disk_ts( const xfs_log_timestamp_t its) { struct xfs_legacy_timestamp *lts; - struct xfs_legacy_ictimestamp *lits; + struct xfs_log_legacy_timestamp *lits; xfs_timestamp_t ts; if (xfs_log_dinode_has_bigtime(from)) return cpu_to_be64(its); lts = (struct xfs_legacy_timestamp *)&ts; - lits = (struct xfs_legacy_ictimestamp *)&its; + lits = (struct xfs_log_legacy_timestamp *)&its; lts->t_sec = cpu_to_be32(lits->t_sec); lts->t_nsec = cpu_to_be32(lits->t_nsec); diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 66b541b7bb64..25991923c1a8 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -127,7 +127,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_log_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_icreate_log, 28); XFS_CHECK_STRUCT_SIZE(xfs_log_timestamp_t, 8); - XFS_CHECK_STRUCT_SIZE(struct xfs_legacy_ictimestamp, 8); + XFS_CHECK_STRUCT_SIZE(struct xfs_log_legacy_timestamp, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format_32, 52); XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); From 09accc3a05f7f1a6486f4a278d209ac122289c0b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 17 Apr 2021 10:10:09 +0900 Subject: [PATCH 0578/1379] riscv: Disable data start offset in flat binaries uclibc/gcc combined with elf2flt riscv linker file fully resolve the PC relative __global_pointer$ value at compile time and do not generate a relocation entry to set a correct value of the gp register at runtime. As a result, if the flatbin loader offsets the start of the data section, the relative position change between the text and data sections compared to the compile time positions results in an incorrect gp value being used. This causes flatbin executables to crash. Avoid this problem by enabling CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET automatically when CONFIG_RISCV is enabled and CONFIG_MMU is disabled. Signed-off-by: Damien Le Moal Acked-by: Palmer Dabbelt Signed-off-by: Greg Ungerer --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4515a10c5d22..add528eb9235 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -33,6 +33,7 @@ config RISCV select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if 64BIT + select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select CLONE_BACKWARDS select CLINT_TIMER if !MMU select COMMON_CLK From 6b3788e5fb8041211ac2fa7c818ca9010e976a74 Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Thu, 8 Apr 2021 23:37:40 +0200 Subject: [PATCH 0579/1379] m68k: coldfire: fix irq ranges Working on flexcan0, there was no way to have irq 128 working. Fix irq 128 and 196 setup. Signed-off-by: Angelo Dureghello Reviewed-by: Geert Uytterhoeven Signed-off-by: Greg Ungerer --- arch/m68k/coldfire/intc-simr.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/m68k/coldfire/intc-simr.c b/arch/m68k/coldfire/intc-simr.c index 15c4b7a6e38f..f7c2c41b3156 100644 --- a/arch/m68k/coldfire/intc-simr.c +++ b/arch/m68k/coldfire/intc-simr.c @@ -68,9 +68,9 @@ static void intc_irq_mask(struct irq_data *d) { unsigned int irq = d->irq - MCFINT_VECBASE; - if (MCFINTC2_SIMR && (irq > 128)) + if (MCFINTC2_SIMR && (irq > 127)) __raw_writeb(irq - 128, MCFINTC2_SIMR); - else if (MCFINTC1_SIMR && (irq > 64)) + else if (MCFINTC1_SIMR && (irq > 63)) __raw_writeb(irq - 64, MCFINTC1_SIMR); else __raw_writeb(irq, MCFINTC0_SIMR); @@ -80,9 +80,9 @@ static void intc_irq_unmask(struct irq_data *d) { unsigned int irq = d->irq - MCFINT_VECBASE; - if (MCFINTC2_CIMR && (irq > 128)) + if (MCFINTC2_CIMR && (irq > 127)) __raw_writeb(irq - 128, MCFINTC2_CIMR); - else if (MCFINTC1_CIMR && (irq > 64)) + else if (MCFINTC1_CIMR && (irq > 63)) __raw_writeb(irq - 64, MCFINTC1_CIMR); else __raw_writeb(irq, MCFINTC0_CIMR); @@ -115,9 +115,9 @@ static unsigned int intc_irq_startup(struct irq_data *d) } irq -= MCFINT_VECBASE; - if (MCFINTC2_ICR0 && (irq > 128)) + if (MCFINTC2_ICR0 && (irq > 127)) __raw_writeb(5, MCFINTC2_ICR0 + irq - 128); - else if (MCFINTC1_ICR0 && (irq > 64)) + else if (MCFINTC1_ICR0 && (irq > 63)) __raw_writeb(5, MCFINTC1_ICR0 + irq - 64); else __raw_writeb(5, MCFINTC0_ICR0 + irq); From ef94340583eec5cb1544dc41a87baa4f684b3fe1 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 20 Apr 2021 10:44:25 -0700 Subject: [PATCH 0580/1379] arm64: vdso32: drop -no-integrated-as flag Clang can assemble these files just fine; this is a relic from the top level Makefile conditionally adding this. We no longer need --prefix, --gcc-toolchain, or -Qunused-arguments flags either with this change, so remove those too. To test building: $ ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \ CROSS_COMPILE_COMPAT=arm-linux-gnueabi- make LLVM=1 LLVM_IAS=1 \ defconfig arch/arm64/kernel/vdso32/ Suggested-by: Nathan Chancellor Signed-off-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Reviewed-by: Vincenzo Frascino Tested-by: Stephen Boyd Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210420174427.230228-1-ndesaulniers@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso32/Makefile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 789ad420f16b..3dba0c4f8f42 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -10,15 +10,7 @@ include $(srctree)/lib/vdso/Makefile # Same as cc-*option, but using CC_COMPAT instead of CC ifeq ($(CONFIG_CC_IS_CLANG), y) -COMPAT_GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE_COMPAT)elfedit)) -COMPAT_GCC_TOOLCHAIN := $(realpath $(COMPAT_GCC_TOOLCHAIN_DIR)/..) - CC_COMPAT_CLANG_FLAGS := --target=$(notdir $(CROSS_COMPILE_COMPAT:%-=%)) -CC_COMPAT_CLANG_FLAGS += --prefix=$(COMPAT_GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE_COMPAT)) -CC_COMPAT_CLANG_FLAGS += -no-integrated-as -Qunused-arguments -ifneq ($(COMPAT_GCC_TOOLCHAIN),) -CC_COMPAT_CLANG_FLAGS += --gcc-toolchain=$(COMPAT_GCC_TOOLCHAIN) -endif CC_COMPAT ?= $(CC) CC_COMPAT += $(CC_COMPAT_CLANG_FLAGS) From 782276b4d0ad2fdd7096f8177bb7a9827f5258e4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 20 Apr 2021 10:35:59 +0100 Subject: [PATCH 0581/1379] arm64: Force SPARSEMEM_VMEMMAP as the only memory management model Currently arm64 allows a choice of FLATMEM, SPARSEMEM and SPARSEMEM_VMEMMAP. However, only the latter is tested regularly. FLATMEM does not seem to boot in certain configurations (guest under KVM with Qemu as a VMM). Since the reduction of the SECTION_SIZE_BITS to 27 (4K pages) or 29 (64K page), there's little argument against the memory wasted by the mem_map array with SPARSEMEM. Make SPARSEMEM_VMEMMAP the only available option, non-selectable, and remove the corresponding #ifdefs under arch/arm64/. Signed-off-by: Catalin Marinas Cc: Will Deacon Acked-by: Will Deacon Acked-by: Ard Biesheuvel Acked-by: Marc Zyngier Reviewed-by: Anshuman Khandual Acked-by: Mike Rapoport Link: https://lore.kernel.org/r/20210420093559.23168-1-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 10 +--------- arch/arm64/include/asm/kernel-pgtable.h | 2 +- arch/arm64/include/asm/memory.h | 4 ++-- arch/arm64/include/asm/sparsemem.h | 3 --- arch/arm64/mm/init.c | 8 ++------ arch/arm64/mm/mmu.c | 2 -- arch/arm64/mm/ptdump.c | 2 -- 7 files changed, 6 insertions(+), 25 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9b4d629f7628..01c294035928 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1040,15 +1040,7 @@ source "kernel/Kconfig.hz" config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_VMEMMAP_ENABLE - -config ARCH_SPARSEMEM_DEFAULT - def_bool ARCH_SPARSEMEM_ENABLE - -config ARCH_SELECT_MEMORY_MODEL - def_bool ARCH_SPARSEMEM_ENABLE - -config ARCH_FLATMEM_ENABLE - def_bool !NUMA + select SPARSEMEM_VMEMMAP config HW_PERF_EVENTS def_bool y diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 587c504a4c8b..d44df9d62fc9 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -136,7 +136,7 @@ * has a direct correspondence, and needs to appear sufficiently aligned * in the virtual address space. */ -#if defined(CONFIG_SPARSEMEM_VMEMMAP) && ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS +#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS #define ARM64_MEMSTART_ALIGN (1UL << SECTION_SIZE_BITS) #else #define ARM64_MEMSTART_ALIGN (1UL << ARM64_MEMSTART_SHIFT) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index b943879c1c24..15018dc59554 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -329,7 +329,7 @@ static inline void *phys_to_virt(phys_addr_t x) */ #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) -#if !defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_DEBUG_VIRTUAL) +#if defined(CONFIG_DEBUG_VIRTUAL) #define page_to_virt(x) ({ \ __typeof__(x) __page = x; \ void *__addr = __va(page_to_phys(__page)); \ @@ -349,7 +349,7 @@ static inline void *phys_to_virt(phys_addr_t x) u64 __addr = VMEMMAP_START + (__idx * sizeof(struct page)); \ (struct page *)__addr; \ }) -#endif /* !CONFIG_SPARSEMEM_VMEMMAP || CONFIG_DEBUG_VIRTUAL */ +#endif /* CONFIG_DEBUG_VIRTUAL */ #define virt_addr_valid(addr) ({ \ __typeof__(addr) __addr = __tag_reset(addr); \ diff --git a/arch/arm64/include/asm/sparsemem.h b/arch/arm64/include/asm/sparsemem.h index eb4a75d720ed..4b73463423c3 100644 --- a/arch/arm64/include/asm/sparsemem.h +++ b/arch/arm64/include/asm/sparsemem.h @@ -5,7 +5,6 @@ #ifndef __ASM_SPARSEMEM_H #define __ASM_SPARSEMEM_H -#ifdef CONFIG_SPARSEMEM #define MAX_PHYSMEM_BITS CONFIG_ARM64_PA_BITS /* @@ -27,6 +26,4 @@ #define SECTION_SIZE_BITS 27 #endif /* CONFIG_ARM64_64K_PAGES */ -#endif /* CONFIG_SPARSEMEM*/ - #endif diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 3685e12aba9b..a205538aa1d5 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -220,6 +220,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) int pfn_valid(unsigned long pfn) { phys_addr_t addr = PFN_PHYS(pfn); + struct mem_section *ms; /* * Ensure the upper PAGE_SHIFT bits are clear in the @@ -230,10 +231,6 @@ int pfn_valid(unsigned long pfn) if (PHYS_PFN(addr) != pfn) return 0; -#ifdef CONFIG_SPARSEMEM -{ - struct mem_section *ms; - if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; @@ -252,8 +249,7 @@ int pfn_valid(unsigned long pfn) */ if (!early_section(ms)) return pfn_section_valid(ms, pfn); -} -#endif + return memblock_is_map_memory(addr); } EXPORT_SYMBOL(pfn_valid); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index fac957ff5187..af0ebcad3e1f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1113,7 +1113,6 @@ static void free_empty_tables(unsigned long addr, unsigned long end, } #endif -#ifdef CONFIG_SPARSEMEM_VMEMMAP #if !ARM64_SWAPPER_USES_SECTION_MAPS int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) @@ -1177,7 +1176,6 @@ void vmemmap_free(unsigned long start, unsigned long end, free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END); #endif } -#endif /* CONFIG_SPARSEMEM_VMEMMAP */ static inline pud_t *fixmap_pud(unsigned long addr) { diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index a50e92ea1878..a1937dfff31c 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -51,10 +51,8 @@ static struct addr_marker address_markers[] = { { FIXADDR_TOP, "Fixmap end" }, { PCI_IO_START, "PCI I/O start" }, { PCI_IO_END, "PCI I/O end" }, -#ifdef CONFIG_SPARSEMEM_VMEMMAP { VMEMMAP_START, "vmemmap start" }, { VMEMMAP_START + VMEMMAP_SIZE, "vmemmap end" }, -#endif { -1, NULL }, }; From 8d144746ecc5fe5d64f3f1599db2192bd5c795ff Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 16 Apr 2021 17:30:32 +0100 Subject: [PATCH 0582/1379] arm64: alternative: simplify passing alt_region In __apply_alternatives() we take a pointer to void which we later assign to a pointer to struct alt_region. As all callers are passing a pointer to struct alt_region to begin with, it's simpler and more robust to take a pointer to struct alt region, so let's do so and avoid the need for a temporary variable. Signed-off-by: Mark Rutland Cc: Will Deacon Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210416163032.10857-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/alternative.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 1184c44ea2c7..f6fd16185040 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -133,11 +133,10 @@ static void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } -static void __apply_alternatives(void *alt_region, bool is_module, +static void __apply_alternatives(struct alt_region *region, bool is_module, unsigned long *feature_mask) { struct alt_instr *alt; - struct alt_region *region = alt_region; __le32 *origptr, *updptr; alternative_cb_t alt_cb; From 4139cf940d523ed30d4a362306b93115a2c9354c Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 18 Apr 2021 21:52:31 +0800 Subject: [PATCH 0583/1379] arm64: remove HAVE_DEBUG_BUGVERBOSE After commit 9fb7410f955f ("arm64/BUG: Use BRK instruction for generic BUG traps"), arm64 has switched to generic BUG implementation, so there's no need to select HAVE_DEBUG_BUGVERBOSE. Signed-off-by: Jisheng Zhang Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210418215231.563d4b72@xhacker Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 01c294035928..f3d45c4867b6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -160,7 +160,6 @@ config ARM64 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING - select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE From b9f83ffaa0c096b4c832a43964fe6bff3acffe10 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Fri, 23 Apr 2021 17:42:58 +0800 Subject: [PATCH 0584/1379] SUNRPC: Fix null pointer dereference in svc_rqst_free() When alloc_pages_node() returns null in svc_rqst_alloc(), the null rq_scratch_page pointer will be dereferenced when calling put_page() in svc_rqst_free(). Fix it by adding a null check. Addresses-Coverity: ("Dereference after null check") Fixes: 5191955d6fc6 ("SUNRPC: Prepare for xdr_stream-style decoding on the server-side") Signed-off-by: Yunjian Wang Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d76dc9d95d16..0de918cb3d90 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -846,7 +846,8 @@ void svc_rqst_free(struct svc_rqst *rqstp) { svc_release_buffer(rqstp); - put_page(rqstp->rq_scratch_page); + if (rqstp->rq_scratch_page) + put_page(rqstp->rq_scratch_page); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); From 9af1fba33b5751d71c0e6727a875b9fd7d8a99de Mon Sep 17 00:00:00 2001 From: Clemens Gruber Date: Thu, 15 Apr 2021 14:14:48 +0200 Subject: [PATCH 0585/1379] pwm: pca9685: Switch to atomic API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The switch to the atomic API goes hand in hand with a few fixes to previously experienced issues: - The duty cycle is no longer lost after disable/enable (previously the OFF registers were cleared in disable and the user was required to call config to restore the duty cycle settings) - If one sets a period resulting in the same prescale register value, the sleep and write to the register is now skipped - Previously, only the full ON bit was toggled in GPIO mode (and full OFF cleared if set to high), which could result in both full OFF and full ON not being set and on=0, off=0, which is not allowed according to the datasheet - The OFF registers were reset to 0 in probe, which could lead to the forbidden on=0, off=0. Fixed by resetting to POR default (full OFF) Signed-off-by: Clemens Gruber Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-pca9685.c | 259 +++++++++++++------------------------- 1 file changed, 90 insertions(+), 169 deletions(-) diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index 00976b2c57b2..b740b824bda9 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -51,7 +51,6 @@ #define PCA9685_PRESCALE_MAX 0xFF /* => min. frequency of 24 Hz */ #define PCA9685_COUNTER_RANGE 4096 -#define PCA9685_DEFAULT_PERIOD 5000000 /* Default period_ns = 1/200 Hz */ #define PCA9685_OSC_CLOCK_MHZ 25 /* Internal oscillator with 25 MHz */ #define PCA9685_NUMREGS 0xFF @@ -71,10 +70,14 @@ #define LED_N_OFF_H(N) (PCA9685_LEDX_OFF_H + (4 * (N))) #define LED_N_OFF_L(N) (PCA9685_LEDX_OFF_L + (4 * (N))) +#define REG_ON_H(C) ((C) >= PCA9685_MAXCHAN ? PCA9685_ALL_LED_ON_H : LED_N_ON_H((C))) +#define REG_ON_L(C) ((C) >= PCA9685_MAXCHAN ? PCA9685_ALL_LED_ON_L : LED_N_ON_L((C))) +#define REG_OFF_H(C) ((C) >= PCA9685_MAXCHAN ? PCA9685_ALL_LED_OFF_H : LED_N_OFF_H((C))) +#define REG_OFF_L(C) ((C) >= PCA9685_MAXCHAN ? PCA9685_ALL_LED_OFF_L : LED_N_OFF_L((C))) + struct pca9685 { struct pwm_chip chip; struct regmap *regmap; - int period_ns; #if IS_ENABLED(CONFIG_GPIOLIB) struct mutex lock; struct gpio_chip gpio; @@ -87,6 +90,53 @@ static inline struct pca9685 *to_pca(struct pwm_chip *chip) return container_of(chip, struct pca9685, chip); } +/* Helper function to set the duty cycle ratio to duty/4096 (e.g. duty=2048 -> 50%) */ +static void pca9685_pwm_set_duty(struct pca9685 *pca, int channel, unsigned int duty) +{ + if (duty == 0) { + /* Set the full OFF bit, which has the highest precedence */ + regmap_write(pca->regmap, REG_OFF_H(channel), LED_FULL); + } else if (duty >= PCA9685_COUNTER_RANGE) { + /* Set the full ON bit and clear the full OFF bit */ + regmap_write(pca->regmap, REG_ON_H(channel), LED_FULL); + regmap_write(pca->regmap, REG_OFF_H(channel), 0); + } else { + /* Set OFF time (clears the full OFF bit) */ + regmap_write(pca->regmap, REG_OFF_L(channel), duty & 0xff); + regmap_write(pca->regmap, REG_OFF_H(channel), (duty >> 8) & 0xf); + /* Clear the full ON bit */ + regmap_write(pca->regmap, REG_ON_H(channel), 0); + } +} + +static unsigned int pca9685_pwm_get_duty(struct pca9685 *pca, int channel) +{ + unsigned int off_h = 0, val = 0; + + if (WARN_ON(channel >= PCA9685_MAXCHAN)) { + /* HW does not support reading state of "all LEDs" channel */ + return 0; + } + + regmap_read(pca->regmap, LED_N_OFF_H(channel), &off_h); + if (off_h & LED_FULL) { + /* Full OFF bit is set */ + return 0; + } + + regmap_read(pca->regmap, LED_N_ON_H(channel), &val); + if (val & LED_FULL) { + /* Full ON bit is set */ + return PCA9685_COUNTER_RANGE; + } + + if (regmap_read(pca->regmap, LED_N_OFF_L(channel), &val)) { + /* Reset val to 0 in case reading LED_N_OFF_L failed */ + val = 0; + } + return ((off_h & 0xf) << 8) | (val & 0xff); +} + #if IS_ENABLED(CONFIG_GPIOLIB) static bool pca9685_pwm_test_and_set_inuse(struct pca9685 *pca, int pwm_idx) { @@ -138,34 +188,23 @@ static int pca9685_pwm_gpio_request(struct gpio_chip *gpio, unsigned int offset) static int pca9685_pwm_gpio_get(struct gpio_chip *gpio, unsigned int offset) { struct pca9685 *pca = gpiochip_get_data(gpio); - struct pwm_device *pwm = &pca->chip.pwms[offset]; - unsigned int value; - regmap_read(pca->regmap, LED_N_ON_H(pwm->hwpwm), &value); - - return value & LED_FULL; + return pca9685_pwm_get_duty(pca, offset) != 0; } static void pca9685_pwm_gpio_set(struct gpio_chip *gpio, unsigned int offset, int value) { struct pca9685 *pca = gpiochip_get_data(gpio); - struct pwm_device *pwm = &pca->chip.pwms[offset]; - unsigned int on = value ? LED_FULL : 0; - /* Clear both OFF registers */ - regmap_write(pca->regmap, LED_N_OFF_L(pwm->hwpwm), 0); - regmap_write(pca->regmap, LED_N_OFF_H(pwm->hwpwm), 0); - - /* Set the full ON bit */ - regmap_write(pca->regmap, LED_N_ON_H(pwm->hwpwm), on); + pca9685_pwm_set_duty(pca, offset, value ? PCA9685_COUNTER_RANGE : 0); } static void pca9685_pwm_gpio_free(struct gpio_chip *gpio, unsigned int offset) { struct pca9685 *pca = gpiochip_get_data(gpio); - pca9685_pwm_gpio_set(gpio, offset, 0); + pca9685_pwm_set_duty(pca, offset, 0); pm_runtime_put(pca->chip.dev); pca9685_pwm_clear_inuse(pca, offset); } @@ -246,167 +285,52 @@ static void pca9685_set_sleep_mode(struct pca9685 *pca, bool enable) } } -static int pca9685_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) +static int pca9685_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) { struct pca9685 *pca = to_pca(chip); - unsigned long long duty; - unsigned int reg; - int prescale; + unsigned long long duty, prescale; + unsigned int val = 0; - if (period_ns != pca->period_ns) { - prescale = DIV_ROUND_CLOSEST(PCA9685_OSC_CLOCK_MHZ * period_ns, - PCA9685_COUNTER_RANGE * 1000) - 1; + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; - if (prescale >= PCA9685_PRESCALE_MIN && - prescale <= PCA9685_PRESCALE_MAX) { - /* - * Putting the chip briefly into SLEEP mode - * at this point won't interfere with the - * pm_runtime framework, because the pm_runtime - * state is guaranteed active here. - */ - /* Put chip into sleep mode */ - pca9685_set_sleep_mode(pca, true); - - /* Change the chip-wide output frequency */ - regmap_write(pca->regmap, PCA9685_PRESCALE, prescale); - - /* Wake the chip up */ - pca9685_set_sleep_mode(pca, false); - - pca->period_ns = period_ns; - } else { - dev_err(chip->dev, - "prescaler not set: period out of bounds!\n"); - return -EINVAL; - } + prescale = DIV_ROUND_CLOSEST_ULL(PCA9685_OSC_CLOCK_MHZ * state->period, + PCA9685_COUNTER_RANGE * 1000) - 1; + if (prescale < PCA9685_PRESCALE_MIN || prescale > PCA9685_PRESCALE_MAX) { + dev_err(chip->dev, "pwm not changed: period out of bounds!\n"); + return -EINVAL; } - if (duty_ns < 1) { - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_OFF_H; - else - reg = LED_N_OFF_H(pwm->hwpwm); - - regmap_write(pca->regmap, reg, LED_FULL); - + if (!state->enabled) { + pca9685_pwm_set_duty(pca, pwm->hwpwm, 0); return 0; } - if (duty_ns == period_ns) { - /* Clear both OFF registers */ - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_OFF_L; - else - reg = LED_N_OFF_L(pwm->hwpwm); + regmap_read(pca->regmap, PCA9685_PRESCALE, &val); + if (prescale != val) { + /* + * Putting the chip briefly into SLEEP mode + * at this point won't interfere with the + * pm_runtime framework, because the pm_runtime + * state is guaranteed active here. + */ + /* Put chip into sleep mode */ + pca9685_set_sleep_mode(pca, true); - regmap_write(pca->regmap, reg, 0x0); + /* Change the chip-wide output frequency */ + regmap_write(pca->regmap, PCA9685_PRESCALE, prescale); - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_OFF_H; - else - reg = LED_N_OFF_H(pwm->hwpwm); - - regmap_write(pca->regmap, reg, 0x0); - - /* Set the full ON bit */ - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_ON_H; - else - reg = LED_N_ON_H(pwm->hwpwm); - - regmap_write(pca->regmap, reg, LED_FULL); - - return 0; + /* Wake the chip up */ + pca9685_set_sleep_mode(pca, false); } - duty = PCA9685_COUNTER_RANGE * (unsigned long long)duty_ns; - duty = DIV_ROUND_UP_ULL(duty, period_ns); - - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_OFF_L; - else - reg = LED_N_OFF_L(pwm->hwpwm); - - regmap_write(pca->regmap, reg, (int)duty & 0xff); - - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_OFF_H; - else - reg = LED_N_OFF_H(pwm->hwpwm); - - regmap_write(pca->regmap, reg, ((int)duty >> 8) & 0xf); - - /* Clear the full ON bit, otherwise the set OFF time has no effect */ - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_ON_H; - else - reg = LED_N_ON_H(pwm->hwpwm); - - regmap_write(pca->regmap, reg, 0); - + duty = PCA9685_COUNTER_RANGE * state->duty_cycle; + duty = DIV_ROUND_UP_ULL(duty, state->period); + pca9685_pwm_set_duty(pca, pwm->hwpwm, duty); return 0; } -static int pca9685_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - struct pca9685 *pca = to_pca(chip); - unsigned int reg; - - /* - * The PWM subsystem does not support a pre-delay. - * So, set the ON-timeout to 0 - */ - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_ON_L; - else - reg = LED_N_ON_L(pwm->hwpwm); - - regmap_write(pca->regmap, reg, 0); - - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_ON_H; - else - reg = LED_N_ON_H(pwm->hwpwm); - - regmap_write(pca->regmap, reg, 0); - - /* - * Clear the full-off bit. - * It has precedence over the others and must be off. - */ - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_OFF_H; - else - reg = LED_N_OFF_H(pwm->hwpwm); - - regmap_update_bits(pca->regmap, reg, LED_FULL, 0x0); - - return 0; -} - -static void pca9685_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) -{ - struct pca9685 *pca = to_pca(chip); - unsigned int reg; - - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_OFF_H; - else - reg = LED_N_OFF_H(pwm->hwpwm); - - regmap_write(pca->regmap, reg, LED_FULL); - - /* Clear the LED_OFF counter. */ - if (pwm->hwpwm >= PCA9685_MAXCHAN) - reg = PCA9685_ALL_LED_OFF_L; - else - reg = LED_N_OFF_L(pwm->hwpwm); - - regmap_write(pca->regmap, reg, 0x0); -} - static int pca9685_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct pca9685 *pca = to_pca(chip); @@ -422,15 +346,13 @@ static void pca9685_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) { struct pca9685 *pca = to_pca(chip); - pca9685_pwm_disable(chip, pwm); + pca9685_pwm_set_duty(pca, pwm->hwpwm, 0); pm_runtime_put(chip->dev); pca9685_pwm_clear_inuse(pca, pwm->hwpwm); } static const struct pwm_ops pca9685_pwm_ops = { - .enable = pca9685_pwm_enable, - .disable = pca9685_pwm_disable, - .config = pca9685_pwm_config, + .apply = pca9685_pwm_apply, .request = pca9685_pwm_request, .free = pca9685_pwm_free, .owner = THIS_MODULE, @@ -461,7 +383,6 @@ static int pca9685_pwm_probe(struct i2c_client *client, ret); return ret; } - pca->period_ns = PCA9685_DEFAULT_PERIOD; i2c_set_clientdata(client, pca); @@ -484,9 +405,9 @@ static int pca9685_pwm_probe(struct i2c_client *client, reg &= ~(MODE1_ALLCALL | MODE1_SUB1 | MODE1_SUB2 | MODE1_SUB3); regmap_write(pca->regmap, PCA9685_MODE1, reg); - /* Clear all "full off" bits */ - regmap_write(pca->regmap, PCA9685_ALL_LED_OFF_L, 0); - regmap_write(pca->regmap, PCA9685_ALL_LED_OFF_H, 0); + /* Reset OFF registers to POR default */ + regmap_write(pca->regmap, PCA9685_ALL_LED_OFF_L, LED_FULL); + regmap_write(pca->regmap, PCA9685_ALL_LED_OFF_H, LED_FULL); pca->chip.ops = &pca9685_pwm_ops; /* Add an extra channel for ALL_LED */ From 8f4768a56b673cbff3f24cf7b1784852c0f572d1 Mon Sep 17 00:00:00 2001 From: Clemens Gruber Date: Thu, 15 Apr 2021 14:14:49 +0200 Subject: [PATCH 0586/1379] pwm: pca9685: Support hardware readout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement .get_state to read-out the current hardware state. The hardware readout may return slightly different values than those that were set in apply due to the limited range of possible prescale and counter register values. Also note that although the datasheet mentions 200 Hz as default frequency when using the internal 25 MHz oscillator, the calculated period from the default prescaler register setting of 30 is 5079040ns. Signed-off-by: Clemens Gruber Reviewed-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-pca9685.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index b740b824bda9..5db62e5af518 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -331,6 +331,41 @@ static int pca9685_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } +static void pca9685_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct pca9685 *pca = to_pca(chip); + unsigned long long duty; + unsigned int val = 0; + + /* Calculate (chip-wide) period from prescale value */ + regmap_read(pca->regmap, PCA9685_PRESCALE, &val); + /* + * PCA9685_OSC_CLOCK_MHZ is 25, i.e. an integer divider of 1000. + * The following calculation is therefore only a multiplication + * and we are not losing precision. + */ + state->period = (PCA9685_COUNTER_RANGE * 1000 / PCA9685_OSC_CLOCK_MHZ) * + (val + 1); + + /* The (per-channel) polarity is fixed */ + state->polarity = PWM_POLARITY_NORMAL; + + if (pwm->hwpwm >= PCA9685_MAXCHAN) { + /* + * The "all LEDs" channel does not support HW readout + * Return 0 and disabled for backwards compatibility + */ + state->duty_cycle = 0; + state->enabled = false; + return; + } + + state->enabled = true; + duty = pca9685_pwm_get_duty(pca, pwm->hwpwm); + state->duty_cycle = DIV_ROUND_DOWN_ULL(duty * state->period, PCA9685_COUNTER_RANGE); +} + static int pca9685_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm) { struct pca9685 *pca = to_pca(chip); @@ -353,6 +388,7 @@ static void pca9685_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) static const struct pwm_ops pca9685_pwm_ops = { .apply = pca9685_pwm_apply, + .get_state = pca9685_pwm_get_state, .request = pca9685_pwm_request, .free = pca9685_pwm_free, .owner = THIS_MODULE, From 9e6fd830abcae958f3a3465e511a6e5600a005f5 Mon Sep 17 00:00:00 2001 From: Clemens Gruber Date: Thu, 15 Apr 2021 14:14:50 +0200 Subject: [PATCH 0587/1379] pwm: pca9685: Improve runtime PM behavior The chip does not come out of POR in active state but in sleep state. To be sure (in case the bootloader woke it up) we force it to sleep in probe. If runtime PM is disabled, we instead wake the chip in .probe and put it to sleep in .remove. Signed-off-by: Clemens Gruber Signed-off-by: Thierry Reding --- drivers/pwm/pwm-pca9685.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index 5db62e5af518..7c9f174de64e 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -461,14 +461,20 @@ static int pca9685_pwm_probe(struct i2c_client *client, return ret; } - /* The chip comes out of power-up in the active state */ - pm_runtime_set_active(&client->dev); - /* - * Enable will put the chip into suspend, which is what we - * want as all outputs are disabled at this point - */ pm_runtime_enable(&client->dev); + if (pm_runtime_enabled(&client->dev)) { + /* + * Although the chip comes out of power-up in the sleep state, + * we force it to sleep in case it was woken up before + */ + pca9685_set_sleep_mode(pca, true); + pm_runtime_set_suspended(&client->dev); + } else { + /* Wake the chip up if runtime PM is disabled */ + pca9685_set_sleep_mode(pca, false); + } + return 0; } @@ -480,7 +486,14 @@ static int pca9685_pwm_remove(struct i2c_client *client) ret = pwmchip_remove(&pca->chip); if (ret) return ret; + + if (!pm_runtime_enabled(&client->dev)) { + /* Put chip in sleep state if runtime PM is disabled */ + pca9685_set_sleep_mode(pca, true); + } + pm_runtime_disable(&client->dev); + return 0; } From 0b638f5032849d701167764de38df80cbf825cc6 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 15 Apr 2021 16:35:53 +0800 Subject: [PATCH 0588/1379] pwm: mediatek: Remove unused function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following clang warning: drivers/pwm/pwm-mediatek.c:110:19: warning: unused function 'pwm_mediatek_readl' [-Wunused-function]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-mediatek.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 87fb37d43814..b4a31060bcd7 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -107,12 +107,6 @@ static void pwm_mediatek_clk_disable(struct pwm_chip *chip, clk_disable_unprepare(pc->clk_top); } -static inline u32 pwm_mediatek_readl(struct pwm_mediatek_chip *chip, - unsigned int num, unsigned int offset) -{ - return readl(chip->regs + pwm_mediatek_reg_offset[num] + offset); -} - static inline void pwm_mediatek_writel(struct pwm_mediatek_chip *chip, unsigned int num, unsigned int offset, u32 value) From a331099332957d30bce249182c8b66a57e439bae Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Mon, 12 Apr 2021 22:01:52 +0200 Subject: [PATCH 0589/1379] dt-bindings: pwm: Convert pwm-rockchip.txt to YAML Current dts files with 'pwm' nodes are manually verified. In order to automate this process pwm-rockchip.txt has to be converted to YAML. Signed-off-by: Johan Jonker Reviewed-by: Rob Herring Acked-by: Heiko Stuebner Signed-off-by: Thierry Reding --- .../devicetree/bindings/pwm/pwm-rockchip.txt | 27 ------ .../devicetree/bindings/pwm/pwm-rockchip.yaml | 88 +++++++++++++++++++ 2 files changed, 88 insertions(+), 27 deletions(-) delete mode 100644 Documentation/devicetree/bindings/pwm/pwm-rockchip.txt create mode 100644 Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt deleted file mode 100644 index f70956dea77b..000000000000 --- a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt +++ /dev/null @@ -1,27 +0,0 @@ -Rockchip PWM controller - -Required properties: - - compatible: should be "rockchip,-pwm" - "rockchip,rk2928-pwm": found on RK29XX,RK3066 and RK3188 SoCs - "rockchip,rk3288-pwm": found on RK3288 SOC - "rockchip,rv1108-pwm", "rockchip,rk3288-pwm": found on RV1108 SoC - "rockchip,vop-pwm": found integrated in VOP on RK3288 SoC - - reg: physical base address and length of the controller's registers - - clocks: See ../clock/clock-bindings.txt - - For older hardware (rk2928, rk3066, rk3188, rk3228, rk3288, rk3399): - - There is one clock that's used both to derive the functional clock - for the device and as the bus clock. - - For newer hardware (rk3328 and future socs): specified by name - - "pwm": This is used to derive the functional clock. - - "pclk": This is the APB bus clock. - - #pwm-cells: must be 2 (rk2928) or 3 (rk3288). See pwm.yaml in this directory - for a description of the cell format. - -Example: - - pwm0: pwm@20030000 { - compatible = "rockchip,rk2928-pwm"; - reg = <0x20030000 0x10>; - clocks = <&cru PCLK_PWM01>; - #pwm-cells = <2>; - }; diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml b/Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml new file mode 100644 index 000000000000..19b42d31d0db --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pwm/pwm-rockchip.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Rockchip PWM controller + +maintainers: + - Heiko Stuebner + +properties: + compatible: + oneOf: + - const: rockchip,rk2928-pwm + - const: rockchip,rk3288-pwm + - const: rockchip,vop-pwm + - items: + - enum: + - rockchip,rv1108-pwm + - const: rockchip,rk3288-pwm + + reg: + maxItems: 1 + + clocks: + minItems: 1 + maxItems: 2 + + clock-names: + maxItems: 2 + + "#pwm-cells": + enum: [2, 3] + description: + Must be 2 (rk2928) or 3 (rk3288 and later). + See pwm.yaml for a description of the cell format. + +required: + - compatible + - reg + - "#pwm-cells" + +if: + properties: + compatible: + contains: + enum: + - rockchip,rv1108-pwm + +then: + properties: + clocks: + items: + - description: Used to derive the functional clock for the device. + - description: Used as the APB bus clock. + + clock-names: + items: + - const: pwm + - const: pclk + + required: + - clocks + - clock-names + +else: + properties: + clocks: + maxItems: 1 + description: + Used both to derive the functional clock + for the device and as the bus clock. + + required: + - clocks + +additionalProperties: false + +examples: + - | + #include + pwm0: pwm@20030000 { + compatible = "rockchip,rk2928-pwm"; + reg = <0x20030000 0x10>; + clocks = <&cru PCLK_PWM01>; + #pwm-cells = <2>; + }; From 78e7da2c1058c9b31ad1c704814b86120d96bdc4 Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Mon, 12 Apr 2021 22:01:53 +0200 Subject: [PATCH 0590/1379] dt-bindings: pwm: rockchip: Add more compatible strings The compatible strings below are already in use in the Rockchip DTSI files, but were somehow never added to a document, so add "rockchip,rk3328-pwm" "rockchip,rk3036-pwm", "rockchip,rk2928-pwm" "rockchip,rk3368-pwm", "rockchip,rk3288-pwm" "rockchip,rk3399-pwm", "rockchip,rk3288-pwm" "rockchip,px30-pwm", "rockchip,rk3328-pwm" "rockchip,rk3308-pwm", "rockchip,rk3328-pwm" for PWM nodes to pwm-rockchip.yaml. Signed-off-by: Johan Jonker Reviewed-by: Rob Herring Acked-by: Heiko Stuebner Signed-off-by: Thierry Reding --- .../devicetree/bindings/pwm/pwm-rockchip.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml b/Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml index 19b42d31d0db..5596bee70509 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml +++ b/Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml @@ -14,11 +14,22 @@ properties: oneOf: - const: rockchip,rk2928-pwm - const: rockchip,rk3288-pwm + - const: rockchip,rk3328-pwm - const: rockchip,vop-pwm + - items: + - const: rockchip,rk3036-pwm + - const: rockchip,rk2928-pwm - items: - enum: + - rockchip,rk3368-pwm + - rockchip,rk3399-pwm - rockchip,rv1108-pwm - const: rockchip,rk3288-pwm + - items: + - enum: + - rockchip,px30-pwm + - rockchip,rk3308-pwm + - const: rockchip,rk3328-pwm reg: maxItems: 1 @@ -46,6 +57,7 @@ if: compatible: contains: enum: + - rockchip,rk3328-pwm - rockchip,rv1108-pwm then: From 201fe12e7bb324da9fb5cfc2a1b89e7b45caf54d Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Mon, 12 Apr 2021 22:01:54 +0200 Subject: [PATCH 0591/1379] ARM: dts: rockchip: Remove clock-names from PWM nodes A test with the command below gives this error: /arch/arm/boot/dts/rk3036-evb.dt.yaml: pwm@20050030: clock-names: ['pwm'] is too short Devices with only one PWM clock use it to both to derive the functional clock for the device and as the bus clock. The driver does not need "clock-names" to get a handle, so remove them all. make ARCH=arm dtbs_check DT_SCHEMA_FILES=Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml Signed-off-by: Johan Jonker Acked-by: Heiko Stuebner Signed-off-by: Thierry Reding --- arch/arm/boot/dts/rk3036.dtsi | 4 ---- arch/arm/boot/dts/rk3288.dtsi | 4 ---- 2 files changed, 8 deletions(-) diff --git a/arch/arm/boot/dts/rk3036.dtsi b/arch/arm/boot/dts/rk3036.dtsi index 47a787a12e55..e24230d50a78 100644 --- a/arch/arm/boot/dts/rk3036.dtsi +++ b/arch/arm/boot/dts/rk3036.dtsi @@ -355,7 +355,6 @@ reg = <0x20050000 0x10>; #pwm-cells = <3>; clocks = <&cru PCLK_PWM>; - clock-names = "pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm0_pin>; status = "disabled"; @@ -366,7 +365,6 @@ reg = <0x20050010 0x10>; #pwm-cells = <3>; clocks = <&cru PCLK_PWM>; - clock-names = "pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm1_pin>; status = "disabled"; @@ -377,7 +375,6 @@ reg = <0x20050020 0x10>; #pwm-cells = <3>; clocks = <&cru PCLK_PWM>; - clock-names = "pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm2_pin>; status = "disabled"; @@ -388,7 +385,6 @@ reg = <0x20050030 0x10>; #pwm-cells = <2>; clocks = <&cru PCLK_PWM>; - clock-names = "pwm"; pinctrl-names = "default"; pinctrl-0 = <&pwm3_pin>; status = "disabled"; diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index ea7416c31f9b..05557ad02b33 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -679,7 +679,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm0_pin>; clocks = <&cru PCLK_RKPWM>; - clock-names = "pwm"; status = "disabled"; }; @@ -690,7 +689,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm1_pin>; clocks = <&cru PCLK_RKPWM>; - clock-names = "pwm"; status = "disabled"; }; @@ -701,7 +699,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm2_pin>; clocks = <&cru PCLK_RKPWM>; - clock-names = "pwm"; status = "disabled"; }; @@ -712,7 +709,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm3_pin>; clocks = <&cru PCLK_RKPWM>; - clock-names = "pwm"; status = "disabled"; }; From ba0d527be46f692463a4d94f840cc2b022169de2 Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Mon, 12 Apr 2021 22:01:55 +0200 Subject: [PATCH 0592/1379] arm64: dts: rockchip: Remove clock-names from PWM nodes A test with the command below gives this error: /arch/arm64/boot/dts/rockchip/rk3368-evb-act8846.dt.yaml: pwm@ff680030: clock-names: ['pwm'] is too short Devices with only one PWM clock use it to both to derive the functional clock for the device and as the bus clock. The driver does not need "clock-names" to get a handle, so remove them all. make ARCH=arm64 dtbs_check DT_SCHEMA_FILES=Documentation/devicetree/bindings/pwm/pwm-rockchip.yaml Signed-off-by: Johan Jonker Acked-by: Heiko Stuebner Signed-off-by: Thierry Reding --- arch/arm64/boot/dts/rockchip/rk3368.dtsi | 4 ---- arch/arm64/boot/dts/rockchip/rk3399.dtsi | 4 ---- 2 files changed, 8 deletions(-) diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index 7af68ec3feae..4b2c67c28036 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -561,7 +561,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm0_pin>; clocks = <&cru PCLK_PWM1>; - clock-names = "pwm"; status = "disabled"; }; @@ -572,7 +571,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm1_pin>; clocks = <&cru PCLK_PWM1>; - clock-names = "pwm"; status = "disabled"; }; @@ -581,7 +579,6 @@ reg = <0x0 0xff680020 0x0 0x10>; #pwm-cells = <3>; clocks = <&cru PCLK_PWM1>; - clock-names = "pwm"; status = "disabled"; }; @@ -592,7 +589,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm3_pin>; clocks = <&cru PCLK_PWM1>; - clock-names = "pwm"; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index edbbf35fe19e..a79faac34e02 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -1185,7 +1185,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm0_pin>; clocks = <&pmucru PCLK_RKPWM_PMU>; - clock-names = "pwm"; status = "disabled"; }; @@ -1196,7 +1195,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm1_pin>; clocks = <&pmucru PCLK_RKPWM_PMU>; - clock-names = "pwm"; status = "disabled"; }; @@ -1207,7 +1205,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm2_pin>; clocks = <&pmucru PCLK_RKPWM_PMU>; - clock-names = "pwm"; status = "disabled"; }; @@ -1218,7 +1215,6 @@ pinctrl-names = "default"; pinctrl-0 = <&pwm3a_pin>; clocks = <&pmucru PCLK_RKPWM_PMU>; - clock-names = "pwm"; status = "disabled"; }; From b0221e706cd7da74ee0aa557690f9ccfaf45bd53 Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Mon, 19 Apr 2021 09:00:06 +0900 Subject: [PATCH 0593/1379] dt-bindings: pwm: Add bindings for Toshiba Visconti PWM Controller Add bindings for the Toshiba Visconti PWM Controller. Signed-off-by: Nobuhiro Iwamatsu Reviewed-by: Rob Herring Signed-off-by: Thierry Reding --- .../bindings/pwm/toshiba,pwm-visconti.yaml | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 Documentation/devicetree/bindings/pwm/toshiba,pwm-visconti.yaml diff --git a/Documentation/devicetree/bindings/pwm/toshiba,pwm-visconti.yaml b/Documentation/devicetree/bindings/pwm/toshiba,pwm-visconti.yaml new file mode 100644 index 000000000000..d350f5edfb67 --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/toshiba,pwm-visconti.yaml @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pwm/toshiba,pwm-visconti.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Toshiba Visconti PWM Controller + +maintainers: + - Nobuhiro Iwamatsu + +properties: + compatible: + items: + - const: toshiba,visconti-pwm + + reg: + maxItems: 1 + + '#pwm-cells': + const: 2 + +required: + - compatible + - reg + - '#pwm-cells' + +additionalProperties: false + +examples: + - | + soc { + #address-cells = <2>; + #size-cells = <2>; + + pwm: pwm@241c0000 { + compatible = "toshiba,visconti-pwm"; + reg = <0 0x241c0000 0 0x1000>; + pinctrl-names = "default"; + pinctrl-0 = <&pwm_mux>; + #pwm-cells = <2>; + }; + }; From 0bca3ec846d7a9ea5bddc3b5ab55f6968e690a84 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 20 Apr 2021 18:22:45 +0100 Subject: [PATCH 0594/1379] arm64: Show three registers per line Displaying two registers per line takes 15 lines. That improves to just 10 lines if we display three registers per line, which reduces the amount of information lost when oopses are cut off. It stays within 80 columns and matches x86-64. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kees Cook Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210420172245.3679077-1-willy@infradead.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/process.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6aa053462400..0da0fd4ed1d0 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -292,13 +292,10 @@ void __show_regs(struct pt_regs *regs) i = top_reg; while (i >= 0) { - printk("x%-2d: %016llx ", i, regs->regs[i]); - i--; + printk("x%-2d: %016llx", i, regs->regs[i]); - if (i % 2 == 0) { - pr_cont("x%-2d: %016llx ", i, regs->regs[i]); - i--; - } + while (i-- % 3) + pr_cont(" x%-2d: %016llx", i, regs->regs[i]); pr_cont("\n"); } From 1ecd5b129252249b9bc03d7645a7bda512747277 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 21 Apr 2021 17:43:16 +0100 Subject: [PATCH 0595/1379] ACPI: GTDT: Don't corrupt interrupt mappings on watchdow probe failure When failing the driver probe because of invalid firmware properties, the GTDT driver unmaps the interrupt that it mapped earlier. However, it never checks whether the mapping of the interrupt actially succeeded. Even more, should the firmware report an illegal interrupt number that overlaps with the GIC SGI range, this can result in an IPI being unmapped, and subsequent fireworks (as reported by Dann Frazier). Rework the driver to have a slightly saner behaviour and actually check whether the interrupt has been mapped before unmapping things. Reported-by: dann frazier Fixes: ca9ae5ec4ef0 ("acpi/arm64: Add SBSA Generic Watchdog support in GTDT driver") Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/YH87dtTfwYgavusz@xps13.dannf Cc: Cc: Fu Wei Reviewed-by: Sudeep Holla Tested-by: dann frazier Tested-by: Hanjun Guo Reviewed-by: Hanjun Guo Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20210421164317.1718831-2-maz@kernel.org Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/gtdt.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c index f2d0e5915dab..0a0a982f9c28 100644 --- a/drivers/acpi/arm64/gtdt.c +++ b/drivers/acpi/arm64/gtdt.c @@ -329,7 +329,7 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd, int index) { struct platform_device *pdev; - int irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags); + int irq; /* * According to SBSA specification the size of refresh and control @@ -338,7 +338,7 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd, struct resource res[] = { DEFINE_RES_MEM(wd->control_frame_address, SZ_4K), DEFINE_RES_MEM(wd->refresh_frame_address, SZ_4K), - DEFINE_RES_IRQ(irq), + {}, }; int nr_res = ARRAY_SIZE(res); @@ -348,10 +348,11 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd, if (!(wd->refresh_frame_address && wd->control_frame_address)) { pr_err(FW_BUG "failed to get the Watchdog base address.\n"); - acpi_unregister_gsi(wd->timer_interrupt); return -EINVAL; } + irq = map_gt_gsi(wd->timer_interrupt, wd->timer_flags); + res[2] = (struct resource)DEFINE_RES_IRQ(irq); if (irq <= 0) { pr_warn("failed to map the Watchdog interrupt.\n"); nr_res--; @@ -364,7 +365,8 @@ static int __init gtdt_import_sbsa_gwdt(struct acpi_gtdt_watchdog *wd, */ pdev = platform_device_register_simple("sbsa-gwdt", index, res, nr_res); if (IS_ERR(pdev)) { - acpi_unregister_gsi(wd->timer_interrupt); + if (irq > 0) + acpi_unregister_gsi(wd->timer_interrupt); return PTR_ERR(pdev); } From 2a20b08f06e70860272bc7f52b5423c1b2f06696 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 21 Apr 2021 17:43:17 +0100 Subject: [PATCH 0596/1379] ACPI: irq: Prevent unregistering of GIC SGIs When using ACPI on arm64, which implies the GIC IRQ model, no table should ever provide a GSI number in the range [0:15], as these are reserved for IPIs. However, drivers tend to call acpi_unregister_gsi() with any random GSI number provided by half baked tables, which results in an exploding kernel when its IPIs have been unconfigured. In order to catch this, check for the silly case early, warn that something is going wrong and avoid the above disaster. Signed-off-by: Marc Zyngier Reviewed-by: Sudeep Holla Tested-by: dann frazier Tested-by: Hanjun Guo Reviewed-by: Hanjun Guo Reviewed-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20210421164317.1718831-3-maz@kernel.org Signed-off-by: Catalin Marinas --- drivers/acpi/irq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c index e209081d644b..c68e694fca26 100644 --- a/drivers/acpi/irq.c +++ b/drivers/acpi/irq.c @@ -75,8 +75,12 @@ void acpi_unregister_gsi(u32 gsi) { struct irq_domain *d = irq_find_matching_fwnode(acpi_gsi_domain_id, DOMAIN_BUS_ANY); - int irq = irq_find_mapping(d, gsi); + int irq; + if (WARN_ON(acpi_irq_model == ACPI_IRQ_MODEL_GIC && gsi < 16)) + return; + + irq = irq_find_mapping(d, gsi); irq_dispose_mapping(irq); } EXPORT_SYMBOL_GPL(acpi_unregister_gsi); From 721b595744f199c185fbcefaa6e7e5cea9da1941 Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Mon, 19 Apr 2021 09:00:07 +0900 Subject: [PATCH 0597/1379] pwm: visconti: Add Toshiba Visconti SoC PWM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add driver for the PWM controller on Toshiba Visconti ARM SoC. Signed-off-by: Nobuhiro Iwamatsu Reviewed-by: Uwe Kleine-König [thierry.reding@gmail.com: fix up a couple of checkpatch warnings] Signed-off-by: Thierry Reding --- drivers/pwm/Kconfig | 9 ++ drivers/pwm/Makefile | 1 + drivers/pwm/pwm-visconti.c | 190 +++++++++++++++++++++++++++++++++++++ 3 files changed, 200 insertions(+) create mode 100644 drivers/pwm/pwm-visconti.c diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 9a4f66ae8070..8ae68d6203fb 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -601,6 +601,15 @@ config PWM_TWL_LED To compile this driver as a module, choose M here: the module will be called pwm-twl-led. +config PWM_VISCONTI + tristate "Toshiba Visconti PWM support" + depends on ARCH_VISCONTI || COMPILE_TEST + help + PWM Subsystem driver support for Toshiba Visconti SoCs. + + To compile this driver as a module, choose M here: the module + will be called pwm-visconti. + config PWM_VT8500 tristate "vt8500 PWM support" depends on ARCH_VT8500 || COMPILE_TEST diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index 6374d3b1d6f3..d43b1e17e8e1 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -56,4 +56,5 @@ obj-$(CONFIG_PWM_TIECAP) += pwm-tiecap.o obj-$(CONFIG_PWM_TIEHRPWM) += pwm-tiehrpwm.o obj-$(CONFIG_PWM_TWL) += pwm-twl.o obj-$(CONFIG_PWM_TWL_LED) += pwm-twl-led.o +obj-$(CONFIG_PWM_VISCONTI) += pwm-visconti.o obj-$(CONFIG_PWM_VT8500) += pwm-vt8500.o diff --git a/drivers/pwm/pwm-visconti.c b/drivers/pwm/pwm-visconti.c new file mode 100644 index 000000000000..46d903786366 --- /dev/null +++ b/drivers/pwm/pwm-visconti.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Toshiba Visconti pulse-width-modulation controller driver + * + * Copyright (c) 2020 - 2021 TOSHIBA CORPORATION + * Copyright (c) 2020 - 2021 Toshiba Electronic Devices & Storage Corporation + * + * Authors: Nobuhiro Iwamatsu + * + * Limitations: + * - The fixed input clock is running at 1 MHz and is divided by either 1, + * 2, 4 or 8. + * - When the settings of the PWM are modified, the new values are shadowed + * in hardware until the PIPGM_PCSR register is written and the currently + * running period is completed. This way the hardware switches atomically + * from the old setting to the new. + * - Disabling the hardware completes the currently running period and keeps + * the output at low level at all times. + */ + +#include +#include +#include +#include +#include +#include + +#define PIPGM_PCSR(ch) (0x400 + 4 * (ch)) +#define PIPGM_PDUT(ch) (0x420 + 4 * (ch)) +#define PIPGM_PWMC(ch) (0x440 + 4 * (ch)) + +#define PIPGM_PWMC_PWMACT BIT(5) +#define PIPGM_PWMC_CLK_MASK GENMASK(1, 0) +#define PIPGM_PWMC_POLARITY_MASK GENMASK(5, 5) + +struct visconti_pwm_chip { + struct pwm_chip chip; + void __iomem *base; +}; + +static inline struct visconti_pwm_chip *visconti_pwm_from_chip(struct pwm_chip *chip) +{ + return container_of(chip, struct visconti_pwm_chip, chip); +} + +static int visconti_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + struct visconti_pwm_chip *priv = visconti_pwm_from_chip(chip); + u32 period, duty_cycle, pwmc0; + + if (!state->enabled) { + writel(0, priv->base + PIPGM_PCSR(pwm->hwpwm)); + return 0; + } + + /* + * The biggest period the hardware can provide is + * (0xffff << 3) * 1000 ns + * This value fits easily in an u32, so simplify the maths by + * capping the values to 32 bit integers. + */ + if (state->period > (0xffff << 3) * 1000) + period = (0xffff << 3) * 1000; + else + period = state->period; + + if (state->duty_cycle > period) + duty_cycle = period; + else + duty_cycle = state->duty_cycle; + + /* + * The input clock runs fixed at 1 MHz, so we have only + * microsecond resolution and so can divide by + * NSEC_PER_SEC / CLKFREQ = 1000 without losing precision. + */ + period /= 1000; + duty_cycle /= 1000; + + if (!period) + return -ERANGE; + + /* + * PWMC controls a divider that divides the input clk by a + * power of two between 1 and 8. As a smaller divider yields + * higher precision, pick the smallest possible one. + */ + if (period > 0xffff) { + pwmc0 = ilog2(period >> 16); + if (WARN_ON(pwmc0 > 3)) + return -EINVAL; + } else { + pwmc0 = 0; + } + + period >>= pwmc0; + duty_cycle >>= pwmc0; + + if (state->polarity == PWM_POLARITY_INVERSED) + pwmc0 |= PIPGM_PWMC_PWMACT; + writel(pwmc0, priv->base + PIPGM_PWMC(pwm->hwpwm)); + writel(duty_cycle, priv->base + PIPGM_PDUT(pwm->hwpwm)); + writel(period, priv->base + PIPGM_PCSR(pwm->hwpwm)); + + return 0; +} + +static void visconti_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct visconti_pwm_chip *priv = visconti_pwm_from_chip(chip); + u32 period, duty, pwmc0, pwmc0_clk; + + period = readl(priv->base + PIPGM_PCSR(pwm->hwpwm)); + duty = readl(priv->base + PIPGM_PDUT(pwm->hwpwm)); + pwmc0 = readl(priv->base + PIPGM_PWMC(pwm->hwpwm)); + pwmc0_clk = pwmc0 & PIPGM_PWMC_CLK_MASK; + + state->period = (period << pwmc0_clk) * NSEC_PER_USEC; + state->duty_cycle = (duty << pwmc0_clk) * NSEC_PER_USEC; + if (pwmc0 & PIPGM_PWMC_POLARITY_MASK) + state->polarity = PWM_POLARITY_INVERSED; + else + state->polarity = PWM_POLARITY_NORMAL; + + state->enabled = true; +} + +static const struct pwm_ops visconti_pwm_ops = { + .apply = visconti_pwm_apply, + .get_state = visconti_pwm_get_state, + .owner = THIS_MODULE, +}; + +static int visconti_pwm_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct visconti_pwm_chip *priv; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + platform_set_drvdata(pdev, priv); + + priv->chip.dev = dev; + priv->chip.ops = &visconti_pwm_ops; + priv->chip.npwm = 4; + + ret = pwmchip_add(&priv->chip); + if (ret < 0) + return dev_err_probe(&pdev->dev, ret, "Cannot register visconti PWM\n"); + + return 0; +} + +static int visconti_pwm_remove(struct platform_device *pdev) +{ + struct visconti_pwm_chip *priv = platform_get_drvdata(pdev); + + pwmchip_remove(&priv->chip); + + return 0; +} + +static const struct of_device_id visconti_pwm_of_match[] = { + { .compatible = "toshiba,visconti-pwm", }, + { } +}; +MODULE_DEVICE_TABLE(of, visconti_pwm_of_match); + +static struct platform_driver visconti_pwm_driver = { + .driver = { + .name = "pwm-visconti", + .of_match_table = visconti_pwm_of_match, + }, + .probe = visconti_pwm_probe, + .remove = visconti_pwm_remove, +}; +module_platform_driver(visconti_pwm_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Nobuhiro Iwamatsu "); +MODULE_ALIAS("platform:pwm-visconti"); From 453e8b3d8e36ddcb283b3d1698864a03ea45599a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 20 Apr 2021 11:51:17 +0200 Subject: [PATCH 0598/1379] pwm: atmel: Fix duty cycle calculation in .get_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CDTY register contains the number of inactive cycles. .apply() does this correctly, however .get_state() got this wrong. Fixes: 651b510a74d4 ("pwm: atmel: Implement .get_state()") Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index d49da708337f..ebaeb50dcfde 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -319,7 +319,7 @@ static void atmel_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, cdty = atmel_pwm_ch_readl(atmel_pwm, pwm->hwpwm, atmel_pwm->data->regs.duty); - tmp = (u64)cdty * NSEC_PER_SEC; + tmp = (u64)(cprd - cdty) * NSEC_PER_SEC; tmp <<= pres; state->duty_cycle = DIV64_U64_ROUND_UP(tmp, rate); From 8035e6c66a5e98f098edf7441667de74affb4e78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 20 Apr 2021 11:51:18 +0200 Subject: [PATCH 0599/1379] pwm: atmel: Improve duty cycle calculation in .apply() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the calculation of the register value determining the duty cycle the requested period is used instead of the actually implemented period which results in suboptimal settings. The following example assumes an input clock of 133333333 Hz on one of the SoCs with 16 bit period. When the following state is to be applied: .period = 414727681 .duty_cycle = 652806 the following register values used to be calculated: PRES = 10 CPRD = 54000 CDTY = 53916 which yields an actual duty cycle of a bit more than 645120 ns. The setting PRES = 10 CPRD = 54000 CDTY = 53915 however yields a duty of 652800 ns which is between the current result and the requested value and so is a better approximation. The reason for this error is that for the calculation of CDTY the requested period was used instead of the actually implemented one. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index ebaeb50dcfde..29b5ad03f715 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -124,6 +124,7 @@ static inline void atmel_pwm_ch_writel(struct atmel_pwm_chip *chip, } static int atmel_pwm_calculate_cprd_and_pres(struct pwm_chip *chip, + unsigned long clkrate, const struct pwm_state *state, unsigned long *cprd, u32 *pres) { @@ -132,7 +133,7 @@ static int atmel_pwm_calculate_cprd_and_pres(struct pwm_chip *chip, int shift; /* Calculate the period cycles and prescale value */ - cycles *= clk_get_rate(atmel_pwm->clk); + cycles *= clkrate; do_div(cycles, NSEC_PER_SEC); /* @@ -158,12 +159,14 @@ static int atmel_pwm_calculate_cprd_and_pres(struct pwm_chip *chip, } static void atmel_pwm_calculate_cdty(const struct pwm_state *state, - unsigned long cprd, unsigned long *cdty) + unsigned long clkrate, unsigned long cprd, + u32 pres, unsigned long *cdty) { unsigned long long cycles = state->duty_cycle; - cycles *= cprd; - do_div(cycles, state->period); + cycles *= clkrate; + do_div(cycles, NSEC_PER_SEC); + cycles >>= pres; *cdty = cprd - cycles; } @@ -244,17 +247,23 @@ static int atmel_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, pwm_get_state(pwm, &cstate); if (state->enabled) { + unsigned long clkrate = clk_get_rate(atmel_pwm->clk); + if (cstate.enabled && cstate.polarity == state->polarity && cstate.period == state->period) { + u32 cmr = atmel_pwm_ch_readl(atmel_pwm, pwm->hwpwm, PWM_CMR); + cprd = atmel_pwm_ch_readl(atmel_pwm, pwm->hwpwm, atmel_pwm->data->regs.period); - atmel_pwm_calculate_cdty(state, cprd, &cdty); + pres = cmr & PWM_CMR_CPRE_MSK; + + atmel_pwm_calculate_cdty(state, clkrate, cprd, pres, &cdty); atmel_pwm_update_cdty(chip, pwm, cdty); return 0; } - ret = atmel_pwm_calculate_cprd_and_pres(chip, state, &cprd, + ret = atmel_pwm_calculate_cprd_and_pres(chip, clkrate, state, &cprd, &pres); if (ret) { dev_err(chip->dev, @@ -262,7 +271,7 @@ static int atmel_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, return ret; } - atmel_pwm_calculate_cdty(state, cprd, &cdty); + atmel_pwm_calculate_cdty(state, clkrate, cprd, pres, &cdty); if (cstate.enabled) { atmel_pwm_disable(chip, pwm, false); From 93a40a6d7428921897bb7fed5ffb4ce83df05432 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:22 -0700 Subject: [PATCH 0600/1379] dmaengine: idxd: add percpu_ref to descriptor submission path Current submission path has no way to restrict the submitter from stop submiting on shutdown path or wq disable path. This provides a way to quiesce the submission path. Modeling after 'struct reqeust_queue' usage of percpu_ref. One of the abilities of per_cpu reference counting is the ability to stop new references from being taken while awaiting outstanding references to be dropped. On wq shutdown, we want to block any new submissions to the kernel workqueue and quiesce before disabling. The percpu_ref allows us to block any new submissions and wait for any current submission calls to finish submitting to the workqueue. A percpu_ref is embedded in each idxd_wq context to allow control for individual wq. The wq->wq_active counter is elevated before calling movdir64b() or enqcmds() to submit a descriptor to the wq and dropped once the submission call completes. The function is gated by percpu_ref_tryget_live(). On shutdown with percpu_ref_kill() called, any new submission would be blocked from acquiring a ref and failed. Once all references are dropped for the wq, shutdown can continue. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894438293.3202472.14894701611500822232.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 26 +++++ drivers/dma/idxd/idxd.h | 4 + drivers/dma/idxd/init.c | 1 + drivers/dma/idxd/submit.c | 5 + drivers/dma/idxd/sysfs.c | 233 ++++++++++++++++++++------------------ 5 files changed, 161 insertions(+), 108 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 016df87cf5c5..6d674fdedb4d 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -384,6 +384,32 @@ void idxd_wq_disable_cleanup(struct idxd_wq *wq) memset(wq->name, 0, WQ_NAME_SIZE); } +static void idxd_wq_ref_release(struct percpu_ref *ref) +{ + struct idxd_wq *wq = container_of(ref, struct idxd_wq, wq_active); + + complete(&wq->wq_dead); +} + +int idxd_wq_init_percpu_ref(struct idxd_wq *wq) +{ + int rc; + + memset(&wq->wq_active, 0, sizeof(wq->wq_active)); + rc = percpu_ref_init(&wq->wq_active, idxd_wq_ref_release, 0, GFP_KERNEL); + if (rc < 0) + return rc; + reinit_completion(&wq->wq_dead); + return 0; +} + +void idxd_wq_quiesce(struct idxd_wq *wq) +{ + percpu_ref_kill(&wq->wq_active); + wait_for_completion(&wq->wq_dead); + percpu_ref_exit(&wq->wq_active); +} + /* Device control bits */ static inline bool idxd_is_enabled(struct idxd_device *idxd) { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 8055e872953c..1b539cbf3c14 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -108,6 +108,8 @@ struct idxd_dma_chan { struct idxd_wq { void __iomem *portal; + struct percpu_ref wq_active; + struct completion wq_dead; struct device conf_dev; struct idxd_cdev *idxd_cdev; struct wait_queue_head err_queue; @@ -395,6 +397,8 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq); void idxd_wq_disable_cleanup(struct idxd_wq *wq); int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid); int idxd_wq_disable_pasid(struct idxd_wq *wq); +void idxd_wq_quiesce(struct idxd_wq *wq); +int idxd_wq_init_percpu_ref(struct idxd_wq *wq); /* submission */ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index e8f64324bb3a..eda5ffd307c6 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -178,6 +178,7 @@ static int idxd_setup_wqs(struct idxd_device *idxd) mutex_init(&wq->wq_lock); init_waitqueue_head(&wq->err_queue); + init_completion(&wq->wq_dead); wq->max_xfer_bytes = idxd->max_xfer_bytes; wq->max_batch_size = idxd->max_batch_size; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index dfc8900d5de3..02f9f51e29a6 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -86,6 +86,9 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) if (idxd->state != IDXD_DEV_ENABLED) return -EIO; + if (!percpu_ref_tryget_live(&wq->wq_active)) + return -ENXIO; + portal = wq->portal; /* @@ -108,6 +111,8 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) return rc; } + percpu_ref_put(&wq->wq_active); + /* * Pending the descriptor to the lockless list for the irq_entry * that we designated the descriptor to. diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 581ce56ae4f5..dad5c0be9ae8 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -47,6 +47,127 @@ static int idxd_config_bus_match(struct device *dev, return matched; } +static int enable_wq(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + unsigned long flags; + int rc; + + mutex_lock(&wq->wq_lock); + + if (idxd->state != IDXD_DEV_ENABLED) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "Enabling while device not enabled.\n"); + return -EPERM; + } + + if (wq->state != IDXD_WQ_DISABLED) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ %d already enabled.\n", wq->id); + return -EBUSY; + } + + if (!wq->group) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ not attached to group.\n"); + return -EINVAL; + } + + if (strlen(wq->name) == 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ name not set.\n"); + return -EINVAL; + } + + /* Shared WQ checks */ + if (wq_shared(wq)) { + if (!device_swq_supported(idxd)) { + dev_warn(dev, "PASID not enabled and shared WQ.\n"); + mutex_unlock(&wq->wq_lock); + return -ENXIO; + } + /* + * Shared wq with the threshold set to 0 means the user + * did not set the threshold or transitioned from a + * dedicated wq but did not set threshold. A value + * of 0 would effectively disable the shared wq. The + * driver does not allow a value of 0 to be set for + * threshold via sysfs. + */ + if (wq->threshold == 0) { + dev_warn(dev, "Shared WQ and threshold 0.\n"); + mutex_unlock(&wq->wq_lock); + return -EINVAL; + } + } + + rc = idxd_wq_alloc_resources(wq); + if (rc < 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ resource alloc failed\n"); + return rc; + } + + spin_lock_irqsave(&idxd->dev_lock, flags); + rc = idxd_device_config(idxd); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + if (rc < 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "Writing WQ %d config failed: %d\n", wq->id, rc); + return rc; + } + + rc = idxd_wq_enable(wq); + if (rc < 0) { + mutex_unlock(&wq->wq_lock); + dev_warn(dev, "WQ %d enabling failed: %d\n", wq->id, rc); + return rc; + } + + rc = idxd_wq_map_portal(wq); + if (rc < 0) { + dev_warn(dev, "wq portal mapping failed: %d\n", rc); + rc = idxd_wq_disable(wq); + if (rc < 0) + dev_warn(dev, "IDXD wq disable failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } + + wq->client_count = 0; + + if (wq->type == IDXD_WQT_KERNEL) { + rc = idxd_wq_init_percpu_ref(wq); + if (rc < 0) { + dev_dbg(dev, "percpu_ref setup failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } + } + + if (is_idxd_wq_dmaengine(wq)) { + rc = idxd_register_dma_channel(wq); + if (rc < 0) { + dev_dbg(dev, "DMA channel register failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } + } else if (is_idxd_wq_cdev(wq)) { + rc = idxd_wq_add_cdev(wq); + if (rc < 0) { + dev_dbg(dev, "Cdev creation failed\n"); + mutex_unlock(&wq->wq_lock); + return rc; + } + } + + mutex_unlock(&wq->wq_lock); + dev_info(dev, "wq %s enabled\n", dev_name(&wq->conf_dev)); + + return 0; +} + static int idxd_config_bus_probe(struct device *dev) { int rc; @@ -94,115 +215,8 @@ static int idxd_config_bus_probe(struct device *dev) return 0; } else if (is_idxd_wq_dev(dev)) { struct idxd_wq *wq = confdev_to_wq(dev); - struct idxd_device *idxd = wq->idxd; - mutex_lock(&wq->wq_lock); - - if (idxd->state != IDXD_DEV_ENABLED) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "Enabling while device not enabled.\n"); - return -EPERM; - } - - if (wq->state != IDXD_WQ_DISABLED) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ %d already enabled.\n", wq->id); - return -EBUSY; - } - - if (!wq->group) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ not attached to group.\n"); - return -EINVAL; - } - - if (strlen(wq->name) == 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ name not set.\n"); - return -EINVAL; - } - - /* Shared WQ checks */ - if (wq_shared(wq)) { - if (!device_swq_supported(idxd)) { - dev_warn(dev, - "PASID not enabled and shared WQ.\n"); - mutex_unlock(&wq->wq_lock); - return -ENXIO; - } - /* - * Shared wq with the threshold set to 0 means the user - * did not set the threshold or transitioned from a - * dedicated wq but did not set threshold. A value - * of 0 would effectively disable the shared wq. The - * driver does not allow a value of 0 to be set for - * threshold via sysfs. - */ - if (wq->threshold == 0) { - dev_warn(dev, - "Shared WQ and threshold 0.\n"); - mutex_unlock(&wq->wq_lock); - return -EINVAL; - } - } - - rc = idxd_wq_alloc_resources(wq); - if (rc < 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ resource alloc failed\n"); - return rc; - } - - spin_lock_irqsave(&idxd->dev_lock, flags); - rc = idxd_device_config(idxd); - spin_unlock_irqrestore(&idxd->dev_lock, flags); - if (rc < 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "Writing WQ %d config failed: %d\n", - wq->id, rc); - return rc; - } - - rc = idxd_wq_enable(wq); - if (rc < 0) { - mutex_unlock(&wq->wq_lock); - dev_warn(dev, "WQ %d enabling failed: %d\n", - wq->id, rc); - return rc; - } - - rc = idxd_wq_map_portal(wq); - if (rc < 0) { - dev_warn(dev, "wq portal mapping failed: %d\n", rc); - rc = idxd_wq_disable(wq); - if (rc < 0) - dev_warn(dev, "IDXD wq disable failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - - wq->client_count = 0; - - dev_info(dev, "wq %s enabled\n", dev_name(&wq->conf_dev)); - - if (is_idxd_wq_dmaengine(wq)) { - rc = idxd_register_dma_channel(wq); - if (rc < 0) { - dev_dbg(dev, "DMA channel register failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - } else if (is_idxd_wq_cdev(wq)) { - rc = idxd_wq_add_cdev(wq); - if (rc < 0) { - dev_dbg(dev, "Cdev creation failed\n"); - mutex_unlock(&wq->wq_lock); - return rc; - } - } - - mutex_unlock(&wq->wq_lock); - return 0; + return enable_wq(wq); } return -ENODEV; @@ -220,6 +234,9 @@ static void disable_wq(struct idxd_wq *wq) return; } + if (wq->type == IDXD_WQT_KERNEL) + idxd_wq_quiesce(wq); + if (is_idxd_wq_dmaengine(wq)) idxd_unregister_dma_channel(wq); else if (is_idxd_wq_cdev(wq)) From 8c66bbdc4fbf3c297ebc8edf71f359e4a132c9db Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:28 -0700 Subject: [PATCH 0601/1379] dmaengine: idxd: add support for readonly config mode The read-only configuration mode is defined by the DSA spec as a mode of the device WQ configuration. When GENCAP register bit 31 is set to 0, the device is in RO mode and group configuration and some fields of the workqueue configuration registers are read-only and reflect the fixed configuration of the device. Add support for RO mode. The driver will load the values from the registers directly setup all the internally cached data structures based on the device configuration. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894438847.3202472.6317563824045432727.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 116 ++++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 8 +++ drivers/dma/idxd/sysfs.c | 20 ++++--- 4 files changed, 137 insertions(+), 8 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 6d674fdedb4d..3ddb1c731080 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -884,3 +884,119 @@ int idxd_device_config(struct idxd_device *idxd) return 0; } + +static int idxd_wq_load_config(struct idxd_wq *wq) +{ + struct idxd_device *idxd = wq->idxd; + struct device *dev = &idxd->pdev->dev; + int wqcfg_offset; + int i; + + wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, 0); + memcpy_fromio(wq->wqcfg, idxd->reg_base + wqcfg_offset, idxd->wqcfg_size); + + wq->size = wq->wqcfg->wq_size; + wq->threshold = wq->wqcfg->wq_thresh; + if (wq->wqcfg->priv) + wq->type = IDXD_WQT_KERNEL; + + /* The driver does not support shared WQ mode in read-only config yet */ + if (wq->wqcfg->mode == 0 || wq->wqcfg->pasid_en) + return -EOPNOTSUPP; + + set_bit(WQ_FLAG_DEDICATED, &wq->flags); + + wq->priority = wq->wqcfg->priority; + + for (i = 0; i < WQCFG_STRIDES(idxd); i++) { + wqcfg_offset = WQCFG_OFFSET(idxd, wq->id, i); + dev_dbg(dev, "WQ[%d][%d][%#x]: %#x\n", wq->id, i, wqcfg_offset, wq->wqcfg->bits[i]); + } + + return 0; +} + +static void idxd_group_load_config(struct idxd_group *group) +{ + struct idxd_device *idxd = group->idxd; + struct device *dev = &idxd->pdev->dev; + int i, j, grpcfg_offset; + + /* + * Load WQS bit fields + * Iterate through all 256 bits 64 bits at a time + */ + for (i = 0; i < GRPWQCFG_STRIDES; i++) { + struct idxd_wq *wq; + + grpcfg_offset = GRPWQCFG_OFFSET(idxd, group->id, i); + group->grpcfg.wqs[i] = ioread64(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG wq[%d:%d: %#x]: %#llx\n", + group->id, i, grpcfg_offset, group->grpcfg.wqs[i]); + + if (i * 64 >= idxd->max_wqs) + break; + + /* Iterate through all 64 bits and check for wq set */ + for (j = 0; j < 64; j++) { + int id = i * 64 + j; + + /* No need to check beyond max wqs */ + if (id >= idxd->max_wqs) + break; + + /* Set group assignment for wq if wq bit is set */ + if (group->grpcfg.wqs[i] & BIT(j)) { + wq = idxd->wqs[id]; + wq->group = group; + } + } + } + + grpcfg_offset = GRPENGCFG_OFFSET(idxd, group->id); + group->grpcfg.engines = ioread64(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPCFG engs[%d: %#x]: %#llx\n", group->id, + grpcfg_offset, group->grpcfg.engines); + + /* Iterate through all 64 bits to check engines set */ + for (i = 0; i < 64; i++) { + if (i >= idxd->max_engines) + break; + + if (group->grpcfg.engines & BIT(i)) { + struct idxd_engine *engine = idxd->engines[i]; + + engine->group = group; + } + } + + grpcfg_offset = GRPFLGCFG_OFFSET(idxd, group->id); + group->grpcfg.flags.bits = ioread32(idxd->reg_base + grpcfg_offset); + dev_dbg(dev, "GRPFLAGS flags[%d: %#x]: %#x\n", + group->id, grpcfg_offset, group->grpcfg.flags.bits); +} + +int idxd_device_load_config(struct idxd_device *idxd) +{ + union gencfg_reg reg; + int i, rc; + + reg.bits = ioread32(idxd->reg_base + IDXD_GENCFG_OFFSET); + idxd->token_limit = reg.token_limit; + + for (i = 0; i < idxd->max_groups; i++) { + struct idxd_group *group = idxd->groups[i]; + + idxd_group_load_config(group); + } + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + rc = idxd_wq_load_config(wq); + if (rc < 0) + return rc; + } + + return 0; +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 1b539cbf3c14..940a2e1ddf12 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -384,6 +384,7 @@ void idxd_device_cleanup(struct idxd_device *idxd); int idxd_device_config(struct idxd_device *idxd); void idxd_device_wqs_clear_state(struct idxd_device *idxd); void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid); +int idxd_device_load_config(struct idxd_device *idxd); /* work queue control */ int idxd_wq_alloc_resources(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index eda5ffd307c6..a07e6d8eec00 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -482,6 +482,14 @@ static int idxd_probe(struct idxd_device *idxd) if (rc) goto err; + /* If the configs are readonly, then load them from device */ + if (!test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) { + dev_dbg(dev, "Loading RO device config\n"); + rc = idxd_device_load_config(idxd); + if (rc < 0) + goto err; + } + rc = idxd_setup_interrupts(idxd); if (rc) goto err; diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index dad5c0be9ae8..d45cb61f300b 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -110,7 +110,8 @@ static int enable_wq(struct idxd_wq *wq) } spin_lock_irqsave(&idxd->dev_lock, flags); - rc = idxd_device_config(idxd); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); spin_unlock_irqrestore(&idxd->dev_lock, flags); if (rc < 0) { mutex_unlock(&wq->wq_lock); @@ -170,7 +171,7 @@ static int enable_wq(struct idxd_wq *wq) static int idxd_config_bus_probe(struct device *dev) { - int rc; + int rc = 0; unsigned long flags; dev_dbg(dev, "%s called\n", __func__); @@ -188,7 +189,8 @@ static int idxd_config_bus_probe(struct device *dev) /* Perform IDXD configuration and enabling */ spin_lock_irqsave(&idxd->dev_lock, flags); - rc = idxd_device_config(idxd); + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) + rc = idxd_device_config(idxd); spin_unlock_irqrestore(&idxd->dev_lock, flags); if (rc < 0) { module_put(THIS_MODULE); @@ -287,12 +289,14 @@ static int idxd_config_bus_remove(struct device *dev) idxd_unregister_dma_device(idxd); rc = idxd_device_disable(idxd); - for (i = 0; i < idxd->max_wqs; i++) { - struct idxd_wq *wq = idxd->wqs[i]; + if (test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)) { + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; - mutex_lock(&wq->wq_lock); - idxd_wq_disable_cleanup(wq); - mutex_unlock(&wq->wq_lock); + mutex_lock(&wq->wq_lock); + idxd_wq_disable_cleanup(wq); + mutex_unlock(&wq->wq_lock); + } } module_put(THIS_MODULE); if (rc < 0) From eb15e7154fbfa3e61c777704b2ff28eb3a0d4796 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:34 -0700 Subject: [PATCH 0602/1379] dmaengine: idxd: add interrupt handle request and release support DSA spec states that when Request Interrupt Handle and Release Interrupt Handle command bits are set in the CMDCAP register, these device commands must be supported by the driver. The interrupt handle is programmed in a descriptor. When Request Interrupt Handle is not supported, the interrupt handle is the index of the desired entry in the MSI-X table. When the command is supported, driver must use the command to obtain a handle to be programmed in the submitted descriptor. A requested handle may be revoked. After the handle is revoked, any use of the handle will result in Invalid Interrupt Handle error. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894439422.3202472.17579543737810265471.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 71 ++++++++++++++++++++++++++++++++++++ drivers/dma/idxd/idxd.h | 13 +++++++ drivers/dma/idxd/init.c | 56 +++++++++++++++++++++++++++- drivers/dma/idxd/registers.h | 9 ++++- drivers/dma/idxd/submit.c | 35 ++++++++++++++---- drivers/dma/idxd/sysfs.c | 1 + 6 files changed, 176 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3ddb1c731080..54d5afec81cf 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -598,6 +598,77 @@ void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid) dev_dbg(dev, "pasid %d drained\n", pasid); } +int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle, + enum idxd_interrupt_type irq_type) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand, status; + + if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE))) + return -EOPNOTSUPP; + + dev_dbg(dev, "get int handle, idx %d\n", idx); + + operand = idx & GENMASK(15, 0); + if (irq_type == IDXD_IRQ_IMS) + operand |= CMD_INT_HANDLE_IMS; + + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_REQUEST_INT_HANDLE, operand); + + idxd_cmd_exec(idxd, IDXD_CMD_REQUEST_INT_HANDLE, operand, &status); + + if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "request int handle failed: %#x\n", status); + return -ENXIO; + } + + *handle = (status >> IDXD_CMDSTS_RES_SHIFT) & GENMASK(15, 0); + + dev_dbg(dev, "int handle acquired: %u\n", *handle); + return 0; +} + +int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, + enum idxd_interrupt_type irq_type) +{ + struct device *dev = &idxd->pdev->dev; + u32 operand, status; + union idxd_command_reg cmd; + unsigned long flags; + + if (!(idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE))) + return -EOPNOTSUPP; + + dev_dbg(dev, "release int handle, handle %d\n", handle); + + memset(&cmd, 0, sizeof(cmd)); + operand = handle & GENMASK(15, 0); + + if (irq_type == IDXD_IRQ_IMS) + operand |= CMD_INT_HANDLE_IMS; + + cmd.cmd = IDXD_CMD_RELEASE_INT_HANDLE; + cmd.operand = operand; + + dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_RELEASE_INT_HANDLE, operand); + + spin_lock_irqsave(&idxd->dev_lock, flags); + iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); + + while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) + cpu_relax(); + status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); + spin_unlock_irqrestore(&idxd->dev_lock, flags); + + if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { + dev_dbg(dev, "release int handle failed: %#x\n", status); + return -ENXIO; + } + + dev_dbg(dev, "int handle released.\n"); + return 0; +} + /* Device configuration bits */ void idxd_msix_perm_setup(struct idxd_device *idxd) { diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 940a2e1ddf12..c1d4a1976206 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -160,6 +160,7 @@ struct idxd_hw { union group_cap_reg group_cap; union engine_cap_reg engine_cap; struct opcap opcap; + u32 cmd_cap; }; enum idxd_device_state { @@ -237,6 +238,8 @@ struct idxd_device { struct idxd_dma_dev *idxd_dma; struct workqueue_struct *wq; struct work_struct work; + + int *int_handles; }; /* IDXD software descriptor */ @@ -256,6 +259,7 @@ struct idxd_desc { struct list_head list; int id; int cpu; + unsigned int vector; struct idxd_wq *wq; }; @@ -330,6 +334,11 @@ enum idxd_portal_prot { IDXD_PORTAL_LIMITED, }; +enum idxd_interrupt_type { + IDXD_IRQ_MSIX = 0, + IDXD_IRQ_IMS, +}; + static inline int idxd_get_wq_portal_offset(enum idxd_portal_prot prot) { return prot * 0x1000; @@ -385,6 +394,10 @@ int idxd_device_config(struct idxd_device *idxd); void idxd_device_wqs_clear_state(struct idxd_device *idxd); void idxd_device_drain_pasid(struct idxd_device *idxd, int pasid); int idxd_device_load_config(struct idxd_device *idxd); +int idxd_device_request_int_handle(struct idxd_device *idxd, int idx, int *handle, + enum idxd_interrupt_type irq_type); +int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, + enum idxd_interrupt_type irq_type); /* work queue control */ int idxd_wq_alloc_resources(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index a07e6d8eec00..ef58750c24cc 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -125,7 +125,25 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) dev_err(dev, "Failed to allocate irq %d.\n", irq_entry->vector); goto err_wq_irqs; } + dev_dbg(dev, "Allocated idxd-msix %d for vector %d\n", i, irq_entry->vector); + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) { + /* + * The MSIX vector enumeration starts at 1 with vector 0 being the + * misc interrupt that handles non I/O completion events. The + * interrupt handles are for IMS enumeration on guest. The misc + * interrupt vector does not require a handle and therefore we start + * the int_handles at index 0. Since 'i' starts at 1, the first + * int_handles index will be 0. + */ + rc = idxd_device_request_int_handle(idxd, i, &idxd->int_handles[i - 1], + IDXD_IRQ_MSIX); + if (rc < 0) { + free_irq(irq_entry->vector, irq_entry); + goto err_wq_irqs; + } + dev_dbg(dev, "int handle requested: %u\n", idxd->int_handles[i - 1]); + } } idxd_unmask_error_interrupts(idxd); @@ -136,6 +154,9 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) while (--i >= 0) { irq_entry = &idxd->irq_entries[i]; free_irq(irq_entry->vector, irq_entry); + if (i != 0) + idxd_device_release_int_handle(idxd, + idxd->int_handles[i], IDXD_IRQ_MSIX); } err_misc_irq: /* Disable error interrupt generation */ @@ -288,9 +309,15 @@ static int idxd_setup_internals(struct idxd_device *idxd) init_waitqueue_head(&idxd->cmd_waitq); + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_REQUEST_INT_HANDLE)) { + idxd->int_handles = devm_kcalloc(dev, idxd->max_wqs, sizeof(int), GFP_KERNEL); + if (!idxd->int_handles) + return -ENOMEM; + } + rc = idxd_setup_wqs(idxd); if (rc < 0) - return rc; + goto err_wqs; rc = idxd_setup_engines(idxd); if (rc < 0) @@ -317,6 +344,8 @@ static int idxd_setup_internals(struct idxd_device *idxd) err_engine: for (i = 0; i < idxd->max_wqs; i++) put_device(&idxd->wqs[i]->conf_dev); + err_wqs: + kfree(idxd->int_handles); return rc; } @@ -345,6 +374,12 @@ static void idxd_read_caps(struct idxd_device *idxd) /* reading generic capabilities */ idxd->hw.gen_cap.bits = ioread64(idxd->reg_base + IDXD_GENCAP_OFFSET); dev_dbg(dev, "gen_cap: %#llx\n", idxd->hw.gen_cap.bits); + + if (idxd->hw.gen_cap.cmd_cap) { + idxd->hw.cmd_cap = ioread32(idxd->reg_base + IDXD_CMDCAP_OFFSET); + dev_dbg(dev, "cmd_cap: %#x\n", idxd->hw.cmd_cap); + } + idxd->max_xfer_bytes = 1ULL << idxd->hw.gen_cap.max_xfer_shift; dev_dbg(dev, "max xfer size: %llu bytes\n", idxd->max_xfer_bytes); idxd->max_batch_size = 1U << idxd->hw.gen_cap.max_batch_shift; @@ -604,6 +639,24 @@ static void idxd_flush_work_list(struct idxd_irq_entry *ie) } } +static void idxd_release_int_handles(struct idxd_device *idxd) +{ + struct device *dev = &idxd->pdev->dev; + int i, rc; + + for (i = 0; i < idxd->num_wq_irqs; i++) { + if (idxd->hw.cmd_cap & BIT(IDXD_CMD_RELEASE_INT_HANDLE)) { + rc = idxd_device_release_int_handle(idxd, idxd->int_handles[i], + IDXD_IRQ_MSIX); + if (rc < 0) + dev_warn(dev, "irq handle %d release failed\n", + idxd->int_handles[i]); + else + dev_dbg(dev, "int handle requested: %u\n", idxd->int_handles[i]); + } + } +} + static void idxd_shutdown(struct pci_dev *pdev) { struct idxd_device *idxd = pci_get_drvdata(pdev); @@ -630,6 +683,7 @@ static void idxd_shutdown(struct pci_dev *pdev) } idxd_msix_perm_clear(idxd); + idxd_release_int_handles(idxd); pci_free_irq_vectors(pdev); pci_iounmap(pdev, idxd->reg_base); pci_disable_device(pdev); diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 751ecb4f9f81..5cbf368c7367 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -24,8 +24,8 @@ union gen_cap_reg { u64 overlap_copy:1; u64 cache_control_mem:1; u64 cache_control_cache:1; + u64 cmd_cap:1; u64 rsvd:3; - u64 int_handle_req:1; u64 dest_readback:1; u64 drain_readback:1; u64 rsvd2:6; @@ -180,8 +180,11 @@ enum idxd_cmd { IDXD_CMD_DRAIN_PASID, IDXD_CMD_ABORT_PASID, IDXD_CMD_REQUEST_INT_HANDLE, + IDXD_CMD_RELEASE_INT_HANDLE, }; +#define CMD_INT_HANDLE_IMS 0x10000 + #define IDXD_CMDSTS_OFFSET 0xa8 union cmdsts_reg { struct { @@ -193,6 +196,8 @@ union cmdsts_reg { u32 bits; } __packed; #define IDXD_CMDSTS_ACTIVE 0x80000000 +#define IDXD_CMDSTS_ERR_MASK 0xff +#define IDXD_CMDSTS_RES_SHIFT 8 enum idxd_cmdsts_err { IDXD_CMDSTS_SUCCESS = 0, @@ -228,6 +233,8 @@ enum idxd_cmdsts_err { IDXD_CMDSTS_ERR_NO_HANDLE, }; +#define IDXD_CMDCAP_OFFSET 0xb0 + #define IDXD_SWERR_OFFSET 0xc0 #define IDXD_SWERR_VALID 0x00000001 #define IDXD_SWERR_OVERFLOW 0x00000002 diff --git a/drivers/dma/idxd/submit.c b/drivers/dma/idxd/submit.c index 02f9f51e29a6..19afb62abaff 100644 --- a/drivers/dma/idxd/submit.c +++ b/drivers/dma/idxd/submit.c @@ -22,11 +22,23 @@ static struct idxd_desc *__get_desc(struct idxd_wq *wq, int idx, int cpu) desc->hw->pasid = idxd->pasid; /* - * Descriptor completion vectors are 1-8 for MSIX. We will round - * robin through the 8 vectors. + * Descriptor completion vectors are 1...N for MSIX. We will round + * robin through the N vectors. */ wq->vec_ptr = (wq->vec_ptr % idxd->num_wq_irqs) + 1; - desc->hw->int_handle = wq->vec_ptr; + if (!idxd->int_handles) { + desc->hw->int_handle = wq->vec_ptr; + } else { + desc->vector = wq->vec_ptr; + /* + * int_handles are only for descriptor completion. However for device + * MSIX enumeration, vec 0 is used for misc interrupts. Therefore even + * though we are rotating through 1...N for descriptor interrupts, we + * need to acqurie the int_handles from 0..N-1. + */ + desc->hw->int_handle = idxd->int_handles[desc->vector - 1]; + } + return desc; } @@ -79,7 +91,6 @@ void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc) int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) { struct idxd_device *idxd = wq->idxd; - int vec = desc->hw->int_handle; void __iomem *portal; int rc; @@ -117,9 +128,19 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) * Pending the descriptor to the lockless list for the irq_entry * that we designated the descriptor to. */ - if (desc->hw->flags & IDXD_OP_FLAG_RCI) - llist_add(&desc->llnode, - &idxd->irq_entries[vec].pending_llist); + if (desc->hw->flags & IDXD_OP_FLAG_RCI) { + int vec; + + /* + * If the driver is on host kernel, it would be the value + * assigned to interrupt handle, which is index for MSIX + * vector. If it's guest then can't use the int_handle since + * that is the index to IMS for the entire device. The guest + * device local index will be used. + */ + vec = !idxd->int_handles ? desc->hw->int_handle : desc->vector; + llist_add(&desc->llnode, &idxd->irq_entries[vec].pending_llist); + } return 0; } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index d45cb61f300b..3f4ea4d0fae7 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1600,6 +1600,7 @@ static void idxd_conf_device_release(struct device *dev) kfree(idxd->wqs); kfree(idxd->engines); kfree(idxd->irq_entries); + kfree(idxd->int_handles); ida_free(&idxd_ida, idxd->id); kfree(idxd); } From 8241571fac9eeb7f3424ad343369eaa411919da3 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:40 -0700 Subject: [PATCH 0603/1379] dmaengine: idxd: convert sprintf() to sysfs_emit() for all usages Convert sprintf() to sysfs_emit() in order to check buffer overrun on sysfs outputs. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894440044.3202472.13926639619695319753.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/sysfs.c | 116 +++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 61 deletions(-) diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index 3f4ea4d0fae7..0460d58e3941 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -350,9 +350,9 @@ static ssize_t engine_group_id_show(struct device *dev, container_of(dev, struct idxd_engine, conf_dev); if (engine->group) - return sprintf(buf, "%d\n", engine->group->id); + return sysfs_emit(buf, "%d\n", engine->group->id); else - return sprintf(buf, "%d\n", -1); + return sysfs_emit(buf, "%d\n", -1); } static ssize_t engine_group_id_store(struct device *dev, @@ -447,7 +447,7 @@ static ssize_t group_tokens_reserved_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%u\n", group->tokens_reserved); + return sysfs_emit(buf, "%u\n", group->tokens_reserved); } static ssize_t group_tokens_reserved_store(struct device *dev, @@ -495,7 +495,7 @@ static ssize_t group_tokens_allowed_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%u\n", group->tokens_allowed); + return sysfs_emit(buf, "%u\n", group->tokens_allowed); } static ssize_t group_tokens_allowed_store(struct device *dev, @@ -540,7 +540,7 @@ static ssize_t group_use_token_limit_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%u\n", group->use_token_limit); + return sysfs_emit(buf, "%u\n", group->use_token_limit); } static ssize_t group_use_token_limit_store(struct device *dev, @@ -583,7 +583,6 @@ static ssize_t group_engines_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); int i, rc = 0; - char *tmp = buf; struct idxd_device *idxd = group->idxd; for (i = 0; i < idxd->max_engines; i++) { @@ -593,12 +592,13 @@ static ssize_t group_engines_show(struct device *dev, continue; if (engine->group->id == group->id) - rc += sprintf(tmp + rc, "engine%d.%d ", - idxd->id, engine->id); + rc += sysfs_emit_at(buf, rc, "engine%d.%d ", idxd->id, engine->id); } + if (!rc) + return 0; rc--; - rc += sprintf(tmp + rc, "\n"); + rc += sysfs_emit_at(buf, rc, "\n"); return rc; } @@ -612,7 +612,6 @@ static ssize_t group_work_queues_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); int i, rc = 0; - char *tmp = buf; struct idxd_device *idxd = group->idxd; for (i = 0; i < idxd->max_wqs; i++) { @@ -622,12 +621,13 @@ static ssize_t group_work_queues_show(struct device *dev, continue; if (wq->group->id == group->id) - rc += sprintf(tmp + rc, "wq%d.%d ", - idxd->id, wq->id); + rc += sysfs_emit_at(buf, rc, "wq%d.%d ", idxd->id, wq->id); } + if (!rc) + return 0; rc--; - rc += sprintf(tmp + rc, "\n"); + rc += sysfs_emit_at(buf, rc, "\n"); return rc; } @@ -642,7 +642,7 @@ static ssize_t group_traffic_class_a_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%d\n", group->tc_a); + return sysfs_emit(buf, "%d\n", group->tc_a); } static ssize_t group_traffic_class_a_store(struct device *dev, @@ -683,7 +683,7 @@ static ssize_t group_traffic_class_b_show(struct device *dev, struct idxd_group *group = container_of(dev, struct idxd_group, conf_dev); - return sprintf(buf, "%d\n", group->tc_b); + return sysfs_emit(buf, "%d\n", group->tc_b); } static ssize_t group_traffic_class_b_store(struct device *dev, @@ -756,7 +756,7 @@ static ssize_t wq_clients_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%d\n", wq->client_count); + return sysfs_emit(buf, "%d\n", wq->client_count); } static struct device_attribute dev_attr_wq_clients = @@ -769,12 +769,12 @@ static ssize_t wq_state_show(struct device *dev, switch (wq->state) { case IDXD_WQ_DISABLED: - return sprintf(buf, "disabled\n"); + return sysfs_emit(buf, "disabled\n"); case IDXD_WQ_ENABLED: - return sprintf(buf, "enabled\n"); + return sysfs_emit(buf, "enabled\n"); } - return sprintf(buf, "unknown\n"); + return sysfs_emit(buf, "unknown\n"); } static struct device_attribute dev_attr_wq_state = @@ -786,9 +786,9 @@ static ssize_t wq_group_id_show(struct device *dev, struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); if (wq->group) - return sprintf(buf, "%u\n", wq->group->id); + return sysfs_emit(buf, "%u\n", wq->group->id); else - return sprintf(buf, "-1\n"); + return sysfs_emit(buf, "-1\n"); } static ssize_t wq_group_id_store(struct device *dev, @@ -840,8 +840,7 @@ static ssize_t wq_mode_show(struct device *dev, struct device_attribute *attr, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%s\n", - wq_dedicated(wq) ? "dedicated" : "shared"); + return sysfs_emit(buf, "%s\n", wq_dedicated(wq) ? "dedicated" : "shared"); } static ssize_t wq_mode_store(struct device *dev, @@ -877,7 +876,7 @@ static ssize_t wq_size_show(struct device *dev, struct device_attribute *attr, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->size); + return sysfs_emit(buf, "%u\n", wq->size); } static int total_claimed_wq_size(struct idxd_device *idxd) @@ -928,7 +927,7 @@ static ssize_t wq_priority_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->priority); + return sysfs_emit(buf, "%u\n", wq->priority); } static ssize_t wq_priority_store(struct device *dev, @@ -965,8 +964,7 @@ static ssize_t wq_block_on_fault_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", - test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)); + return sysfs_emit(buf, "%u\n", test_bit(WQ_FLAG_BLOCK_ON_FAULT, &wq->flags)); } static ssize_t wq_block_on_fault_store(struct device *dev, @@ -1005,7 +1003,7 @@ static ssize_t wq_threshold_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->threshold); + return sysfs_emit(buf, "%u\n", wq->threshold); } static ssize_t wq_threshold_store(struct device *dev, @@ -1048,15 +1046,12 @@ static ssize_t wq_type_show(struct device *dev, switch (wq->type) { case IDXD_WQT_KERNEL: - return sprintf(buf, "%s\n", - idxd_wq_type_names[IDXD_WQT_KERNEL]); + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_KERNEL]); case IDXD_WQT_USER: - return sprintf(buf, "%s\n", - idxd_wq_type_names[IDXD_WQT_USER]); + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_USER]); case IDXD_WQT_NONE: default: - return sprintf(buf, "%s\n", - idxd_wq_type_names[IDXD_WQT_NONE]); + return sysfs_emit(buf, "%s\n", idxd_wq_type_names[IDXD_WQT_NONE]); } return -EINVAL; @@ -1097,7 +1092,7 @@ static ssize_t wq_name_show(struct device *dev, { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%s\n", wq->name); + return sysfs_emit(buf, "%s\n", wq->name); } static ssize_t wq_name_store(struct device *dev, @@ -1167,7 +1162,7 @@ static ssize_t wq_max_transfer_size_show(struct device *dev, struct device_attri { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%llu\n", wq->max_xfer_bytes); + return sysfs_emit(buf, "%llu\n", wq->max_xfer_bytes); } static ssize_t wq_max_transfer_size_store(struct device *dev, struct device_attribute *attr, @@ -1201,7 +1196,7 @@ static ssize_t wq_max_batch_size_show(struct device *dev, struct device_attribut { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->max_batch_size); + return sysfs_emit(buf, "%u\n", wq->max_batch_size); } static ssize_t wq_max_batch_size_store(struct device *dev, struct device_attribute *attr, @@ -1234,7 +1229,7 @@ static ssize_t wq_ats_disable_show(struct device *dev, struct device_attribute * { struct idxd_wq *wq = container_of(dev, struct idxd_wq, conf_dev); - return sprintf(buf, "%u\n", wq->ats_dis); + return sysfs_emit(buf, "%u\n", wq->ats_dis); } static ssize_t wq_ats_disable_store(struct device *dev, struct device_attribute *attr, @@ -1311,7 +1306,7 @@ static ssize_t version_show(struct device *dev, struct device_attribute *attr, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%#x\n", idxd->hw.version); + return sysfs_emit(buf, "%#x\n", idxd->hw.version); } static DEVICE_ATTR_RO(version); @@ -1322,7 +1317,7 @@ static ssize_t max_work_queues_size_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_wq_size); + return sysfs_emit(buf, "%u\n", idxd->max_wq_size); } static DEVICE_ATTR_RO(max_work_queues_size); @@ -1332,7 +1327,7 @@ static ssize_t max_groups_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_groups); + return sysfs_emit(buf, "%u\n", idxd->max_groups); } static DEVICE_ATTR_RO(max_groups); @@ -1342,7 +1337,7 @@ static ssize_t max_work_queues_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_wqs); + return sysfs_emit(buf, "%u\n", idxd->max_wqs); } static DEVICE_ATTR_RO(max_work_queues); @@ -1352,7 +1347,7 @@ static ssize_t max_engines_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_engines); + return sysfs_emit(buf, "%u\n", idxd->max_engines); } static DEVICE_ATTR_RO(max_engines); @@ -1362,7 +1357,7 @@ static ssize_t numa_node_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%d\n", dev_to_node(&idxd->pdev->dev)); + return sysfs_emit(buf, "%d\n", dev_to_node(&idxd->pdev->dev)); } static DEVICE_ATTR_RO(numa_node); @@ -1372,7 +1367,7 @@ static ssize_t max_batch_size_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_batch_size); + return sysfs_emit(buf, "%u\n", idxd->max_batch_size); } static DEVICE_ATTR_RO(max_batch_size); @@ -1383,7 +1378,7 @@ static ssize_t max_transfer_size_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%llu\n", idxd->max_xfer_bytes); + return sysfs_emit(buf, "%llu\n", idxd->max_xfer_bytes); } static DEVICE_ATTR_RO(max_transfer_size); @@ -1409,7 +1404,7 @@ static ssize_t gen_cap_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%#llx\n", idxd->hw.gen_cap.bits); + return sysfs_emit(buf, "%#llx\n", idxd->hw.gen_cap.bits); } static DEVICE_ATTR_RO(gen_cap); @@ -1419,8 +1414,7 @@ static ssize_t configurable_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", - test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)); + return sysfs_emit(buf, "%u\n", test_bit(IDXD_FLAG_CONFIGURABLE, &idxd->flags)); } static DEVICE_ATTR_RO(configurable); @@ -1440,7 +1434,7 @@ static ssize_t clients_show(struct device *dev, } spin_unlock_irqrestore(&idxd->dev_lock, flags); - return sprintf(buf, "%d\n", count); + return sysfs_emit(buf, "%d\n", count); } static DEVICE_ATTR_RO(clients); @@ -1450,7 +1444,7 @@ static ssize_t pasid_enabled_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", device_pasid_enabled(idxd)); + return sysfs_emit(buf, "%u\n", device_pasid_enabled(idxd)); } static DEVICE_ATTR_RO(pasid_enabled); @@ -1463,14 +1457,14 @@ static ssize_t state_show(struct device *dev, switch (idxd->state) { case IDXD_DEV_DISABLED: case IDXD_DEV_CONF_READY: - return sprintf(buf, "disabled\n"); + return sysfs_emit(buf, "disabled\n"); case IDXD_DEV_ENABLED: - return sprintf(buf, "enabled\n"); + return sysfs_emit(buf, "enabled\n"); case IDXD_DEV_HALTED: - return sprintf(buf, "halted\n"); + return sysfs_emit(buf, "halted\n"); } - return sprintf(buf, "unknown\n"); + return sysfs_emit(buf, "unknown\n"); } static DEVICE_ATTR_RO(state); @@ -1484,10 +1478,10 @@ static ssize_t errors_show(struct device *dev, spin_lock_irqsave(&idxd->dev_lock, flags); for (i = 0; i < 4; i++) - out += sprintf(buf + out, "%#018llx ", idxd->sw_err.bits[i]); + out += sysfs_emit_at(buf, out, "%#018llx ", idxd->sw_err.bits[i]); spin_unlock_irqrestore(&idxd->dev_lock, flags); out--; - out += sprintf(buf + out, "\n"); + out += sysfs_emit_at(buf, out, "\n"); return out; } static DEVICE_ATTR_RO(errors); @@ -1498,7 +1492,7 @@ static ssize_t max_tokens_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->max_tokens); + return sysfs_emit(buf, "%u\n", idxd->max_tokens); } static DEVICE_ATTR_RO(max_tokens); @@ -1508,7 +1502,7 @@ static ssize_t token_limit_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->token_limit); + return sysfs_emit(buf, "%u\n", idxd->token_limit); } static ssize_t token_limit_store(struct device *dev, @@ -1547,7 +1541,7 @@ static ssize_t cdev_major_show(struct device *dev, struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%u\n", idxd->major); + return sysfs_emit(buf, "%u\n", idxd->major); } static DEVICE_ATTR_RO(cdev_major); @@ -1556,7 +1550,7 @@ static ssize_t cmd_status_show(struct device *dev, { struct idxd_device *idxd = container_of(dev, struct idxd_device, conf_dev); - return sprintf(buf, "%#x\n", idxd->cmd_status); + return sysfs_emit(buf, "%#x\n", idxd->cmd_status); } static DEVICE_ATTR_RO(cmd_status); From cf5f86a7d47df149857ba2fb72f9c6c9da46af2e Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:46 -0700 Subject: [PATCH 0604/1379] dmaengine: idxd: enable SVA feature for IOMMU Enable IOMMU_DEV_FEAT_SVA before attempt to bind pasid. This is needed according to iommu_sva_bind_device() comment. Currently Intel IOMMU code does this before bind call. It really needs to be controlled by the driver. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894440621.3202472.17644507396206848134.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index ef58750c24cc..eb0b3a00a2d7 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -501,11 +501,18 @@ static int idxd_probe(struct idxd_device *idxd) dev_dbg(dev, "IDXD reset complete\n"); if (IS_ENABLED(CONFIG_INTEL_IDXD_SVM) && sva) { - rc = idxd_enable_system_pasid(idxd); - if (rc < 0) - dev_warn(dev, "Failed to enable PASID. No SVA support: %d\n", rc); - else - set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); + rc = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA); + if (rc == 0) { + rc = idxd_enable_system_pasid(idxd); + if (rc < 0) { + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); + dev_warn(dev, "Failed to enable PASID. No SVA support: %d\n", rc); + } else { + set_bit(IDXD_FLAG_PASID_ENABLED, &idxd->flags); + } + } else { + dev_warn(dev, "Unable to turn on SVA feature.\n"); + } } else if (!sva) { dev_warn(dev, "User forced SVA off via module param.\n"); } @@ -539,6 +546,7 @@ static int idxd_probe(struct idxd_device *idxd) err: if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); + iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA); return rc; } @@ -699,6 +707,7 @@ static void idxd_remove(struct pci_dev *pdev) if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); idxd_unregister_devices(idxd); + iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); } static struct pci_driver idxd_pci_driver = { From 5b0c68c473a131c2acb21abad44b0047b200e185 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 11:46:51 -0700 Subject: [PATCH 0605/1379] dmaengine: idxd: support reporting of halt interrupt Unmask the halt error interrupt so it gets reported to the interrupt handler. When halt state interrupt is received, quiesce the kernel WQs and unmap the portals to stop submission. Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894441167.3202472.9485946398140619501.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 15 +++++++++++++++ drivers/dma/idxd/idxd.h | 2 ++ drivers/dma/idxd/init.c | 12 ++++++++++++ drivers/dma/idxd/irq.c | 2 ++ drivers/dma/idxd/registers.h | 3 ++- 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 54d5afec81cf..3934e660d951 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -47,6 +47,7 @@ void idxd_unmask_error_interrupts(struct idxd_device *idxd) genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); genctrl.softerr_int_en = 1; + genctrl.halt_int_en = 1; iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); } @@ -56,6 +57,7 @@ void idxd_mask_error_interrupts(struct idxd_device *idxd) genctrl.bits = ioread32(idxd->reg_base + IDXD_GENCTRL_OFFSET); genctrl.softerr_int_en = 0; + genctrl.halt_int_en = 0; iowrite32(genctrl.bits, idxd->reg_base + IDXD_GENCTRL_OFFSET); } @@ -312,6 +314,19 @@ void idxd_wq_unmap_portal(struct idxd_wq *wq) struct device *dev = &wq->idxd->pdev->dev; devm_iounmap(dev, wq->portal); + wq->portal = NULL; +} + +void idxd_wqs_unmap_portal(struct idxd_device *idxd) +{ + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + struct idxd_wq *wq = idxd->wqs[i]; + + if (wq->portal) + idxd_wq_unmap_portal(wq); + } } int idxd_wq_set_pasid(struct idxd_wq *wq, int pasid) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index c1d4a1976206..d7185c6bfade 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -371,6 +371,7 @@ int idxd_register_devices(struct idxd_device *idxd); void idxd_unregister_devices(struct idxd_device *idxd); int idxd_register_driver(void); void idxd_unregister_driver(void); +void idxd_wqs_quiesce(struct idxd_device *idxd); /* device interrupt control */ void idxd_msix_perm_setup(struct idxd_device *idxd); @@ -400,6 +401,7 @@ int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, enum idxd_interrupt_type irq_type); /* work queue control */ +void idxd_wqs_unmap_portal(struct idxd_device *idxd); int idxd_wq_alloc_resources(struct idxd_wq *wq); void idxd_wq_free_resources(struct idxd_wq *wq); int idxd_wq_enable(struct idxd_wq *wq); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index eb0b3a00a2d7..e6bfd55e421b 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -647,6 +647,18 @@ static void idxd_flush_work_list(struct idxd_irq_entry *ie) } } +void idxd_wqs_quiesce(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + if (wq->state == IDXD_WQ_ENABLED && wq->type == IDXD_WQT_KERNEL) + idxd_wq_quiesce(wq); + } +} + static void idxd_release_int_handles(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index fc0781e3f36d..43eea5c9cbd4 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -202,6 +202,8 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) queue_work(idxd->wq, &idxd->work); } else { spin_lock_bh(&idxd->dev_lock); + idxd_wqs_quiesce(idxd); + idxd_wqs_unmap_portal(idxd); idxd_device_wqs_clear_state(idxd); dev_err(&idxd->pdev->dev, "idxd halted, need %s.\n", diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 5cbf368c7367..6c11375cc56a 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -120,7 +120,8 @@ union gencfg_reg { union genctrl_reg { struct { u32 softerr_int_en:1; - u32 rsvd:31; + u32 halt_int_en:1; + u32 rsvd:30; }; u32 bits; } __packed; From 53b2ee7f637c4f1fa2f50dbdb210088e30c11d2b Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 12:00:56 -0700 Subject: [PATCH 0606/1379] dmaengine: idxd: device cmd should use dedicated lock Create a dedicated lock for device command operations. Put the device command operation under finer grained locking instead of using the idxd->dev_lock. Suggested-by: Sanjay Kumar Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/161894525685.3210132.16160045731436382560.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/device.c | 18 +++++++++--------- drivers/dma/idxd/idxd.h | 1 + drivers/dma/idxd/init.c | 1 + 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/device.c b/drivers/dma/idxd/device.c index 3934e660d951..420b93fe5feb 100644 --- a/drivers/dma/idxd/device.c +++ b/drivers/dma/idxd/device.c @@ -465,13 +465,13 @@ int idxd_device_init_reset(struct idxd_device *idxd) memset(&cmd, 0, sizeof(cmd)); cmd.cmd = IDXD_CMD_RESET_DEVICE; dev_dbg(dev, "%s: sending reset for init.\n", __func__); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock_irqsave(&idxd->cmd_lock, flags); iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) cpu_relax(); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock_irqrestore(&idxd->cmd_lock, flags); return 0; } @@ -494,10 +494,10 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, cmd.operand = operand; cmd.int_req = 1; - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock_irqsave(&idxd->cmd_lock, flags); wait_event_lock_irq(idxd->cmd_waitq, !test_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags), - idxd->dev_lock); + idxd->cmd_lock); dev_dbg(&idxd->pdev->dev, "%s: sending cmd: %#x op: %#x\n", __func__, cmd_code, operand); @@ -511,9 +511,9 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, * After command submitted, release lock and go to sleep until * the command completes via interrupt. */ - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock_irqrestore(&idxd->cmd_lock, flags); wait_for_completion(&done); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock_irqsave(&idxd->cmd_lock, flags); if (status) { *status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); idxd->cmd_status = *status & GENMASK(7, 0); @@ -522,7 +522,7 @@ static void idxd_cmd_exec(struct idxd_device *idxd, int cmd_code, u32 operand, __clear_bit(IDXD_FLAG_CMD_RUNNING, &idxd->flags); /* Wake up other pending commands */ wake_up(&idxd->cmd_waitq); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock_irqrestore(&idxd->cmd_lock, flags); } int idxd_device_enable(struct idxd_device *idxd) @@ -667,13 +667,13 @@ int idxd_device_release_int_handle(struct idxd_device *idxd, int handle, dev_dbg(dev, "cmd: %u operand: %#x\n", IDXD_CMD_RELEASE_INT_HANDLE, operand); - spin_lock_irqsave(&idxd->dev_lock, flags); + spin_lock_irqsave(&idxd->cmd_lock, flags); iowrite32(cmd.bits, idxd->reg_base + IDXD_CMD_OFFSET); while (ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET) & IDXD_CMDSTS_ACTIVE) cpu_relax(); status = ioread32(idxd->reg_base + IDXD_CMDSTS_OFFSET); - spin_unlock_irqrestore(&idxd->dev_lock, flags); + spin_unlock_irqrestore(&idxd->cmd_lock, flags); if ((status & IDXD_CMDSTS_ERR_MASK) != IDXD_CMDSTS_SUCCESS) { dev_dbg(dev, "release int handle failed: %#x\n", status); diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index d7185c6bfade..87330b6940fa 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -204,6 +204,7 @@ struct idxd_device { void __iomem *reg_base; spinlock_t dev_lock; /* spinlock for device */ + spinlock_t cmd_lock; /* spinlock for device commands */ struct completion *cmd_done; struct idxd_group **groups; struct idxd_wq **wqs; diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index e6bfd55e421b..64dab4de217e 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -449,6 +449,7 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d } spin_lock_init(&idxd->dev_lock); + spin_lock_init(&idxd->cmd_lock); return idxd; } From a16104617d212d4b482568847b25172972b87e60 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 20 Apr 2021 12:00:34 -0700 Subject: [PATCH 0607/1379] dmaengine: idxd: remove MSIX masking for interrupt handlers Remove interrupt masking and just let the hard irq handler keep firing for new events. This is less of a performance impact vs the MMIO readback inside the pci_msi_{mask,unmas}_irq(). Especially with a loaded system those flushes can be stuck behind large amounts of MMIO writes to flush. When guest kernel is running on top of VFIO mdev, mask/unmask causes a vmexit each time and is not desirable. Suggested-by: Dan Williams Signed-off-by: Dave Jiang Reviewed-by: Dan Williams Link: https://lore.kernel.org/r/161894523436.3210025.1834640110556139277.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Vinod Koul --- drivers/dma/idxd/idxd.h | 1 - drivers/dma/idxd/init.c | 4 ++-- drivers/dma/idxd/irq.c | 12 ------------ 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 87330b6940fa..97c96ca6ab70 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -377,7 +377,6 @@ void idxd_wqs_quiesce(struct idxd_device *idxd); /* device interrupt control */ void idxd_msix_perm_setup(struct idxd_device *idxd); void idxd_msix_perm_clear(struct idxd_device *idxd); -irqreturn_t idxd_irq_handler(int vec, void *data); irqreturn_t idxd_misc_thread(int vec, void *data); irqreturn_t idxd_wq_thread(int irq, void *data); void idxd_mask_error_interrupts(struct idxd_device *idxd); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 64dab4de217e..8003f8a25fff 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -102,7 +102,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) } irq_entry = &idxd->irq_entries[0]; - rc = request_threaded_irq(irq_entry->vector, idxd_irq_handler, idxd_misc_thread, + rc = request_threaded_irq(irq_entry->vector, NULL, idxd_misc_thread, 0, "idxd-misc", irq_entry); if (rc < 0) { dev_err(dev, "Failed to allocate misc interrupt.\n"); @@ -119,7 +119,7 @@ static int idxd_setup_interrupts(struct idxd_device *idxd) init_llist_head(&idxd->irq_entries[i].pending_llist); INIT_LIST_HEAD(&idxd->irq_entries[i].work_list); - rc = request_threaded_irq(irq_entry->vector, idxd_irq_handler, + rc = request_threaded_irq(irq_entry->vector, NULL, idxd_wq_thread, 0, "idxd-portal", irq_entry); if (rc < 0) { dev_err(dev, "Failed to allocate irq %d.\n", irq_entry->vector); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index 43eea5c9cbd4..afee571e0194 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -102,15 +102,6 @@ static int idxd_device_schedule_fault_process(struct idxd_device *idxd, return 0; } -irqreturn_t idxd_irq_handler(int vec, void *data) -{ - struct idxd_irq_entry *irq_entry = data; - struct idxd_device *idxd = irq_entry->idxd; - - idxd_mask_msix_vector(idxd, irq_entry->id); - return IRQ_WAKE_THREAD; -} - static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) { struct device *dev = &idxd->pdev->dev; @@ -237,7 +228,6 @@ irqreturn_t idxd_misc_thread(int vec, void *data) iowrite32(cause, idxd->reg_base + IDXD_INTCAUSE_OFFSET); } - idxd_unmask_msix_vector(idxd, irq_entry->id); return IRQ_HANDLED; } @@ -394,8 +384,6 @@ irqreturn_t idxd_wq_thread(int irq, void *data) int processed; processed = idxd_desc_process(irq_entry); - idxd_unmask_msix_vector(irq_entry->idxd, irq_entry->id); - if (processed == 0) return IRQ_NONE; From 0d95f41ebde40d552bb4fea64b1d618607915fd6 Mon Sep 17 00:00:00 2001 From: Jae Hyun Yoo Date: Thu, 8 Apr 2021 10:28:03 -0700 Subject: [PATCH 0608/1379] Revert "i3c master: fix missing destroy_workqueue() on error in i3c_master_register" Adding the destroy_workqueue call in i3c_master_register introduced below kernel warning because it makes duplicate destroy_workqueue calls when i3c_master_register fails after allocating the workqueue. The workqueue will be destroyed by i3c_masterdev_release which is called by put_device at the end of the i3c_master_register function eventually in failure cases so the workqueue doesn't need to be destroyed in i3c_master_register. [ 6.972952] WARNING: CPU: 1 PID: 1 at lib/list_debug.c:48 __list_del_entry_valid+0x9c/0xf4 [ 6.982205] list_del corruption, 8fe03c08->prev is LIST_POISON2 (00000122) [ 6.989910] CPU: 1 PID: 1 Comm: swapper/0 Tainted: G W 5.10.23-c12838a-dirty-31dc772 #1 [ 7.000295] Hardware name: Generic DT based system [ 7.005638] Backtrace: [ 7.008369] [<809133f0>] (dump_backtrace) from [<80913644>] (show_stack+0x20/0x24) [ 7.016819] r7:00000030 r6:60000013 r5:00000000 r4:813b5d40 [ 7.023137] [<80913624>] (show_stack) from [<8091e1a0>] (dump_stack+0x9c/0xb0) [ 7.031201] [<8091e104>] (dump_stack) from [<8011fa30>] (__warn+0xf8/0x154) [ 7.038972] r7:00000030 r6:00000009 r5:804fa1c8 r4:80b6eca4 [ 7.045289] [<8011f938>] (__warn) from [<80913d14>] (warn_slowpath_fmt+0x8c/0xc0) [ 7.053641] r7:00000030 r6:80b6eca4 r5:80b6ed74 r4:818cc000 [ 7.059960] [<80913c8c>] (warn_slowpath_fmt) from [<804fa1c8>] (__list_del_entry_valid+0x9c/0xf4) [ 7.069866] r9:96becf8c r8:818cc000 r7:8fe03c10 r6:8fe03c00 r5:8fe03ba0 r4:ff7ead4c [ 7.078513] [<804fa12c>] (__list_del_entry_valid) from [<8013f0b4>] (destroy_workqueue+0x1c4/0x23c) [ 7.088615] [<8013eef0>] (destroy_workqueue) from [<806aa124>] (i3c_masterdev_release+0x40/0xb0) [ 7.098421] r7:00000000 r6:81a43b80 r5:8fe65360 r4:8fe65048 [ 7.104740] [<806aa0e4>] (i3c_masterdev_release) from [<805f3f04>] (device_release+0x40/0xb0) [ 7.114254] r5:00000000 r4:8fe65048 [ 7.118245] [<805f3ec4>] (device_release) from [<808fe754>] (kobject_put+0xc8/0x204) [ 7.126885] r5:813978dc r4:8fe65048 [ 7.130877] [<808fe68c>] (kobject_put) from [<805f5fbc>] (put_device+0x20/0x24) [ 7.139037] r7:8fe65358 r6:8fe65368 r5:8fe65358 r4:8fe65048 [ 7.145355] [<805f5f9c>] (put_device) from [<806abac4>] (i3c_master_register+0x338/0xb00) [ 7.154487] [<806ab78c>] (i3c_master_register) from [<806ae084>] (dw_i3c_probe+0x224/0x24c) [ 7.163811] r10:00000000 r9:8fe7a100 r8:00000032 r7:819fa810 r6:819fa800 r5:8fe65040 [ 7.172547] r4:00000000 [ 7.175376] [<806ade60>] (dw_i3c_probe) from [<805fdc14>] (platform_drv_probe+0x44/0x80) [ 7.184409] r9:813a25c0 r8:00000000 r7:815ec114 r6:00000000 r5:813a25c0 r4:819fa810 [ 7.193053] [<805fdbd0>] (platform_drv_probe) from [<805fb83c>] (really_probe+0x108/0x50c) [ 7.202275] r5:815ec004 r4:819fa810 [ 7.206265] [<805fb734>] (really_probe) from [<805fc180>] (driver_probe_device+0xb4/0x190) [ 7.215492] r10:813dc000 r9:80c4385c r8:000000d9 r7:813a25c0 r6:819fa810 r5:00000000 [ 7.224228] r4:813a25c0 [ 7.227055] [<805fc0cc>] (driver_probe_device) from [<805fc5cc>] (device_driver_attach+0xb8/0xc0) [ 7.236959] r9:80c4385c r8:000000d9 r7:813a25c0 r6:819fa854 r4:819fa810 [ 7.244439] [<805fc514>] (device_driver_attach) from [<805fc65c>] (__driver_attach+0x88/0x16c) [ 7.254051] r7:00000000 r6:819fa810 r5:00000000 r4:813a25c0 [ 7.260369] [<805fc5d4>] (__driver_attach) from [<805f954c>] (bus_for_each_dev+0x88/0xc8) [ 7.269489] r7:00000000 r6:818cc000 r5:805fc5d4 r4:813a25c0 [ 7.275806] [<805f94c4>] (bus_for_each_dev) from [<805fc76c>] (driver_attach+0x2c/0x30) [ 7.284739] r7:81397c98 r6:00000000 r5:8fe7db80 r4:813a25c0 [ 7.291057] [<805fc740>] (driver_attach) from [<805f9eec>] (bus_add_driver+0x120/0x200) [ 7.299984] [<805f9dcc>] (bus_add_driver) from [<805fce44>] (driver_register+0x98/0x128) [ 7.309005] r7:80c4383c r6:00000000 r5:00000000 r4:813a25c0 [ 7.315323] [<805fcdac>] (driver_register) from [<805fedb4>] (__platform_driver_register+0x50/0x58) [ 7.325410] r5:818cc000 r4:81397c98 [ 7.329404] [<805fed64>] (__platform_driver_register) from [<80c23398>] (dw_i3c_driver_init+0x24/0x28) [ 7.339790] r5:818cc000 r4:80c23374 [ 7.343784] [<80c23374>] (dw_i3c_driver_init) from [<80c01300>] (do_one_initcall+0xac/0x1d0) [ 7.353206] [<80c01254>] (do_one_initcall) from [<80c01630>] (kernel_init_freeable+0x1a8/0x204) [ 7.362916] r8:000000d9 r7:80c4383c r6:00000007 r5:819ca2c0 r4:80c67680 [ 7.370398] [<80c01488>] (kernel_init_freeable) from [<8091eb18>] (kernel_init+0x18/0x12c) [ 7.379616] r10:00000000 r9:00000000 r8:00000000 r7:00000000 r6:00000000 r5:8091eb00 [ 7.388343] r4:00000000 [ 7.391170] [<8091eb00>] (kernel_init) from [<80100148>] (ret_from_fork+0x14/0x2c) [ 7.399607] Exception stack(0x818cdfb0 to 0x818cdff8) [ 7.405243] dfa0: 00000000 00000000 00000000 00000000 [ 7.414371] dfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 [ 7.423499] dfe0: 00000000 00000000 00000000 00000000 00000013 00000000 [ 7.430879] r5:8091eb00 r4:00000000 This reverts commit 59165d16c699182b86b5c65181013f1fd88feb62. Fixes: 59165d16c699 ("i3c master: fix missing destroy_workqueue() on error in i3c_master_register") Signed-off-by: Jae Hyun Yoo Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210408172803.24599-1-jae.hyun.yoo@linux.intel.com --- drivers/i3c/master.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/i3c/master.c b/drivers/i3c/master.c index f8e9b7305c13..e2e12a5585e5 100644 --- a/drivers/i3c/master.c +++ b/drivers/i3c/master.c @@ -2535,7 +2535,7 @@ int i3c_master_register(struct i3c_master_controller *master, ret = i3c_master_bus_init(master); if (ret) - goto err_destroy_wq; + goto err_put_dev; ret = device_add(&master->dev); if (ret) @@ -2566,9 +2566,6 @@ err_del_dev: err_cleanup_bus: i3c_master_bus_cleanup(master); -err_destroy_wq: - destroy_workqueue(master->wq); - err_put_dev: put_device(&master->dev); From 2e22d48dca0bc5b7fccca8d7b6caed80a9d07465 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 23 Apr 2021 14:09:38 +0800 Subject: [PATCH 0609/1379] f2fs: clean up left deprecated IO trace codes Commit d5f7bc0064e0 ("f2fs: deprecate f2fs_trace_io") left some dead codes, delete them. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 6 ------ fs/f2fs/f2fs.h | 8 -------- 2 files changed, 14 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 3c9d797dbdd6..6e46a00c1930 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -76,12 +76,6 @@ bool f2fs_is_compressed_page(struct page *page) return false; if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) return false; - /* - * page->private may be set with pid. - * pid_max is enough to check if it is traced. - */ - if (IS_IO_TRACED_PAGE(page)) - return false; f2fs_bug_on(F2FS_M_SB(page->mapping), *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3ebb951cc426..b9d5317db0a7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1303,14 +1303,6 @@ enum { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == DUMMY_WRITTEN_PAGE) -#ifdef CONFIG_F2FS_IO_TRACE -#define IS_IO_TRACED_PAGE(page) \ - (page_private(page) > 0 && \ - page_private(page) < (unsigned long)PID_MAX_LIMIT) -#else -#define IS_IO_TRACED_PAGE(page) (0) -#endif - /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, From 81dd4d4d6178306ab31db91bdc7353d485bdafce Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 24 Apr 2021 10:04:15 -0500 Subject: [PATCH 0610/1379] dmaengine: idxd: Add IDXD performance monitor support Implement the IDXD performance monitor capability (named 'perfmon' in the DSA (Data Streaming Accelerator) spec [1]), which supports the collection of information about key events occurring during DSA and IAX (Intel Analytics Accelerator) device execution, to assist in performance tuning and debugging. The idxd perfmon support is implemented as part of the IDXD driver and interfaces with the Linux perf framework. It has several features in common with the existing uncore pmu support: - it does not support sampling - does not support per-thread counting However it also has some unique features not present in the core and uncore support: - all general-purpose counters are identical, thus no event constraints - operation is always system-wide While the core perf subsystem assumes that all counters are by default per-cpu, the uncore pmus are socket-scoped and use a cpu mask to restrict counting to one cpu from each socket. IDXD counters use a similar strategy but expand the scope even further; since IDXD counters are system-wide and can be read from any cpu, the IDXD perf driver picks a single cpu to do the work (with cpu hotplug notifiers to choose a different cpu if the chosen one is taken off-line). More specifically, the perf userspace tool by default opens a counter for each cpu for an event. However, if it finds a cpumask file associated with the pmu under sysfs, as is the case with the uncore pmus, it will open counters only on the cpus specified by the cpumask. Since perfmon only needs to open a single counter per event for a given IDXD device, the perfmon driver will create a sysfs cpumask file for the device and insert the first cpu of the system into it. When a user uses perf to open an event, perf will open a single counter on the cpu specified by the cpu mask. This amounts to the default system-wide rather than per-cpu counting mentioned previously for perfmon pmu events. In order to keep the cpu mask up-to-date, the driver implements cpu hotplug support for multiple devices, as IDXD usually enumerates and registers more than one idxd device. The perfmon driver implements basic perfmon hardware capability discovery and configuration, and is initialized by the IDXD driver's probe function. During initialization, the driver retrieves the total number of supported performance counters, the pmu ID, and the device type from idxd device, and registers itself under the Linux perf framework. The perf userspace tool can be used to monitor single or multiple events depending on the given configuration, as well as event groups, which are also supported by the perfmon driver. The user configures events using the perf tool command-line interface by specifying the event and corresponding event category, along with an optional set of filters that can be used to restrict counting to specific work queues, traffic classes, page and transfer sizes, and engines (See [1] for specifics). With the configuration specified by the user, the perf tool issues a system call passing that information to the kernel, which uses it to initialize the specified event(s). The event(s) are opened and started, and following termination of the perf command, they're stopped. At that point, the perfmon driver will read the latest count for the event(s), calculate the difference between the latest counter values and previously tracked counter values, and display the final incremental count as the event count for the cycle. An overflow handler registered on the IDXD irq path is used to account for counter overflows, which are signaled by an overflow interrupt. Below are a couple of examples of perf usage for monitoring DSA events. The following monitors all events in the 'engine' category. Becuuse no filters are specified, this captures all engine events for the workload, which in this case is 19 iterations of the work generated by the kernel dmatest module. Details describing the events can be found in Appendix D of [1], Performance Monitoring Events, but briefly they are: event 0x1: total input data processed, in 32-byte units event 0x2: total data written, in 32-byte units event 0x4: number of work descriptors that read the source event 0x8: number of work descriptors that write the destination event 0x10: number of work descriptors dispatched from batch descriptors event 0x20: number of work descriptors dispatched from work queues # perf stat -e dsa0/event=0x1,event_category=0x1/, dsa0/event=0x2,event_category=0x1/, dsa0/event=0x4,event_category=0x1/, dsa0/event=0x8,event_category=0x1/, dsa0/event=0x10,event_category=0x1/, dsa0/event=0x20,event_category=0x1/ modprobe dmatest channel=dma0chan0 timeout=2000 iterations=19 run=1 wait=1 Performance counter stats for 'system wide': 5,332 dsa0/event=0x1,event_category=0x1/ 5,327 dsa0/event=0x2,event_category=0x1/ 19 dsa0/event=0x4,event_category=0x1/ 19 dsa0/event=0x8,event_category=0x1/ 0 dsa0/event=0x10,event_category=0x1/ 19 dsa0/event=0x20,event_category=0x1/ 21.977436186 seconds time elapsed The command below illustrates filter usage with a simple example. It specifies that MEM_MOVE operations should be counted for the DSA device dsa0 (event 0x8 corresponds to the EV_MEM_MOVE event - Number of Memory Move Descriptors, which is part of event category 0x3 - Operations. The detailed category and event IDs are available in Appendix D, Performance Monitoring Events, of [1]). In addition to the event and event category, a number of filters are also specified (the detailed filter values are available in Chapter 6.4 (Filter Support) of [1]), which will restrict counting to only those events that meet all of the filter criteria. In this case, the filters specify that only MEM_MOVE operations that are serviced by work queue wq0 and specifically engine number engine0 and traffic class tc0 having sizes between 0 and 4k and page size of between 0 and 1G result in a counter hit; anything else will be filtered out and not appear in the final count. Note that filters are optional - any filter not specified is assumed to be all ones and will pass anything. # perf stat -e dsa0/filter_wq=0x1,filter_tc=0x1,filter_sz=0x7, filter_eng=0x1,event=0x8,event_category=0x3/ modprobe dmatest channel=dma0chan0 timeout=2000 iterations=19 run=1 wait=1 Performance counter stats for 'system wide': 19 dsa0/filter_wq=0x1,filter_tc=0x1,filter_sz=0x7, filter_eng=0x1,event=0x8,event_category=0x3/ 21.865914091 seconds time elapsed The output above reflects that the unspecified workload resulted in the counting of 19 MEM_MOVE operation events that met the filter criteria. [1]: https://software.intel.com/content/www/us/en/develop/download/intel-data-streaming-accelerator-preliminary-architecture-specification.html [ Based on work originally by Jing Lin. ] Reviewed-by: Dave Jiang Reviewed-by: Kan Liang Signed-off-by: Tom Zanussi Link: https://lore.kernel.org/r/0c5080a7d541904c4ad42b848c76a1ce056ddac7.1619276133.git.zanussi@kernel.org Signed-off-by: Vinod Koul --- .../sysfs-bus-event_source-devices-dsa | 30 + drivers/dma/Kconfig | 12 + drivers/dma/idxd/Makefile | 2 + drivers/dma/idxd/idxd.h | 45 ++ drivers/dma/idxd/perfmon.c | 662 ++++++++++++++++++ drivers/dma/idxd/perfmon.h | 119 ++++ drivers/dma/idxd/registers.h | 108 +++ include/linux/cpuhotplug.h | 1 + 8 files changed, 979 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa create mode 100644 drivers/dma/idxd/perfmon.c create mode 100644 drivers/dma/idxd/perfmon.h diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa b/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa new file mode 100644 index 000000000000..3c7d132281b0 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-dsa @@ -0,0 +1,30 @@ +What: /sys/bus/event_source/devices/dsa*/format +Date: April 2021 +KernelVersion: 5.13 +Contact: Tom Zanussi +Description: Read-only. Attribute group to describe the magic bits + that go into perf_event_attr.config or + perf_event_attr.config1 for the IDXD DSA pmu. (See also + ABI/testing/sysfs-bus-event_source-devices-format). + + Each attribute in this group defines a bit range in + perf_event_attr.config or perf_event_attr.config1. + All supported attributes are listed below (See the + IDXD DSA Spec for possible attribute values):: + + event_category = "config:0-3" - event category + event = "config:4-31" - event ID + + filter_wq = "config1:0-31" - workqueue filter + filter_tc = "config1:32-39" - traffic class filter + filter_pgsz = "config1:40-43" - page size filter + filter_sz = "config1:44-51" - transfer size filter + filter_eng = "config1:52-59" - engine filter + +What: /sys/bus/event_source/devices/dsa*/cpumask +Date: April 2021 +KernelVersion: 5.13 +Contact: Tom Zanussi +Description: Read-only. This file always returns the cpu to which the + IDXD DSA pmu is bound for access to all dsa pmu + performance monitoring events. diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 0c2827fd8c19..b417217c148c 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -300,6 +300,18 @@ config INTEL_IDXD_SVM depends on PCI_PASID depends on PCI_IOV +config INTEL_IDXD_PERFMON + bool "Intel Data Accelerators performance monitor support" + depends on INTEL_IDXD + help + Enable performance monitor (pmu) support for the Intel(R) + data accelerators present in Intel Xeon CPU. With this + enabled, perf can be used to monitor the DSA (Intel Data + Streaming Accelerator) events described in the Intel DSA + spec. + + If unsure, say N. + config INTEL_IOATDMA tristate "Intel I/OAT DMA support" depends on PCI && X86_64 diff --git a/drivers/dma/idxd/Makefile b/drivers/dma/idxd/Makefile index 8978b898d777..6d11558756f8 100644 --- a/drivers/dma/idxd/Makefile +++ b/drivers/dma/idxd/Makefile @@ -1,2 +1,4 @@ obj-$(CONFIG_INTEL_IDXD) += idxd.o idxd-y := init.o irq.o device.o sysfs.o submit.o dma.o cdev.o + +idxd-$(CONFIG_INTEL_IDXD_PERFMON) += perfmon.o diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 97c96ca6ab70..26482c7d4c3a 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include "registers.h" #define IDXD_DRIVER_VERSION "1.00" @@ -29,6 +31,7 @@ enum idxd_type { }; #define IDXD_NAME_SIZE 128 +#define IDXD_PMU_EVENT_MAX 64 struct idxd_device_driver { struct device_driver drv; @@ -61,6 +64,31 @@ struct idxd_group { int tc_b; }; +struct idxd_pmu { + struct idxd_device *idxd; + + struct perf_event *event_list[IDXD_PMU_EVENT_MAX]; + int n_events; + + DECLARE_BITMAP(used_mask, IDXD_PMU_EVENT_MAX); + + struct pmu pmu; + char name[IDXD_NAME_SIZE]; + int cpu; + + int n_counters; + int counter_width; + int n_event_categories; + + bool per_counter_caps_supported; + unsigned long supported_event_categories; + + unsigned long supported_filters; + int n_filters; + + struct hlist_node cpuhp_node; +}; + #define IDXD_MAX_PRIORITY 0xf enum idxd_wq_state { @@ -241,6 +269,8 @@ struct idxd_device { struct work_struct work; int *int_handles; + + struct idxd_pmu *idxd_pmu; }; /* IDXD software descriptor */ @@ -437,4 +467,19 @@ int idxd_cdev_get_major(struct idxd_device *idxd); int idxd_wq_add_cdev(struct idxd_wq *wq); void idxd_wq_del_cdev(struct idxd_wq *wq); +/* perfmon */ +#if IS_ENABLED(CONFIG_INTEL_IDXD_PERFMON) +int perfmon_pmu_init(struct idxd_device *idxd); +void perfmon_pmu_remove(struct idxd_device *idxd); +void perfmon_counter_overflow(struct idxd_device *idxd); +void perfmon_init(void); +void perfmon_exit(void); +#else +static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; } +static inline void perfmon_pmu_remove(struct idxd_device *idxd) {} +static inline void perfmon_counter_overflow(struct idxd_device *idxd) {} +static inline void perfmon_init(void) {} +static inline void perfmon_exit(void) {} +#endif + #endif diff --git a/drivers/dma/idxd/perfmon.c b/drivers/dma/idxd/perfmon.c new file mode 100644 index 000000000000..d73004f47cf4 --- /dev/null +++ b/drivers/dma/idxd/perfmon.c @@ -0,0 +1,662 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */ + +#include +#include +#include "idxd.h" +#include "perfmon.h" + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf); + +static cpumask_t perfmon_dsa_cpu_mask; +static bool cpuhp_set_up; +static enum cpuhp_state cpuhp_slot; + +/* + * perf userspace reads this attribute to determine which cpus to open + * counters on. It's connected to perfmon_dsa_cpu_mask, which is + * maintained by the cpu hotplug handlers. + */ +static DEVICE_ATTR_RO(cpumask); + +static struct attribute *perfmon_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group cpumask_attr_group = { + .attrs = perfmon_cpumask_attrs, +}; + +/* + * These attributes specify the bits in the config word that the perf + * syscall uses to pass the event ids and categories to perfmon. + */ +DEFINE_PERFMON_FORMAT_ATTR(event_category, "config:0-3"); +DEFINE_PERFMON_FORMAT_ATTR(event, "config:4-31"); + +/* + * These attributes specify the bits in the config1 word that the perf + * syscall uses to pass filter data to perfmon. + */ +DEFINE_PERFMON_FORMAT_ATTR(filter_wq, "config1:0-31"); +DEFINE_PERFMON_FORMAT_ATTR(filter_tc, "config1:32-39"); +DEFINE_PERFMON_FORMAT_ATTR(filter_pgsz, "config1:40-43"); +DEFINE_PERFMON_FORMAT_ATTR(filter_sz, "config1:44-51"); +DEFINE_PERFMON_FORMAT_ATTR(filter_eng, "config1:52-59"); + +#define PERFMON_FILTERS_START 2 +#define PERFMON_FILTERS_MAX 5 + +static struct attribute *perfmon_format_attrs[] = { + &format_attr_idxd_event_category.attr, + &format_attr_idxd_event.attr, + &format_attr_idxd_filter_wq.attr, + &format_attr_idxd_filter_tc.attr, + &format_attr_idxd_filter_pgsz.attr, + &format_attr_idxd_filter_sz.attr, + &format_attr_idxd_filter_eng.attr, + NULL, +}; + +static struct attribute_group perfmon_format_attr_group = { + .name = "format", + .attrs = perfmon_format_attrs, +}; + +static const struct attribute_group *perfmon_attr_groups[] = { + &perfmon_format_attr_group, + &cpumask_attr_group, + NULL, +}; + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask); +} + +static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event) +{ + return &idxd_pmu->pmu == event->pmu; +} + +static int perfmon_collect_events(struct idxd_pmu *idxd_pmu, + struct perf_event *leader, + bool do_grp) +{ + struct perf_event *event; + int n, max_count; + + max_count = idxd_pmu->n_counters; + n = idxd_pmu->n_events; + + if (n >= max_count) + return -EINVAL; + + if (is_idxd_event(idxd_pmu, leader)) { + idxd_pmu->event_list[n] = leader; + idxd_pmu->event_list[n]->hw.idx = n; + n++; + } + + if (!do_grp) + return n; + + for_each_sibling_event(event, leader) { + if (!is_idxd_event(idxd_pmu, event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + idxd_pmu->event_list[n] = event; + idxd_pmu->event_list[n]->hw.idx = n; + n++; + } + + return n; +} + +static void perfmon_assign_hw_event(struct idxd_pmu *idxd_pmu, + struct perf_event *event, int idx) +{ + struct idxd_device *idxd = idxd_pmu->idxd; + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->config_base = ioread64(CNTRCFG_REG(idxd, idx)); + hwc->event_base = ioread64(CNTRCFG_REG(idxd, idx)); +} + +static int perfmon_assign_event(struct idxd_pmu *idxd_pmu, + struct perf_event *event) +{ + int i; + + for (i = 0; i < IDXD_PMU_EVENT_MAX; i++) + if (!test_and_set_bit(i, idxd_pmu->used_mask)) + return i; + + return -EINVAL; +} + +/* + * Check whether there are enough counters to satisfy that all the + * events in the group can actually be scheduled at the same time. + * + * To do this, create a fake idxd_pmu object so the event collection + * and assignment functions can be used without affecting the internal + * state of the real idxd_pmu object. + */ +static int perfmon_validate_group(struct idxd_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct idxd_pmu *fake_pmu; + int i, ret = 0, n, idx; + + fake_pmu = kzalloc(sizeof(*fake_pmu), GFP_KERNEL); + if (!fake_pmu) + return -ENOMEM; + + fake_pmu->pmu.name = pmu->pmu.name; + fake_pmu->n_counters = pmu->n_counters; + + n = perfmon_collect_events(fake_pmu, leader, true); + if (n < 0) { + ret = n; + goto out; + } + + fake_pmu->n_events = n; + n = perfmon_collect_events(fake_pmu, event, false); + if (n < 0) { + ret = n; + goto out; + } + + fake_pmu->n_events = n; + + for (i = 0; i < n; i++) { + event = fake_pmu->event_list[i]; + + idx = perfmon_assign_event(fake_pmu, event); + if (idx < 0) { + ret = idx; + goto out; + } + } +out: + kfree(fake_pmu); + + return ret; +} + +static int perfmon_pmu_event_init(struct perf_event *event) +{ + struct idxd_device *idxd; + int ret = 0; + + idxd = event_to_idxd(event); + event->hw.idx = -1; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* sampling not supported */ + if (event->attr.sample_period) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + if (event->pmu != &idxd->idxd_pmu->pmu) + return -EINVAL; + + event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd)); + event->cpu = idxd->idxd_pmu->cpu; + event->hw.config = event->attr.config; + + if (event->group_leader != event) + /* non-group events have themselves as leader */ + ret = perfmon_validate_group(idxd->idxd_pmu, event); + + return ret; +} + +static inline u64 perfmon_pmu_read_counter(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct idxd_device *idxd; + int cntr = hwc->idx; + + idxd = event_to_idxd(event); + + return ioread64(CNTRDATA_REG(idxd, cntr)); +} + +static void perfmon_pmu_event_update(struct perf_event *event) +{ + struct idxd_device *idxd = event_to_idxd(event); + u64 prev_raw_count, new_raw_count, delta, p, n; + int shift = 64 - idxd->idxd_pmu->counter_width; + struct hw_perf_event *hwc = &event->hw; + + do { + prev_raw_count = local64_read(&hwc->prev_count); + new_raw_count = perfmon_pmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count); + + n = (new_raw_count << shift); + p = (prev_raw_count << shift); + + delta = ((n - p) >> shift); + + local64_add(delta, &event->count); +} + +void perfmon_counter_overflow(struct idxd_device *idxd) +{ + int i, n_counters, max_loop = OVERFLOW_SIZE; + struct perf_event *event; + unsigned long ovfstatus; + + n_counters = min(idxd->idxd_pmu->n_counters, OVERFLOW_SIZE); + + ovfstatus = ioread32(OVFSTATUS_REG(idxd)); + + /* + * While updating overflowed counters, other counters behind + * them could overflow and be missed in a given pass. + * Normally this could happen at most n_counters times, but in + * theory a tiny counter width could result in continual + * overflows and endless looping. max_loop provides a + * failsafe in that highly unlikely case. + */ + while (ovfstatus && max_loop--) { + /* Figure out which counter(s) overflowed */ + for_each_set_bit(i, &ovfstatus, n_counters) { + unsigned long ovfstatus_clear = 0; + + /* Update event->count for overflowed counter */ + event = idxd->idxd_pmu->event_list[i]; + perfmon_pmu_event_update(event); + /* Writing 1 to OVFSTATUS bit clears it */ + set_bit(i, &ovfstatus_clear); + iowrite32(ovfstatus_clear, OVFSTATUS_REG(idxd)); + } + + ovfstatus = ioread32(OVFSTATUS_REG(idxd)); + } + + /* + * Should never happen. If so, it means a counter(s) looped + * around twice while this handler was running. + */ + WARN_ON_ONCE(ovfstatus); +} + +static inline void perfmon_reset_config(struct idxd_device *idxd) +{ + iowrite32(CONFIG_RESET, PERFRST_REG(idxd)); + iowrite32(0, OVFSTATUS_REG(idxd)); + iowrite32(0, PERFFRZ_REG(idxd)); +} + +static inline void perfmon_reset_counters(struct idxd_device *idxd) +{ + iowrite32(CNTR_RESET, PERFRST_REG(idxd)); +} + +static inline void perfmon_reset(struct idxd_device *idxd) +{ + perfmon_reset_config(idxd); + perfmon_reset_counters(idxd); +} + +static void perfmon_pmu_event_start(struct perf_event *event, int mode) +{ + u32 flt_wq, flt_tc, flt_pg_sz, flt_xfer_sz, flt_eng = 0; + u64 cntr_cfg, cntrdata, event_enc, event_cat = 0; + struct hw_perf_event *hwc = &event->hw; + union filter_cfg flt_cfg; + union event_cfg event_cfg; + struct idxd_device *idxd; + int cntr; + + idxd = event_to_idxd(event); + + event->hw.idx = hwc->idx; + cntr = hwc->idx; + + /* Obtain event category and event value from user space */ + event_cfg.val = event->attr.config; + flt_cfg.val = event->attr.config1; + event_cat = event_cfg.event_cat; + event_enc = event_cfg.event_enc; + + /* Obtain filter configuration from user space */ + flt_wq = flt_cfg.wq; + flt_tc = flt_cfg.tc; + flt_pg_sz = flt_cfg.pg_sz; + flt_xfer_sz = flt_cfg.xfer_sz; + flt_eng = flt_cfg.eng; + + if (flt_wq && test_bit(FLT_WQ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_wq, FLTCFG_REG(idxd, cntr, FLT_WQ)); + if (flt_tc && test_bit(FLT_TC, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_tc, FLTCFG_REG(idxd, cntr, FLT_TC)); + if (flt_pg_sz && test_bit(FLT_PG_SZ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_pg_sz, FLTCFG_REG(idxd, cntr, FLT_PG_SZ)); + if (flt_xfer_sz && test_bit(FLT_XFER_SZ, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_xfer_sz, FLTCFG_REG(idxd, cntr, FLT_XFER_SZ)); + if (flt_eng && test_bit(FLT_ENG, &idxd->idxd_pmu->supported_filters)) + iowrite32(flt_eng, FLTCFG_REG(idxd, cntr, FLT_ENG)); + + /* Read the start value */ + cntrdata = ioread64(CNTRDATA_REG(idxd, cntr)); + local64_set(&event->hw.prev_count, cntrdata); + + /* Set counter to event/category */ + cntr_cfg = event_cat << CNTRCFG_CATEGORY_SHIFT; + cntr_cfg |= event_enc << CNTRCFG_EVENT_SHIFT; + /* Set interrupt on overflow and counter enable bits */ + cntr_cfg |= (CNTRCFG_IRQ_OVERFLOW | CNTRCFG_ENABLE); + + iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr)); +} + +static void perfmon_pmu_event_stop(struct perf_event *event, int mode) +{ + struct hw_perf_event *hwc = &event->hw; + struct idxd_device *idxd; + int i, cntr = hwc->idx; + u64 cntr_cfg; + + idxd = event_to_idxd(event); + + /* remove this event from event list */ + for (i = 0; i < idxd->idxd_pmu->n_events; i++) { + if (event != idxd->idxd_pmu->event_list[i]) + continue; + + for (++i; i < idxd->idxd_pmu->n_events; i++) + idxd->idxd_pmu->event_list[i - 1] = idxd->idxd_pmu->event_list[i]; + --idxd->idxd_pmu->n_events; + break; + } + + cntr_cfg = ioread64(CNTRCFG_REG(idxd, cntr)); + cntr_cfg &= ~CNTRCFG_ENABLE; + iowrite64(cntr_cfg, CNTRCFG_REG(idxd, cntr)); + + if (mode == PERF_EF_UPDATE) + perfmon_pmu_event_update(event); + + event->hw.idx = -1; + clear_bit(cntr, idxd->idxd_pmu->used_mask); +} + +static void perfmon_pmu_event_del(struct perf_event *event, int mode) +{ + perfmon_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int perfmon_pmu_event_add(struct perf_event *event, int flags) +{ + struct idxd_device *idxd = event_to_idxd(event); + struct idxd_pmu *idxd_pmu = idxd->idxd_pmu; + struct hw_perf_event *hwc = &event->hw; + int idx, n; + + n = perfmon_collect_events(idxd_pmu, event, false); + if (n < 0) + return n; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + idx = perfmon_assign_event(idxd_pmu, event); + if (idx < 0) + return idx; + + perfmon_assign_hw_event(idxd_pmu, event, idx); + + if (flags & PERF_EF_START) + perfmon_pmu_event_start(event, 0); + + idxd_pmu->n_events = n; + + return 0; +} + +static void enable_perfmon_pmu(struct idxd_device *idxd) +{ + iowrite32(COUNTER_UNFREEZE, PERFFRZ_REG(idxd)); +} + +static void disable_perfmon_pmu(struct idxd_device *idxd) +{ + iowrite32(COUNTER_FREEZE, PERFFRZ_REG(idxd)); +} + +static void perfmon_pmu_enable(struct pmu *pmu) +{ + struct idxd_device *idxd = pmu_to_idxd(pmu); + + enable_perfmon_pmu(idxd); +} + +static void perfmon_pmu_disable(struct pmu *pmu) +{ + struct idxd_device *idxd = pmu_to_idxd(pmu); + + disable_perfmon_pmu(idxd); +} + +static void skip_filter(int i) +{ + int j; + + for (j = i; j < PERFMON_FILTERS_MAX; j++) + perfmon_format_attrs[PERFMON_FILTERS_START + j] = + perfmon_format_attrs[PERFMON_FILTERS_START + j + 1]; +} + +static void idxd_pmu_init(struct idxd_pmu *idxd_pmu) +{ + int i; + + for (i = 0 ; i < PERFMON_FILTERS_MAX; i++) { + if (!test_bit(i, &idxd_pmu->supported_filters)) + skip_filter(i); + } + + idxd_pmu->pmu.name = idxd_pmu->name; + idxd_pmu->pmu.attr_groups = perfmon_attr_groups; + idxd_pmu->pmu.task_ctx_nr = perf_invalid_context; + idxd_pmu->pmu.event_init = perfmon_pmu_event_init; + idxd_pmu->pmu.pmu_enable = perfmon_pmu_enable, + idxd_pmu->pmu.pmu_disable = perfmon_pmu_disable, + idxd_pmu->pmu.add = perfmon_pmu_event_add; + idxd_pmu->pmu.del = perfmon_pmu_event_del; + idxd_pmu->pmu.start = perfmon_pmu_event_start; + idxd_pmu->pmu.stop = perfmon_pmu_event_stop; + idxd_pmu->pmu.read = perfmon_pmu_event_update; + idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + idxd_pmu->pmu.module = THIS_MODULE; +} + +void perfmon_pmu_remove(struct idxd_device *idxd) +{ + if (!idxd->idxd_pmu) + return; + + cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node); + perf_pmu_unregister(&idxd->idxd_pmu->pmu); + kfree(idxd->idxd_pmu); + idxd->idxd_pmu = NULL; +} + +static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node) +{ + struct idxd_pmu *idxd_pmu; + + idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); + + /* select the first online CPU as the designated reader */ + if (cpumask_empty(&perfmon_dsa_cpu_mask)) { + cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask); + idxd_pmu->cpu = cpu; + } + + return 0; +} + +static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node) +{ + struct idxd_pmu *idxd_pmu; + unsigned int target; + + idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node); + + if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask)) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + + /* migrate events if there is a valid target */ + if (target < nr_cpu_ids) + cpumask_set_cpu(target, &perfmon_dsa_cpu_mask); + else + target = -1; + + perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target); + + return 0; +} + +int perfmon_pmu_init(struct idxd_device *idxd) +{ + union idxd_perfcap perfcap; + struct idxd_pmu *idxd_pmu; + int rc = -ENODEV; + + /* + * perfmon module initialization failed, nothing to do + */ + if (!cpuhp_set_up) + return -ENODEV; + + /* + * If perfmon_offset or num_counters is 0, it means perfmon is + * not supported on this hardware. + */ + if (idxd->perfmon_offset == 0) + return -ENODEV; + + idxd_pmu = kzalloc(sizeof(*idxd_pmu), GFP_KERNEL); + if (!idxd_pmu) + return -ENOMEM; + + idxd_pmu->idxd = idxd; + idxd->idxd_pmu = idxd_pmu; + + if (idxd->data->type == IDXD_TYPE_DSA) { + rc = sprintf(idxd_pmu->name, "dsa%d", idxd->id); + if (rc < 0) + goto free; + } else if (idxd->data->type == IDXD_TYPE_IAX) { + rc = sprintf(idxd_pmu->name, "iax%d", idxd->id); + if (rc < 0) + goto free; + } else { + goto free; + } + + perfmon_reset(idxd); + + perfcap.bits = ioread64(PERFCAP_REG(idxd)); + + /* + * If total perf counter is 0, stop further registration. + * This is necessary in order to support driver running on + * guest which does not have pmon support. + */ + if (perfcap.num_perf_counter == 0) + goto free; + + /* A counter width of 0 means it can't count */ + if (perfcap.counter_width == 0) + goto free; + + /* Overflow interrupt and counter freeze support must be available */ + if (!perfcap.overflow_interrupt || !perfcap.counter_freeze) + goto free; + + /* Number of event categories cannot be 0 */ + if (perfcap.num_event_category == 0) + goto free; + + /* + * We don't support per-counter capabilities for now. + */ + if (perfcap.cap_per_counter) + goto free; + + idxd_pmu->n_event_categories = perfcap.num_event_category; + idxd_pmu->supported_event_categories = perfcap.global_event_category; + idxd_pmu->per_counter_caps_supported = perfcap.cap_per_counter; + + /* check filter capability. If 0, then filters are not supported */ + idxd_pmu->supported_filters = perfcap.filter; + if (perfcap.filter) + idxd_pmu->n_filters = hweight8(perfcap.filter); + + /* Store the total number of counters categories, and counter width */ + idxd_pmu->n_counters = perfcap.num_perf_counter; + idxd_pmu->counter_width = perfcap.counter_width; + + idxd_pmu_init(idxd_pmu); + + rc = perf_pmu_register(&idxd_pmu->pmu, idxd_pmu->name, -1); + if (rc) + goto free; + + rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node); + if (rc) { + perf_pmu_unregister(&idxd->idxd_pmu->pmu); + goto free; + } +out: + return rc; +free: + kfree(idxd_pmu); + idxd->idxd_pmu = NULL; + + goto out; +} + +void __init perfmon_init(void) +{ + int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "driver/dma/idxd/perf:online", + perf_event_cpu_online, + perf_event_cpu_offline); + if (WARN_ON(rc < 0)) + return; + + cpuhp_slot = rc; + cpuhp_set_up = true; +} + +void __exit perfmon_exit(void) +{ + if (cpuhp_set_up) + cpuhp_remove_multi_state(cpuhp_slot); +} diff --git a/drivers/dma/idxd/perfmon.h b/drivers/dma/idxd/perfmon.h new file mode 100644 index 000000000000..9a081a1bc605 --- /dev/null +++ b/drivers/dma/idxd/perfmon.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. All rights rsvd. */ + +#ifndef _PERFMON_H_ +#define _PERFMON_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registers.h" + +static inline struct idxd_pmu *event_to_pmu(struct perf_event *event) +{ + struct idxd_pmu *idxd_pmu; + struct pmu *pmu; + + pmu = event->pmu; + idxd_pmu = container_of(pmu, struct idxd_pmu, pmu); + + return idxd_pmu; +} + +static inline struct idxd_device *event_to_idxd(struct perf_event *event) +{ + struct idxd_pmu *idxd_pmu; + struct pmu *pmu; + + pmu = event->pmu; + idxd_pmu = container_of(pmu, struct idxd_pmu, pmu); + + return idxd_pmu->idxd; +} + +static inline struct idxd_device *pmu_to_idxd(struct pmu *pmu) +{ + struct idxd_pmu *idxd_pmu; + + idxd_pmu = container_of(pmu, struct idxd_pmu, pmu); + + return idxd_pmu->idxd; +} + +enum dsa_perf_events { + DSA_PERF_EVENT_WQ = 0, + DSA_PERF_EVENT_ENGINE, + DSA_PERF_EVENT_ADDR_TRANS, + DSA_PERF_EVENT_OP, + DSA_PERF_EVENT_COMPL, + DSA_PERF_EVENT_MAX, +}; + +enum filter_enc { + FLT_WQ = 0, + FLT_TC, + FLT_PG_SZ, + FLT_XFER_SZ, + FLT_ENG, + FLT_MAX, +}; + +#define CONFIG_RESET 0x0000000000000001 +#define CNTR_RESET 0x0000000000000002 +#define CNTR_ENABLE 0x0000000000000001 +#define INTR_OVFL 0x0000000000000002 + +#define COUNTER_FREEZE 0x00000000FFFFFFFF +#define COUNTER_UNFREEZE 0x0000000000000000 +#define OVERFLOW_SIZE 32 + +#define CNTRCFG_ENABLE BIT(0) +#define CNTRCFG_IRQ_OVERFLOW BIT(1) +#define CNTRCFG_CATEGORY_SHIFT 8 +#define CNTRCFG_EVENT_SHIFT 32 + +#define PERFMON_TABLE_OFFSET(_idxd) \ +({ \ + typeof(_idxd) __idxd = (_idxd); \ + ((__idxd)->reg_base + (__idxd)->perfmon_offset); \ +}) +#define PERFMON_REG_OFFSET(idxd, offset) \ + (PERFMON_TABLE_OFFSET(idxd) + (offset)) + +#define PERFCAP_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_PERFCAP_OFFSET)) +#define PERFRST_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_PERFRST_OFFSET)) +#define OVFSTATUS_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_OVFSTATUS_OFFSET)) +#define PERFFRZ_REG(idxd) (PERFMON_REG_OFFSET(idxd, IDXD_PERFFRZ_OFFSET)) + +#define FLTCFG_REG(idxd, cntr, flt) \ + (PERFMON_REG_OFFSET(idxd, IDXD_FLTCFG_OFFSET) + ((cntr) * 32) + ((flt) * 4)) + +#define CNTRCFG_REG(idxd, cntr) \ + (PERFMON_REG_OFFSET(idxd, IDXD_CNTRCFG_OFFSET) + ((cntr) * 8)) +#define CNTRDATA_REG(idxd, cntr) \ + (PERFMON_REG_OFFSET(idxd, IDXD_CNTRDATA_OFFSET) + ((cntr) * 8)) +#define CNTRCAP_REG(idxd, cntr) \ + (PERFMON_REG_OFFSET(idxd, IDXD_CNTRCAP_OFFSET) + ((cntr) * 8)) + +#define EVNTCAP_REG(idxd, category) \ + (PERFMON_REG_OFFSET(idxd, IDXD_EVNTCAP_OFFSET) + ((category) * 8)) + +#define DEFINE_PERFMON_FORMAT_ATTR(_name, _format) \ +static ssize_t __perfmon_idxd_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_idxd_##_name = \ + __ATTR(_name, 0444, __perfmon_idxd_##_name##_show, NULL) + +#endif diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 6c11375cc56a..c970c3f025f0 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -386,4 +386,112 @@ union wqcfg { #define GRPENGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 32) #define GRPFLGCFG_OFFSET(idxd_dev, n) ((idxd_dev)->grpcfg_offset + (n) * GRPCFG_SIZE + 40) +/* Following is performance monitor registers */ +#define IDXD_PERFCAP_OFFSET 0x0 +union idxd_perfcap { + struct { + u64 num_perf_counter:6; + u64 rsvd1:2; + u64 counter_width:8; + u64 num_event_category:4; + u64 global_event_category:16; + u64 filter:8; + u64 rsvd2:8; + u64 cap_per_counter:1; + u64 writeable_counter:1; + u64 counter_freeze:1; + u64 overflow_interrupt:1; + u64 rsvd3:8; + }; + u64 bits; +} __packed; + +#define IDXD_EVNTCAP_OFFSET 0x80 +union idxd_evntcap { + struct { + u64 events:28; + u64 rsvd:36; + }; + u64 bits; +} __packed; + +struct idxd_event { + union { + struct { + u32 event_category:4; + u32 events:28; + }; + u32 val; + }; +} __packed; + +#define IDXD_CNTRCAP_OFFSET 0x800 +struct idxd_cntrcap { + union { + struct { + u32 counter_width:8; + u32 rsvd:20; + u32 num_events:4; + }; + u32 val; + }; + struct idxd_event events[]; +} __packed; + +#define IDXD_PERFRST_OFFSET 0x10 +union idxd_perfrst { + struct { + u32 perfrst_config:1; + u32 perfrst_counter:1; + u32 rsvd:30; + }; + u32 val; +} __packed; + +#define IDXD_OVFSTATUS_OFFSET 0x30 +#define IDXD_PERFFRZ_OFFSET 0x20 +#define IDXD_CNTRCFG_OFFSET 0x100 +union idxd_cntrcfg { + struct { + u64 enable:1; + u64 interrupt_ovf:1; + u64 global_freeze_ovf:1; + u64 rsvd1:5; + u64 event_category:4; + u64 rsvd2:20; + u64 events:28; + u64 rsvd3:4; + }; + u64 val; +} __packed; + +#define IDXD_FLTCFG_OFFSET 0x300 + +#define IDXD_CNTRDATA_OFFSET 0x200 +union idxd_cntrdata { + struct { + u64 event_count_value; + }; + u64 val; +} __packed; + +union event_cfg { + struct { + u64 event_cat:4; + u64 event_enc:28; + }; + u64 val; +} __packed; + +union filter_cfg { + struct { + u64 wq:32; + u64 tc:8; + u64 pg_sz:4; + u64 xfer_sz:8; + u64 eng:8; + }; + u64 val; +} __packed; + #endif diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f14adb882338..264d911424c0 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -167,6 +167,7 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_RAPL_ONLINE, CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, + CPUHP_AP_PERF_X86_IDXD_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_CFD_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, From 0bde4444ec44b8e64bbd4af72fcaef58bcdbd4ce Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sat, 24 Apr 2021 10:04:16 -0500 Subject: [PATCH 0611/1379] dmaengine: idxd: Enable IDXD performance monitor support Add the code needed in the main IDXD driver to interface with the IDXD perfmon implementation. [ Based on work originally by Jing Lin. ] Reviewed-by: Dave Jiang Reviewed-by: Kan Liang Signed-off-by: Tom Zanussi Link: https://lore.kernel.org/r/a5564a5583911565d31c2af9234218c5166c4b2c.1619276133.git.zanussi@kernel.org Signed-off-by: Vinod Koul --- drivers/dma/idxd/init.c | 9 +++++++++ drivers/dma/idxd/irq.c | 5 +---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 8003f8a25fff..2a926bef87f2 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -21,6 +21,7 @@ #include "../dmaengine.h" #include "registers.h" #include "idxd.h" +#include "perfmon.h" MODULE_VERSION(IDXD_DRIVER_VERSION); MODULE_LICENSE("GPL v2"); @@ -541,6 +542,10 @@ static int idxd_probe(struct idxd_device *idxd) idxd->major = idxd_cdev_get_major(idxd); + rc = perfmon_pmu_init(idxd); + if (rc < 0) + dev_warn(dev, "Failed to initialize perfmon. No PMU support: %d\n", rc); + dev_dbg(dev, "IDXD device %d probed successfully\n", idxd->id); return 0; @@ -720,6 +725,7 @@ static void idxd_remove(struct pci_dev *pdev) if (device_pasid_enabled(idxd)) idxd_disable_system_pasid(idxd); idxd_unregister_devices(idxd); + perfmon_pmu_remove(idxd); iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); } @@ -749,6 +755,8 @@ static int __init idxd_init_module(void) else support_enqcmd = true; + perfmon_init(); + err = idxd_register_bus_type(); if (err < 0) return err; @@ -782,5 +790,6 @@ static void __exit idxd_exit_module(void) pci_unregister_driver(&idxd_pci_driver); idxd_cdev_remove(); idxd_unregister_bus_type(); + perfmon_exit(); } module_exit(idxd_exit_module); diff --git a/drivers/dma/idxd/irq.c b/drivers/dma/idxd/irq.c index afee571e0194..ae68e1e5487a 100644 --- a/drivers/dma/idxd/irq.c +++ b/drivers/dma/idxd/irq.c @@ -156,11 +156,8 @@ static int process_misc_interrupts(struct idxd_device *idxd, u32 cause) } if (cause & IDXD_INTC_PERFMON_OVFL) { - /* - * Driver does not utilize perfmon counter overflow interrupt - * yet. - */ val |= IDXD_INTC_PERFMON_OVFL; + perfmon_counter_overflow(idxd); } val ^= cause; From fcc96cef8a185e55c25d25f4f698f51e1a030911 Mon Sep 17 00:00:00 2001 From: Zheng Yongjun Date: Fri, 8 Jan 2021 17:24:46 +0800 Subject: [PATCH 0612/1379] leds-lm3642: convert comma to semicolon Replace a comma between expression statements by a semicolon. Signed-off-by: Zheng Yongjun Signed-off-by: Pavel Machek --- drivers/leds/leds-lm3642.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/leds/leds-lm3642.c b/drivers/leds/leds-lm3642.c index 8007b82985a8..435309154e6b 100644 --- a/drivers/leds/leds-lm3642.c +++ b/drivers/leds/leds-lm3642.c @@ -339,7 +339,7 @@ static int lm3642_probe(struct i2c_client *client, chip->cdev_flash.max_brightness = 16; chip->cdev_flash.brightness_set_blocking = lm3642_strobe_brightness_set; chip->cdev_flash.default_trigger = "flash"; - chip->cdev_flash.groups = lm3642_flash_groups, + chip->cdev_flash.groups = lm3642_flash_groups; err = led_classdev_register(&client->dev, &chip->cdev_flash); if (err < 0) { dev_err(chip->dev, "failed to register flash\n"); @@ -351,7 +351,7 @@ static int lm3642_probe(struct i2c_client *client, chip->cdev_torch.max_brightness = 8; chip->cdev_torch.brightness_set_blocking = lm3642_torch_brightness_set; chip->cdev_torch.default_trigger = "torch"; - chip->cdev_torch.groups = lm3642_torch_groups, + chip->cdev_torch.groups = lm3642_torch_groups; err = led_classdev_register(&client->dev, &chip->cdev_torch); if (err < 0) { dev_err(chip->dev, "failed to register torch\n"); From 5222fa9121142ddd86dcb1a9205fd02e9d5d1e04 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sun, 25 Apr 2021 22:23:47 +0200 Subject: [PATCH 0613/1379] MAINTAINERS: Remove Dan Murphy's bouncing email Signed-off-by: Pavel Machek --- MAINTAINERS | 4 ---- 1 file changed, 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index d92f85ca831d..11d83c30b737 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10030,7 +10030,6 @@ F: scripts/leaking_addresses.pl LED SUBSYSTEM M: Pavel Machek -R: Dan Murphy L: linux-leds@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/pavel/linux-leds.git @@ -10905,7 +10904,6 @@ T: git git://linuxtv.org/media_tree.git F: drivers/media/radio/radio-maxiradio* MCAN MMIO DEVICE DRIVER -M: Dan Murphy M: Pankaj Sharma L: linux-can@vger.kernel.org S: Maintained @@ -17846,7 +17844,6 @@ S: Maintained F: drivers/thermal/ti-soc-thermal/ TI BQ27XXX POWER SUPPLY DRIVER -R: Dan Murphy F: drivers/power/supply/bq27xxx_battery.c F: drivers/power/supply/bq27xxx_battery_i2c.c F: include/linux/power/bq27xxx_battery.h @@ -17981,7 +17978,6 @@ S: Odd Fixes F: sound/soc/codecs/tas571x* TI TCAN4X5X DEVICE DRIVER -M: Dan Murphy L: linux-can@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/net/can/tcan4x5x.txt From ec50536b7840dde085185d9570fa19d0baf5042c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 22 Feb 2021 13:49:39 +0000 Subject: [PATCH 0614/1379] leds: lgm: Fix spelling mistake "prepate" -> "prepare" There is a spelling mistake in a dev_err error message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Pavel Machek --- drivers/leds/blink/leds-lgm-sso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/blink/leds-lgm-sso.c b/drivers/leds/blink/leds-lgm-sso.c index 7d5c9ca007d6..6a63846d10b5 100644 --- a/drivers/leds/blink/leds-lgm-sso.c +++ b/drivers/leds/blink/leds-lgm-sso.c @@ -793,7 +793,7 @@ static int intel_sso_led_probe(struct platform_device *pdev) ret = clk_prepare_enable(priv->gclk); if (ret) { - dev_err(dev, "Failed to prepate/enable sso gate clock!\n"); + dev_err(dev, "Failed to prepare/enable sso gate clock!\n"); return ret; } From 1cfa807b06afd54488512bacef7cb5023437f178 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 16 Mar 2021 14:39:46 +0100 Subject: [PATCH 0615/1379] leds: LEDS_BLINK_LGM should depend on X86 The Intel Lightning Mountain (LGM) Serial Shift Output controller (SSO) is only present on Intel Lightning Mountain SoCs. Hence add a dependency on X86, to prevent asking the user about this driver when configuring a kernel without Intel Lightning Mountain platform support. While at it, merge the other dependencies into a single statement. Signed-off-by: Geert Uytterhoeven Signed-off-by: Pavel Machek --- drivers/leds/blink/Kconfig | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/leds/blink/Kconfig b/drivers/leds/blink/Kconfig index 964c5037d9a5..59ba81e40e85 100644 --- a/drivers/leds/blink/Kconfig +++ b/drivers/leds/blink/Kconfig @@ -1,9 +1,7 @@ config LEDS_LGM tristate "LED support for LGM SoC series" - depends on GPIOLIB - depends on LEDS_CLASS - depends on MFD_SYSCON - depends on OF + depends on X86 || COMPILE_TEST + depends on GPIOLIB && LEDS_CLASS && MFD_SYSCON && OF help This option enables support for LEDs connected to GPIO lines on Lightning Mountain (LGM) SoC. Lightning Mountain is a AnyWAN From 5fe09e16c689eae88a151c2f8199c73cf6f18d7d Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Mon, 22 Mar 2021 10:37:16 +0800 Subject: [PATCH 0616/1379] leds: trigger: pattern: Switch to using the new API kobj_to_dev() Switch to using the new API kobj_to_dev() to fix the below warnning: ./drivers/leds/trigger/ledtrig-pattern.c:336:60-61: WARNING opportunity for kobj_to_dev() Signed-off-by: Tian Tao Signed-off-by: Pavel Machek --- drivers/leds/trigger/ledtrig-pattern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/trigger/ledtrig-pattern.c b/drivers/leds/trigger/ledtrig-pattern.c index 4d138d5317e9..43a265dc4696 100644 --- a/drivers/leds/trigger/ledtrig-pattern.c +++ b/drivers/leds/trigger/ledtrig-pattern.c @@ -333,7 +333,7 @@ static DEVICE_ATTR_RW(hw_pattern); static umode_t pattern_trig_attrs_mode(struct kobject *kobj, struct attribute *attr, int index) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct led_classdev *led_cdev = dev_get_drvdata(dev); if (attr == &dev_attr_repeat.attr || attr == &dev_attr_pattern.attr) From 23a700455a1bc55f3ea20675e574181b8c129306 Mon Sep 17 00:00:00 2001 From: Stefan Riedmueller Date: Wed, 14 Apr 2021 13:51:24 +0200 Subject: [PATCH 0617/1379] leds: pca9532: Assign gpio base dynamically When using devicetree, gpio_base holds its initial zero value which can lead to a rejection if another gpio controller already occupies this base. To prevent that collision let the gpio base be assigned dynamically. Signed-off-by: Stefan Riedmueller Signed-off-by: Pavel Machek --- drivers/leds/leds-pca9532.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c index 27d027165472..017794bb87ae 100644 --- a/drivers/leds/leds-pca9532.c +++ b/drivers/leds/leds-pca9532.c @@ -480,6 +480,8 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np) if (!pdata) return ERR_PTR(-ENOMEM); + pdata->gpio_base = -1; + of_property_read_u8_array(np, "nxp,pwm", &pdata->pwm[0], ARRAY_SIZE(pdata->pwm)); of_property_read_u8_array(np, "nxp,psc", &pdata->psc[0], From a6efb35019d00f483a0e5f188747723371d659fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 23 Apr 2021 18:32:26 +0200 Subject: [PATCH 0618/1379] pwm: Reword docs about pwm_apply_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The main issue is that the current documentation talks about the non-existent function pwm_get_last_applied_state. (This was right in the context of https://lore.kernel.org/linux-pwm/20210406073036.26857-1-u.kleine-koenig@pengutronix.de/ but was then missed to adapt when this patch was reduced to a documentation update.) While at is also clarify "last applied PWM state" to "PWM state that was passed to the last invocation of pwm_apply_state()" to better distinguish to the last actually implemented state and reword to drop a word repetition. Fixes: 1a7a6e8072ea ("pwm: Clarify which state pwm_get_state() returns") Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- Documentation/driver-api/pwm.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst index 381f3c46cdac..a7ca4f58305a 100644 --- a/Documentation/driver-api/pwm.rst +++ b/Documentation/driver-api/pwm.rst @@ -55,11 +55,11 @@ several parameter at once. For example, if you see pwm_config() and pwm_{enable,disable}() calls in the same function, this probably means you should switch to pwm_apply_state(). -The PWM user API also allows one to query the last applied PWM state with -pwm_get_last_applied_state(). Note this is different to what the driver has -actually implemented if the request cannot be implemented exactly with the -hardware in use. There is currently no way for consumers to get the actually -implemented settings. +The PWM user API also allows one to query the PWM state that was passed to the +last invocation of pwm_apply_state() using pwm_get_state(). Note this is +different to what the driver has actually implemented if the request cannot be +satisfied exactly with the hardware in use. There is currently no way for +consumers to get the actually implemented settings. In addition to the PWM state, the PWM API also exposes PWM arguments, which are the reference PWM config one should use on this PWM. From 32e6b68167f1d446111c973d57e6f52aee11897a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:03 -0400 Subject: [PATCH 0619/1379] xprtrdma: Avoid Receive Queue wrapping Commit e340c2d6ef2a ("xprtrdma: Reduce the doorbell rate (Receive)") increased the number of Receive WRs that are posted by the client, but did not increase the size of the Receive Queue allocated during transport set-up. This is usually not an issue because RPCRDMA_BACKWARD_WRS is defined as (32) when SUNRPC_BACKCHANNEL is defined. In cases where it isn't, there is a real risk of Receive Queue wrapping. Fixes: e340c2d6ef2a ("xprtrdma: Reduce the doorbell rate (Receive)") Signed-off-by: Chuck Lever Reviewed-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 766a1048a48a..132df9b59ab4 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -257,6 +257,7 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ ep->re_attr.cap.max_recv_wr = ep->re_max_requests; ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_recv_wr += RPCRDMA_MAX_RECV_BATCH; ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ ep->re_max_rdma_segs = From 15788d1d1077ebe029c48842c738876516d85076 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:09 -0400 Subject: [PATCH 0620/1379] xprtrdma: Do not refresh Receive Queue while it is draining Currently the Receive completion handler refreshes the Receive Queue whenever a successful Receive completion occurs. On disconnect, xprtrdma drains the Receive Queue. The first few Receive completions after a disconnect are typically successful, until the first flushed Receive. This means the Receive completion handler continues to post more Receive WRs after the drain sentinel has been posted. The late- posted Receives flush after the drain sentinel has completed, leading to a crash later in rpcrdma_xprt_disconnect(). To prevent this crash, xprtrdma has to ensure that the Receive handler stops posting Receives before ib_drain_rq() posts its drain sentinel. Suggested-by: Tom Talpey Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 13 +++++++++++++ net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 14 insertions(+) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ec912cf9c618..d8ed69442219 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -101,6 +101,12 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rdma_cm_id *id = ep->re_id; + /* Wait for rpcrdma_post_recvs() to leave its critical + * section. + */ + if (atomic_inc_return(&ep->re_receiving) > 1) + wait_for_completion(&ep->re_done); + /* Flush Receives, then wait for deferred Reply work * to complete. */ @@ -414,6 +420,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) __module_get(THIS_MODULE); device = id->device; ep->re_id = id; + reinit_completion(&ep->re_done); ep->re_max_requests = r_xprt->rx_xprt.max_reqs; ep->re_inline_send = xprt_rdma_max_inline_write; @@ -1385,6 +1392,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!temp) needed += RPCRDMA_MAX_RECV_BATCH; + if (atomic_inc_return(&ep->re_receiving) > 1) + goto out; + /* fast path: all needed reps can be found on the free list */ wr = NULL; while (needed) { @@ -1410,6 +1420,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); + if (atomic_dec_return(&ep->re_receiving) > 0) + complete(&ep->re_done); + out: trace_xprtrdma_post_recvs(r_xprt, count, rc); if (rc) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index fe3be985e239..31404326f29f 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -83,6 +83,7 @@ struct rpcrdma_ep { unsigned int re_max_inline_recv; int re_async_rc; int re_connect_status; + atomic_t re_receiving; atomic_t re_force_disconnect; struct ib_qp_init_attr re_attr; wait_queue_head_t re_connect_wait; From 5030c9a938f875f31932928632e1597f03e79ace Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:16 -0400 Subject: [PATCH 0621/1379] xprtrdma: Put flushed Receives on free list instead of destroying them Defer destruction of an rpcrdma_rep until transport tear-down to preserve the rb_all_reps list while Receives flush. Signed-off-by: Chuck Lever Reviewed-by: Tom Talpey Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d8ed69442219..1b599a623eea 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -80,6 +80,8 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_sendctx *sc); static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, + struct rpcrdma_rep *rep); static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); @@ -211,7 +213,7 @@ static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) out_flushed: rpcrdma_flush_disconnect(r_xprt, wc); - rpcrdma_rep_destroy(rep); + rpcrdma_rep_put(&r_xprt->rx_buf, rep); } static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, From eaf86e8cc85c4abf3e4a2a0d3f59af613d2bacab Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 24 Apr 2021 15:02:28 -0400 Subject: [PATCH 0622/1379] xprtrdma: Improve locking around rpcrdma_rep destruction Currently rpcrdma_reps_destroy() assumes that, at transport tear-down, the content of the rb_free_reps list is the same as the content of the rb_all_reps list. Although that is usually true, using the rb_all_reps list should be more reliable because of the way it's managed. And, rpcrdma_reps_unmap() uses rb_all_reps; these two functions should both traverse the "all" list. Ensure that all rpcrdma_reps are always destroyed whether they are on the rep free list or not. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1b599a623eea..2bc4589c37ce 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1007,16 +1007,23 @@ out: return NULL; } -/* No locking needed here. This function is invoked only by the - * Receive completion handler, or during transport shutdown. - */ -static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +static void rpcrdma_rep_free(struct rpcrdma_rep *rep) { - list_del(&rep->rr_all); rpcrdma_regbuf_free(rep->rr_rdmabuf); kfree(rep); } +static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep) +{ + struct rpcrdma_buffer *buf = &rep->rr_rxprt->rx_buf; + + spin_lock(&buf->rb_lock); + list_del(&rep->rr_all); + spin_unlock(&buf->rb_lock); + + rpcrdma_rep_free(rep); +} + static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) { struct llist_node *node; @@ -1049,8 +1056,18 @@ static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_rep *rep; - while ((rep = rpcrdma_rep_get_locked(buf)) != NULL) - rpcrdma_rep_destroy(rep); + spin_lock(&buf->rb_lock); + while ((rep = list_first_entry_or_null(&buf->rb_all_reps, + struct rpcrdma_rep, + rr_all)) != NULL) { + list_del(&rep->rr_all); + spin_unlock(&buf->rb_lock); + + rpcrdma_rep_free(rep); + + spin_lock(&buf->rb_lock); + } + spin_unlock(&buf->rb_lock); } /** From 8b5292be6880025cb3789cc811d19b4b8f0bf786 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:28 -0400 Subject: [PATCH 0623/1379] xprtrdma: Improve commentary around rpcrdma_reps_unmap() Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2bc4589c37ce..baf4b8c99d4c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1041,6 +1041,10 @@ static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, llist_add(&rep->rr_node, &buf->rb_free_reps); } +/* Caller must ensure the QP is quiescent (RQ is drained) before + * invoking this function, to guarantee rb_all_reps is not + * changing. + */ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; @@ -1048,7 +1052,7 @@ static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) list_for_each_entry(rep, &buf->rb_all_reps, rr_all) { rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); - rep->rr_temp = true; + rep->rr_temp = true; /* Mark this rep for destruction */ } } From 9e3ca33b62d4878f6ae39776abb6deebb37db597 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:35 -0400 Subject: [PATCH 0624/1379] xprtrdma: Improve locking around rpcrdma_rep creation Defensive clean up: Protect the rb_all_reps list during rep creation. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index baf4b8c99d4c..95ce9328e5d2 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -963,13 +963,11 @@ static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) rpcrdma_req_reset(req); } -/* No locking needed here. This function is called only by the - * Receive completion handler. - */ static noinline struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, bool temp) { + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; rep = kzalloc(sizeof(*rep), GFP_KERNEL); @@ -996,7 +994,10 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; rep->rr_recv_wr.num_sge = 1; rep->rr_temp = temp; - list_add(&rep->rr_all, &r_xprt->rx_buf.rb_all_reps); + + spin_lock(&buf->rb_lock); + list_add(&rep->rr_all, &buf->rb_all_reps); + spin_unlock(&buf->rb_lock); return rep; out_free_regbuf: From 35d8b10a25884050bb3b0149b62c3818ec59f77c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:41 -0400 Subject: [PATCH 0625/1379] xprtrdma: Fix cwnd update ordering After a reconnect, the reply handler is opening the cwnd (and thus enabling more RPC Calls to be sent) /before/ rpcrdma_post_recvs() can post enough Receive WRs to receive their replies. This causes an RNR and the new connection is lost immediately. The race is most clearly exposed when KASAN and disconnect injection are enabled. This slows down rpcrdma_rep_create() enough to allow the send side to post a bunch of RPC Calls before the Receive completion handler can invoke ib_post_recv(). Fixes: 2ae50ad68cd7 ("xprtrdma: Close window between waking RPC senders and posting Receives") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 3 ++- net/sunrpc/xprtrdma/verbs.c | 10 +++++----- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 292f066d006e..21ddd78a8c35 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1430,9 +1430,10 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_ep->re_max_requests) credits = r_xprt->rx_ep->re_max_requests; + rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1), + false); if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); - rpcrdma_post_recvs(r_xprt, false); req = rpcr_to_rdmar(rqst); if (unlikely(req->rl_reply)) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 95ce9328e5d2..de96fcede7d4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -544,7 +544,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) * outstanding Receives. */ rpcrdma_ep_get(ep); - rpcrdma_post_recvs(r_xprt, true); + rpcrdma_post_recvs(r_xprt, 1, true); rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) @@ -1395,21 +1395,21 @@ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /** * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance - * @temp: mark Receive buffers to be deleted after use + * @needed: current credit grant + * @temp: mark Receive buffers to be deleted after one use * */ -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_recv_wr *wr, *bad_wr; struct rpcrdma_rep *rep; - int needed, count, rc; + int count, rc; rc = 0; count = 0; - needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); if (likely(ep->re_receive_count > needed)) goto out; needed -= ep->re_receive_count; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 31404326f29f..2504f67af63e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -462,7 +462,7 @@ int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); +void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); /* * Buffer calls - xprtrdma/verbs.c From c35ca60d490e32b7e7d21f344693ea29d4f4a9d3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:47 -0400 Subject: [PATCH 0626/1379] xprtrdma: Delete rpcrdma_recv_buffer_put() Clean up: The name recv_buffer_put() is a vestige of older code, and the function is just a wrapper for the newer rpcrdma_rep_put(). In most of the existing call sites, a pointer to the owning rpcrdma_buffer is already available. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/backchannel.c | 4 +++- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ++-- net/sunrpc/xprtrdma/verbs.c | 24 ++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index a249837d6a55..1151efd09b27 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -155,9 +155,11 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) { struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_rep *rep = req->rl_reply; struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - rpcrdma_recv_buffer_put(req->rl_reply); + rpcrdma_rep_put(&r_xprt->rx_buf, rep); req->rl_reply = NULL; spin_lock(&xprt->bc_pa_lock); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 21ddd78a8c35..be4e888e78a3 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1437,7 +1437,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) req = rpcr_to_rdmar(rqst); if (unlikely(req->rl_reply)) - rpcrdma_recv_buffer_put(req->rl_reply); + rpcrdma_rep_put(buf, req->rl_reply); req->rl_reply = rep; rep->rr_rqst = rqst; @@ -1465,5 +1465,5 @@ out_shortreply: trace_xprtrdma_reply_short_err(rep); out: - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_put(buf, rep); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index de96fcede7d4..5002d86a55ee 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -80,8 +80,6 @@ static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, struct rpcrdma_sendctx *sc); static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); -static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, - struct rpcrdma_rep *rep); static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); @@ -1036,8 +1034,13 @@ static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) return llist_entry(node, struct rpcrdma_rep, rr_node); } -static void rpcrdma_rep_put(struct rpcrdma_buffer *buf, - struct rpcrdma_rep *rep) +/** + * rpcrdma_rep_put - Release rpcrdma_rep back to free list + * @buf: buffer pool + * @rep: rep to release + * + */ +void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep) { llist_add(&rep->rr_node, &buf->rb_free_reps); } @@ -1252,17 +1255,6 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) spin_unlock(&buffers->rb_lock); } -/** - * rpcrdma_recv_buffer_put - Release rpcrdma_rep back to free list - * @rep: rep to release - * - * Used after error conditions. - */ -void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) -{ - rpcrdma_rep_put(&rep->rr_rxprt->rx_buf, rep); -} - /* Returns a pointer to a rpcrdma_regbuf object, or NULL. * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for @@ -1455,7 +1447,7 @@ out: rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr); wr = wr->next; - rpcrdma_recv_buffer_put(rep); + rpcrdma_rep_put(buf, rep); --count; } } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2504f67af63e..9c5039d4634a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -481,7 +481,7 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req); -void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); +void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep); bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags); From 1363e6388c363d0433f9aa4e2f33efe047572687 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:02:54 -0400 Subject: [PATCH 0627/1379] xprtrdma: rpcrdma_mr_pop() already does list_del_init() The rpcrdma_mr_pop() earlier in the function has already cleared out mr_list, so it must not be done again in the error path. Fixes: 847568942f93 ("xprtrdma: Remove fr_state") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 132df9b59ab4..aca2228095db 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -576,7 +576,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) mr = container_of(frwr, struct rpcrdma_mr, frwr); bad_wr = bad_wr->next; - list_del_init(&mr->mr_list); frwr_mr_recycle(mr); } } From f912af77e2c1ba25bd40534668b10da5b20f686a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:00 -0400 Subject: [PATCH 0628/1379] xprtrdma: Rename frwr_release_mr() Clean up: To be consistent with other functions in this source file, follow the naming convention of putting the object being acted upon before the action itself. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 6 +++--- net/sunrpc/xprtrdma/verbs.c | 4 ++-- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index aca2228095db..bfc057fdb75f 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -50,11 +50,11 @@ #endif /** - * frwr_release_mr - Destroy one MR + * frwr_mr_release - Destroy one MR * @mr: MR allocated by frwr_mr_init * */ -void frwr_release_mr(struct rpcrdma_mr *mr) +void frwr_mr_release(struct rpcrdma_mr *mr) { int rc; @@ -88,7 +88,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) r_xprt->rx_stats.mrs_recycled++; spin_unlock(&r_xprt->rx_buf.rb_lock); - frwr_release_mr(mr); + frwr_mr_release(mr); } static void frwr_mr_put(struct rpcrdma_mr *mr) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 5002d86a55ee..f97b03129d99 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1138,7 +1138,7 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req) list_del(&mr->mr_all); spin_unlock(&buf->rb_lock); - frwr_release_mr(mr); + frwr_mr_release(mr); } rpcrdma_regbuf_free(req->rl_recvbuf); @@ -1169,7 +1169,7 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt) list_del(&mr->mr_all); spin_unlock(&buf->rb_lock); - frwr_release_mr(mr); + frwr_mr_release(mr); spin_lock(&buf->rb_lock); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 9c5039d4634a..1b187d1dee8a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -528,7 +528,7 @@ rpcrdma_data_dir(bool writing) void frwr_reset(struct rpcrdma_req *req); int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); -void frwr_release_mr(struct rpcrdma_mr *mr); +void frwr_mr_release(struct rpcrdma_mr *mr); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, From 44438ad9ae22277a261f9fa4fdc6387a8ff50f2e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:06 -0400 Subject: [PATCH 0629/1379] xprtrdma: Clarify use of barrier in frwr_wc_localinv_done() Clean up: The comment and the placement of the memory barrier is confusing. Humans want to read the function statements from head to tail. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index bfc057fdb75f..af85cec0ce31 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -592,14 +592,16 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); - struct rpcrdma_rep *rep = mr->mr_req->rl_reply; + struct rpcrdma_rep *rep; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); - frwr_mr_done(wc, mr); - /* Ensure @rep is generated before frwr_mr_done */ + /* Ensure that @rep is generated before the MR is released */ + rep = mr->mr_req->rl_reply; smp_rmb(); + + frwr_mr_done(wc, mr); rpcrdma_complete_rqst(rep); rpcrdma_flush_disconnect(cq->cq_context, wc); From e4b52ca01315ad53df41877708428c1c41c1444d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:12 -0400 Subject: [PATCH 0630/1379] xprtrdma: Do not recycle MR after FastReg/LocalInv flushes Better not to touch MRs involved in a flush or post error until the Send and Receive Queues are drained and the transport is fully quiescent. Simply don't insert such MRs back onto the free list. They remain on mr_all and will be released when the connection is torn down. I had thought that recycling would prevent hardware resources from being tied up for a long time. However, since v5.7, a transport disconnect destroys the QP and other hardware-owned resources. The MRs get cleaned up nicely at that point. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 1 - net/sunrpc/xprtrdma/frwr_ops.c | 71 +++++++++------------------------- 2 files changed, 18 insertions(+), 54 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index c838e7ac1c2d..e38e745d13b0 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1014,7 +1014,6 @@ DEFINE_MR_EVENT(localinv); DEFINE_MR_EVENT(map); DEFINE_ANON_MR_EVENT(unmap); -DEFINE_ANON_MR_EVENT(recycle); TRACE_EVENT(xprtrdma_dma_maperr, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index af85cec0ce31..27087dc8ba3c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -49,22 +49,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -/** - * frwr_mr_release - Destroy one MR - * @mr: MR allocated by frwr_mr_init - * - */ -void frwr_mr_release(struct rpcrdma_mr *mr) -{ - int rc; - - rc = ib_dereg_mr(mr->frwr.fr_mr); - if (rc) - trace_xprtrdma_frwr_dereg(mr, rc); - kfree(mr->mr_sg); - kfree(mr); -} - static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { if (mr->mr_device) { @@ -75,20 +59,22 @@ static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) } } -static void frwr_mr_recycle(struct rpcrdma_mr *mr) +/** + * frwr_mr_release - Destroy one MR + * @mr: MR allocated by frwr_mr_init + * + */ +void frwr_mr_release(struct rpcrdma_mr *mr) { - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + int rc; - trace_xprtrdma_mr_recycle(mr); + frwr_mr_unmap(mr->mr_xprt, mr); - frwr_mr_unmap(r_xprt, mr); - - spin_lock(&r_xprt->rx_buf.rb_lock); - list_del(&mr->mr_all); - r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_lock); - - frwr_mr_release(mr); + rc = ib_dereg_mr(mr->frwr.fr_mr); + if (rc) + trace_xprtrdma_frwr_dereg(mr, rc); + kfree(mr->mr_sg); + kfree(mr); } static void frwr_mr_put(struct rpcrdma_mr *mr) @@ -365,6 +351,7 @@ out_mapmr_err: * @cq: completion queue * @wc: WCE for a completed FastReg WR * + * Each flushed MR gets destroyed after the QP has drained. */ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { @@ -374,7 +361,6 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid); - /* The MR will get recycled when the associated req is retransmitted */ rpcrdma_flush_disconnect(cq->cq_context, wc); } @@ -448,9 +434,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) { - if (wc->status != IB_WC_SUCCESS) - frwr_mr_recycle(mr); - else + if (likely(wc->status == IB_WC_SUCCESS)) frwr_mr_put(mr); } @@ -567,17 +551,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) if (!rc) return; - /* Recycle MRs in the LOCAL_INV chain that did not get posted. - */ + /* On error, the MRs get destroyed once the QP has drained. */ trace_xprtrdma_post_linv_err(req, rc); - while (bad_wr) { - frwr = container_of(bad_wr, struct rpcrdma_frwr, - fr_invwr); - mr = container_of(frwr, struct rpcrdma_mr, frwr); - bad_wr = bad_wr->next; - - frwr_mr_recycle(mr); - } } /** @@ -621,7 +596,6 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; struct rpcrdma_ep *ep = r_xprt->rx_ep; - const struct ib_send_wr *bad_wr; struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; int rc; @@ -663,21 +637,12 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * replaces the QP. The RPC reply handler won't call us * unless re_id->qp is a valid pointer. */ - bad_wr = NULL; - rc = ib_post_send(ep->re_id->qp, first, &bad_wr); + rc = ib_post_send(ep->re_id->qp, first, NULL); if (!rc) return; - /* Recycle MRs in the LOCAL_INV chain that did not get posted. - */ + /* On error, the MRs get destroyed once the QP has drained. */ trace_xprtrdma_post_linv_err(req, rc); - while (bad_wr) { - frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); - mr = container_of(frwr, struct rpcrdma_mr, frwr); - bad_wr = bad_wr->next; - - frwr_mr_recycle(mr); - } /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will From 8a053433de00380a9c5758d94c7c2ec2e25321fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:19 -0400 Subject: [PATCH 0631/1379] xprtrdma: Do not wake RPC consumer on a failed LocalInv Throw away any reply where the LocalInv flushes or could not be posted. The registered memory region is in an unknown state until the disconnect completes. rpcrdma_xprt_disconnect() will find and release the MR. No need to put it back on the MR free list in this case. The client retransmits pending RPC requests once it reestablishes a fresh connection, so a replacement reply should be forthcoming on the next connection instance. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 17 +++++++++++------ net/sunrpc/xprtrdma/rpc_rdma.c | 32 +++++++++++++++++++++++++++++--- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 27087dc8ba3c..951ae20485f3 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -576,10 +576,14 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) rep = mr->mr_req->rl_reply; smp_rmb(); - frwr_mr_done(wc, mr); + if (wc->status != IB_WC_SUCCESS) { + if (rep) + rpcrdma_unpin_rqst(rep); + rpcrdma_flush_disconnect(cq->cq_context, wc); + return; + } + frwr_mr_put(mr); rpcrdma_complete_rqst(rep); - - rpcrdma_flush_disconnect(cq->cq_context, wc); } /** @@ -645,8 +649,9 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) trace_xprtrdma_post_linv_err(req, rc); /* The final LOCAL_INV WR in the chain is supposed to - * do the wake. If it was never posted, the wake will - * not happen, so wake here in that case. + * do the wake. If it was never posted, the wake does + * not happen. Unpin the rqst in preparation for its + * retransmission. */ - rpcrdma_complete_rqst(req->rl_reply); + rpcrdma_unpin_rqst(req->rl_reply); } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index be4e888e78a3..649f7d8b9733 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1326,9 +1326,35 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, return -EIO; } -/* Perform XID lookup, reconstruction of the RPC reply, and - * RPC completion while holding the transport lock to ensure - * the rep, rqst, and rq_task pointers remain stable. +/** + * rpcrdma_unpin_rqst - Release rqst without completing it + * @rep: RPC/RDMA Receive context + * + * This is done when a connection is lost so that a Reply + * can be dropped and its matching Call can be subsequently + * retransmitted on a new connection. + */ +void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep) +{ + struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt; + struct rpc_rqst *rqst = rep->rr_rqst; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + req->rl_reply = NULL; + rep->rr_rqst = NULL; + + spin_lock(&xprt->queue_lock); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->queue_lock); +} + +/** + * rpcrdma_complete_rqst - Pass completed rqst back to RPC + * @rep: RPC/RDMA Receive context + * + * Reconstruct the RPC reply and complete the transaction + * while @rqst is still pinned to ensure the rep, rqst, and + * rq_task pointers remain stable. */ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) { diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 1b187d1dee8a..bb8aba390b88 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -561,6 +561,7 @@ int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep); void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); +void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct rpcrdma_rep *rep); static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) From b3ce7a25f44f03d481d12a17768cfce18b942ec2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:25 -0400 Subject: [PATCH 0632/1379] xprtrdma: Avoid Send Queue wrapping Send WRs can be signalled or unsignalled. A signalled Send WR always has a matching Send completion, while a unsignalled Send has a completion only if the Send WR fails. xprtrdma has a Send account mechanism that is designed to reduce the number of signalled Send WRs. This in turn mitigates the interrupt rate of the underlying device. RDMA consumers can't leave all Sends unsignaled, however, because providers rely on Send completions to maintain their Send Queue head and tail pointers. xprtrdma counts the number of unsignaled Send WRs that have been posted to ensure that Sends are signalled often enough to prevent the Send Queue from wrapping. This mechanism neglected to account for FastReg WRs, which are posted on the Send Queue but never signalled. As a result, the Send Queue wrapped on occasion, resulting in duplication completions of FastReg and LocalInv WRs. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 17 +++++++++++++++-- net/sunrpc/xprtrdma/verbs.c | 16 +--------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 951ae20485f3..43a412ea337a 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -390,11 +390,13 @@ static void frwr_cid_init(struct rpcrdma_ep *ep, */ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { + struct ib_send_wr *post_wr, *send_wr = &req->rl_wr; struct rpcrdma_ep *ep = r_xprt->rx_ep; - struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; + unsigned int num_wrs; - post_wr = &req->rl_wr; + num_wrs = 1; + post_wr = send_wr; list_for_each_entry(mr, &req->rl_registered, mr_list) { struct rpcrdma_frwr *frwr; @@ -409,8 +411,19 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr->fr_regwr.wr.send_flags = 0; post_wr = &frwr->fr_regwr.wr; + ++num_wrs; } + if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) { + send_wr->send_flags |= IB_SEND_SIGNALED; + ep->re_send_count = min_t(unsigned int, ep->re_send_batch, + num_wrs - ep->re_send_count); + } else { + send_wr->send_flags &= ~IB_SEND_SIGNALED; + ep->re_send_count -= num_wrs; + } + + trace_xprtrdma_post_send(req); return ib_post_send(ep->re_id->qp, post_wr, NULL); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f97b03129d99..72b24dca96c1 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1365,21 +1365,7 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) */ int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { - struct ib_send_wr *send_wr = &req->rl_wr; - struct rpcrdma_ep *ep = r_xprt->rx_ep; - int rc; - - if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) { - send_wr->send_flags |= IB_SEND_SIGNALED; - ep->re_send_count = ep->re_send_batch; - } else { - send_wr->send_flags &= ~IB_SEND_SIGNALED; - --ep->re_send_count; - } - - trace_xprtrdma_post_send(req); - rc = frwr_send(r_xprt, req); - if (rc) + if (frwr_send(r_xprt, req)) return -ENOTCONN; return 0; } From 4ddd0fc32c94fbb77a8c0728dc507b2bdcc67edc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:31 -0400 Subject: [PATCH 0633/1379] xprtrdma: Add tracepoints showing FastReg WRs and remote invalidation The Send signaling logic is a little subtle, so add some observability around it. For every xprtrdma_mr_fastreg event, there should be an xprtrdma_mr_localinv or xprtrdma_mr_reminv event. When these tracepoints are enabled, we can see exactly when an MR is DMA-mapped, registered, invalidated (either locally or remotely) and then DMA-unmapped. kworker/u25:2-190 [000] 787.979512: xprtrdma_mr_map: task:351@5 mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) kworker/u25:2-190 [000] 787.979515: xprtrdma_chunk_read: task:351@5 pos=148 5608@0x8679e0c8f6f56000:0x00000503 (last) kworker/u25:2-190 [000] 787.979519: xprtrdma_marshal: task:351@5 xid=0x8679e0c8: hdr=52 xdr=148/5608/0 read list/inline kworker/u25:2-190 [000] 787.979525: xprtrdma_mr_fastreg: task:351@5 mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) kworker/u25:2-190 [000] 787.979526: xprtrdma_post_send: task:351@5 cq.id=0 cid=73 (2 SGEs) ... kworker/5:1H-219 [005] 787.980567: xprtrdma_wc_receive: cq.id=1 cid=161 status=SUCCESS (0/0x0) received=164 kworker/5:1H-219 [005] 787.980571: xprtrdma_post_recvs: peer=[192.168.100.55]:20049 r_xprt=0xffff8884974d4000: 0 new recvs, 70 active (rc 0) kworker/5:1H-219 [005] 787.980573: xprtrdma_reply: task:351@5 xid=0x8679e0c8 credits=64 kworker/5:1H-219 [005] 787.980576: xprtrdma_mr_reminv: task:351@5 mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) kworker/5:1H-219 [005] 787.980577: xprtrdma_mr_unmap: mr.id=4 nents=2 5608@0x8679e0c8f6f56000:0x00000503 (TO_DEVICE) Note that I've moved the xprtrdma_post_send tracepoint so that event always appears after the xprtrdma_mr_fastreg tracepoint. Otherwise the event log looks counterintuitive (FastReg is always supposed to happen before Send). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 2 ++ net/sunrpc/xprtrdma/frwr_ops.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index e38e745d13b0..9462326b3535 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1010,7 +1010,9 @@ TRACE_EVENT(xprtrdma_frwr_maperr, ) ); +DEFINE_MR_EVENT(fastreg); DEFINE_MR_EVENT(localinv); +DEFINE_MR_EVENT(reminv); DEFINE_MR_EVENT(map); DEFINE_ANON_MR_EVENT(unmap); diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 43a412ea337a..0f47c1e24037 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -400,6 +400,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) list_for_each_entry(mr, &req->rl_registered, mr_list) { struct rpcrdma_frwr *frwr; + trace_xprtrdma_mr_fastreg(mr); frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_fastreg; @@ -440,6 +441,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); + trace_xprtrdma_mr_reminv(mr); frwr_mr_put(mr); break; /* only one invalidated MR per RPC */ } From 6b147ea7f442e1fb31dfa25e25b7a8ca3fb817f0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:38 -0400 Subject: [PATCH 0634/1379] xprtrdma: Add an rpcrdma_mr_completion_class I found it confusing that the MR_EVENT class displays the mr.id but the associated COMPLETION_EVENT class displays a cid (that happens to contain the mr.id!). To make it a little easier on humans who have to read and interpret these events, create an MR_COMPLETION class that displays the mr.id in the same way as the MR_EVENT class. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 48 +++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 9462326b3535..3e6e4c69b533 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -60,6 +60,46 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class, ), \ TP_ARGS(wc, cid)) +DECLARE_EVENT_CLASS(rpcrdma_mr_completion_class, + TP_PROTO( + const struct ib_wc *wc, + const struct rpc_rdma_cid *cid + ), + + TP_ARGS(wc, cid), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) + __field(unsigned long, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; + __entry->status = wc->status; + if (wc->status) + __entry->vendor_err = wc->vendor_err; + else + __entry->vendor_err = 0; + ), + + TP_printk("cq.id=%u mr.id=%d status=%s (%lu/0x%x)", + __entry->cq_id, __entry->completion_id, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err + ) +); + +#define DEFINE_MR_COMPLETION_EVENT(name) \ + DEFINE_EVENT(rpcrdma_mr_completion_class, name, \ + TP_PROTO( \ + const struct ib_wc *wc, \ + const struct rpc_rdma_cid *cid \ + ), \ + TP_ARGS(wc, cid)) + DECLARE_EVENT_CLASS(rpcrdma_receive_completion_class, TP_PROTO( const struct ib_wc *wc, @@ -886,10 +926,10 @@ TRACE_EVENT(xprtrdma_post_linv_err, DEFINE_RECEIVE_COMPLETION_EVENT(xprtrdma_wc_receive); DEFINE_COMPLETION_EVENT(xprtrdma_wc_send); -DEFINE_COMPLETION_EVENT(xprtrdma_wc_fastreg); -DEFINE_COMPLETION_EVENT(xprtrdma_wc_li); -DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_wake); -DEFINE_COMPLETION_EVENT(xprtrdma_wc_li_done); +DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_fastreg); +DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li); +DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li_wake); +DEFINE_MR_COMPLETION_EVENT(xprtrdma_wc_li_done); TRACE_EVENT(xprtrdma_frwr_alloc, TP_PROTO( From 83189d15115467061295c0b75334b39fc64c6142 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:44 -0400 Subject: [PATCH 0635/1379] xprtrdma: Don't display r_xprt memory addresses in tracepoints The remote peer's IP address is sufficient, and does not expose details of the kernel's memory layout. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 51 +++++++++++++--------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 3e6e4c69b533..e38b8e33be2d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -190,19 +190,17 @@ DECLARE_EVENT_CLASS(xprtrdma_rxprt, TP_ARGS(r_xprt), TP_STRUCT__entry( - __field(const void *, r_xprt) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( - __entry->r_xprt = r_xprt; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p", - __get_str(addr), __get_str(port), __entry->r_xprt + TP_printk("peer=[%s]:%s", + __get_str(addr), __get_str(port) ) ); @@ -222,7 +220,6 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, TP_ARGS(r_xprt, rc), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(int, rc) __field(int, connect_status) __string(addr, rpcrdma_addrstr(r_xprt)) @@ -230,15 +227,14 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class, ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->rc = rc; __entry->connect_status = r_xprt->rx_ep->re_connect_status; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connection status=%d", - __get_str(addr), __get_str(port), __entry->r_xprt, + TP_printk("peer=[%s]:%s rc=%d connection status=%d", + __get_str(addr), __get_str(port), __entry->rc, __entry->connect_status ) ); @@ -535,22 +531,19 @@ TRACE_EVENT(xprtrdma_op_connect, TP_ARGS(r_xprt, delay), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(unsigned long, delay) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->delay = delay; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p delay=%lu", - __get_str(addr), __get_str(port), __entry->r_xprt, - __entry->delay + TP_printk("peer=[%s]:%s delay=%lu", + __get_str(addr), __get_str(port), __entry->delay ) ); @@ -565,7 +558,6 @@ TRACE_EVENT(xprtrdma_op_set_cto, TP_ARGS(r_xprt, connect, reconnect), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(unsigned long, connect) __field(unsigned long, reconnect) __string(addr, rpcrdma_addrstr(r_xprt)) @@ -573,15 +565,14 @@ TRACE_EVENT(xprtrdma_op_set_cto, ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->connect = connect; __entry->reconnect = reconnect; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: connect=%lu reconnect=%lu", - __get_str(addr), __get_str(port), __entry->r_xprt, + TP_printk("peer=[%s]:%s connect=%lu reconnect=%lu", + __get_str(addr), __get_str(port), __entry->connect / HZ, __entry->reconnect / HZ ) ); @@ -631,22 +622,19 @@ TRACE_EVENT(xprtrdma_createmrs, TP_ARGS(r_xprt, count), TP_STRUCT__entry( - __field(const void *, r_xprt) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) __field(unsigned int, count) ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->count = count; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: created %u MRs", - __get_str(addr), __get_str(port), __entry->r_xprt, - __entry->count + TP_printk("peer=[%s]:%s created %u MRs", + __get_str(addr), __get_str(port), __entry->count ) ); @@ -869,7 +857,7 @@ TRACE_EVENT(xprtrdma_post_recvs, TP_ARGS(r_xprt, count, status), TP_STRUCT__entry( - __field(const void *, r_xprt) + __field(u32, cq_id) __field(unsigned int, count) __field(int, status) __field(int, posted) @@ -878,16 +866,18 @@ TRACE_EVENT(xprtrdma_post_recvs, ), TP_fast_assign( - __entry->r_xprt = r_xprt; + const struct rpcrdma_ep *ep = r_xprt->rx_ep; + + __entry->cq_id = ep->re_attr.recv_cq->res.id; __entry->count = count; __entry->status = status; - __entry->posted = r_xprt->rx_ep->re_receive_count; + __entry->posted = ep->re_receive_count; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: %u new recvs, %d active (rc %d)", - __get_str(addr), __get_str(port), __entry->r_xprt, + TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active (rc %d)", + __get_str(addr), __get_str(port), __entry->cq_id, __entry->count, __entry->posted, __entry->status ) ); @@ -1289,22 +1279,19 @@ TRACE_EVENT(xprtrdma_cb_setup, TP_ARGS(r_xprt, reqs), TP_STRUCT__entry( - __field(const void *, r_xprt) __field(unsigned int, reqs) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) ), TP_fast_assign( - __entry->r_xprt = r_xprt; __entry->reqs = reqs; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s r_xprt=%p: %u reqs", - __get_str(addr), __get_str(port), - __entry->r_xprt, __entry->reqs + TP_printk("peer=[%s]:%s %u reqs", + __get_str(addr), __get_str(port), __entry->reqs ) ); From e1648eb23d839bd4b9f2999296d5e81dcd93311f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:50 -0400 Subject: [PATCH 0636/1379] xprtrdma: Remove the RPC/RDMA QP event handler Clean up: The handler only recorded a trace event. If indeed no action is needed by the RPC/RDMA consumer, then the event can be ignored. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 32 -------------------------------- net/sunrpc/xprtrdma/verbs.c | 18 ------------------ 2 files changed, 50 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index e38b8e33be2d..ef6166b840e7 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -577,38 +577,6 @@ TRACE_EVENT(xprtrdma_op_set_cto, ) ); -TRACE_EVENT(xprtrdma_qp_event, - TP_PROTO( - const struct rpcrdma_ep *ep, - const struct ib_event *event - ), - - TP_ARGS(ep, event), - - TP_STRUCT__entry( - __field(unsigned long, event) - __string(name, event->device->name) - __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) - __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) - ), - - TP_fast_assign( - const struct rdma_cm_id *id = ep->re_id; - - __entry->event = event->event; - __assign_str(name, event->device->name); - memcpy(__entry->srcaddr, &id->route.addr.src_addr, - sizeof(struct sockaddr_in6)); - memcpy(__entry->dstaddr, &id->route.addr.dst_addr, - sizeof(struct sockaddr_in6)); - ), - - TP_printk("%pISpc -> %pISpc device=%s %s (%lu)", - __entry->srcaddr, __entry->dstaddr, __get_str(name), - rdma_show_ib_event(__entry->event), __entry->event - ) -); - /** ** Call events **/ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 72b24dca96c1..1e965a380896 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -120,22 +120,6 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) rpcrdma_ep_put(ep); } -/** - * rpcrdma_qp_event_handler - Handle one QP event (error notification) - * @event: details of the event - * @context: ep that owns QP where event occurred - * - * Called from the RDMA provider (device driver) possibly in an interrupt - * context. The QP is always destroyed before the ID, so the ID will be - * reliably available when this handler is invoked. - */ -static void rpcrdma_qp_event_handler(struct ib_event *event, void *context) -{ - struct rpcrdma_ep *ep = context; - - trace_xprtrdma_qp_event(ep, event); -} - /* Ensure xprt_force_disconnect() is invoked exactly once when a * connection is closed or lost. (The important thing is it needs * to be invoked "at least" once). @@ -431,8 +415,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests); - ep->re_attr.event_handler = rpcrdma_qp_event_handler; - ep->re_attr.qp_context = ep; ep->re_attr.srq = NULL; ep->re_attr.cap.max_inline_data = 0; ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR; From 0a26d10e300204f2a064e44fb181323bc6d986eb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:03:56 -0400 Subject: [PATCH 0637/1379] xprtrdma: Move fr_cid to struct rpcrdma_mr Clean up (for several purposes): - The MR's cid is initialized sooner so that tracepoints can show something reasonable even if the MR is never posted. - The MR's res.id doesn't change so the cid won't change either. Initializing the cid once is sufficient. - struct rpcrdma_frwr is going away soon. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 31 +++++++++++++++---------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 0f47c1e24037..d3c18c776bf9 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -49,6 +49,15 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static void frwr_cid_init(struct rpcrdma_ep *ep, + struct rpcrdma_mr *mr) +{ + struct rpc_rdma_cid *cid = &mr->mr_cid; + + cid->ci_queue_id = ep->re_attr.send_cq->res.id; + cid->ci_completion_id = mr->frwr.fr_mr->res.id; +} + static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { if (mr->mr_device) { @@ -134,6 +143,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); init_completion(&mr->frwr.fr_linv_done); + frwr_cid_init(ep, mr); sg_init_table(sg, depth); mr->mr_sg = sg; @@ -358,22 +368,14 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_frwr *frwr = container_of(cqe, struct rpcrdma_frwr, fr_cqe); + struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_fastreg(wc, &frwr->fr_cid); + trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid); rpcrdma_flush_disconnect(cq->cq_context, wc); } -static void frwr_cid_init(struct rpcrdma_ep *ep, - struct rpcrdma_frwr *frwr) -{ - struct rpc_rdma_cid *cid = &frwr->fr_cid; - - cid->ci_queue_id = ep->re_attr.send_cq->res.id; - cid->ci_completion_id = frwr->fr_mr->res.id; -} - /** * frwr_send - post Send WRs containing the RPC Call message * @r_xprt: controlling transport instance @@ -404,7 +406,6 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_fastreg; - frwr_cid_init(ep, frwr); frwr->fr_regwr.wr.next = post_wr; frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; frwr->fr_regwr.wr.num_sge = 0; @@ -467,7 +468,7 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li(wc, &frwr->fr_cid); + trace_xprtrdma_wc_li(wc, &mr->mr_cid); frwr_mr_done(wc, mr); rpcrdma_flush_disconnect(cq->cq_context, wc); @@ -488,7 +489,7 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_wake(wc, &frwr->fr_cid); + trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid); frwr_mr_done(wc, mr); complete(&frwr->fr_linv_done); @@ -529,7 +530,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; - frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; @@ -585,7 +585,7 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) struct rpcrdma_rep *rep; /* WARNING: Only wr_cqe and status are reliable at this point */ - trace_xprtrdma_wc_li_done(wc, &frwr->fr_cid); + trace_xprtrdma_wc_li_done(wc, &mr->mr_cid); /* Ensure that @rep is generated before the MR is released */ rep = mr->mr_req->rl_reply; @@ -631,7 +631,6 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) frwr = &mr->frwr; frwr->fr_cqe.done = frwr_wc_localinv; - frwr_cid_init(ep, frwr); last = &frwr->fr_invwr; last->next = NULL; last->wr_cqe = &frwr->fr_cqe; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index bb8aba390b88..0cf073f0ee64 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -232,7 +232,6 @@ struct rpcrdma_sendctx { struct rpcrdma_frwr { struct ib_mr *fr_mr; struct ib_cqe fr_cqe; - struct rpc_rdma_cid fr_cid; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -254,6 +253,7 @@ struct rpcrdma_mr { u32 mr_length; u64 mr_offset; struct list_head mr_all; + struct rpc_rdma_cid mr_cid; }; /* From e10fa96d347488d1fd278e84f52ba7b25067cc71 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:04:03 -0400 Subject: [PATCH 0638/1379] xprtrdma: Move cqe to struct rpcrdma_mr Clean up. - Simplify variable initialization in the completion handlers. - Move another field out of struct rpcrdma_frwr. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 35 ++++++++++++++------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index d3c18c776bf9..2a886a28d82b 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -366,9 +366,7 @@ out_mapmr_err: static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_fastreg(wc, &mr->mr_cid); @@ -405,9 +403,9 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) trace_xprtrdma_mr_fastreg(mr); frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_fastreg; + mr->mr_cqe.done = frwr_wc_fastreg; frwr->fr_regwr.wr.next = post_wr; - frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; + frwr->fr_regwr.wr.wr_cqe = &mr->mr_cqe; frwr->fr_regwr.wr.num_sge = 0; frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; frwr->fr_regwr.wr.send_flags = 0; @@ -463,9 +461,7 @@ static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr) static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li(wc, &mr->mr_cid); @@ -484,9 +480,8 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); + struct rpcrdma_frwr *frwr = &mr->frwr; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid); @@ -529,16 +524,17 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) r_xprt->rx_stats.local_inv_needed++; frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; last->next = NULL; - last->wr_cqe = &frwr->fr_cqe; + last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; last->num_sge = 0; last->opcode = IB_WR_LOCAL_INV; last->send_flags = IB_SEND_SIGNALED; last->ex.invalidate_rkey = mr->mr_handle; + last->wr_cqe->done = frwr_wc_localinv; + *prev = last; prev = &last->next; } @@ -547,7 +543,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * last WR in the chain completes, all WRs in the chain * are complete. */ - frwr->fr_cqe.done = frwr_wc_localinv_wake; + last->wr_cqe->done = frwr_wc_localinv_wake; reinit_completion(&frwr->fr_linv_done); /* Transport disconnect drains the receive CQ before it @@ -579,9 +575,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; - struct rpcrdma_frwr *frwr = - container_of(cqe, struct rpcrdma_frwr, fr_cqe); - struct rpcrdma_mr *mr = container_of(frwr, struct rpcrdma_mr, frwr); + struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); struct rpcrdma_rep *rep; /* WARNING: Only wr_cqe and status are reliable at this point */ @@ -630,16 +624,17 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) r_xprt->rx_stats.local_inv_needed++; frwr = &mr->frwr; - frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; last->next = NULL; - last->wr_cqe = &frwr->fr_cqe; + last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; last->num_sge = 0; last->opcode = IB_WR_LOCAL_INV; last->send_flags = IB_SEND_SIGNALED; last->ex.invalidate_rkey = mr->mr_handle; + last->wr_cqe->done = frwr_wc_localinv; + *prev = last; prev = &last->next; } @@ -649,7 +644,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * are complete. The last completion will wake up the * RPC waiter. */ - frwr->fr_cqe.done = frwr_wc_localinv_done; + last->wr_cqe->done = frwr_wc_localinv_done; /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0cf073f0ee64..f72b69c3f0ea 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -231,7 +231,6 @@ struct rpcrdma_sendctx { */ struct rpcrdma_frwr { struct ib_mr *fr_mr; - struct ib_cqe fr_cqe; struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; @@ -247,6 +246,7 @@ struct rpcrdma_mr { struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; + struct ib_cqe mr_cqe; struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; From 9a301cafc8619c7f30032d314da6e65d9d913d57 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:04:09 -0400 Subject: [PATCH 0639/1379] xprtrdma: Move fr_linv_done field to struct rpcrdma_mr Clean up: Move more of struct rpcrdma_frwr into its parent. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 9 ++++----- net/sunrpc/xprtrdma/xprt_rdma.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 2a886a28d82b..7d7a64d2ca4a 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -142,7 +142,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) mr->frwr.fr_mr = frmr; mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); - init_completion(&mr->frwr.fr_linv_done); + init_completion(&mr->mr_linv_done); frwr_cid_init(ep, mr); sg_init_table(sg, depth); @@ -481,12 +481,11 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_mr *mr = container_of(cqe, struct rpcrdma_mr, mr_cqe); - struct rpcrdma_frwr *frwr = &mr->frwr; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li_wake(wc, &mr->mr_cid); frwr_mr_done(wc, mr); - complete(&frwr->fr_linv_done); + complete(&mr->mr_linv_done); rpcrdma_flush_disconnect(cq->cq_context, wc); } @@ -544,7 +543,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * are complete. */ last->wr_cqe->done = frwr_wc_localinv_wake; - reinit_completion(&frwr->fr_linv_done); + reinit_completion(&mr->mr_linv_done); /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us @@ -558,7 +557,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * not happen, so don't wait in that case. */ if (bad_wr != first) - wait_for_completion(&frwr->fr_linv_done); + wait_for_completion(&mr->mr_linv_done); if (!rc) return; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index f72b69c3f0ea..1374054b73fd 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -231,7 +231,6 @@ struct rpcrdma_sendctx { */ struct rpcrdma_frwr { struct ib_mr *fr_mr; - struct completion fr_linv_done; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; @@ -247,6 +246,7 @@ struct rpcrdma_mr { int mr_nents; enum dma_data_direction mr_dir; struct ib_cqe mr_cqe; + struct completion mr_linv_done; struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; From dcff9ed209aa6ad8fc575c7fccf6496fef44e869 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:04:15 -0400 Subject: [PATCH 0640/1379] xprtrdma: Move the Work Request union to struct rpcrdma_mr Clean up. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 28 +++++++++------------------- net/sunrpc/xprtrdma/xprt_rdma.h | 8 ++++---- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 7d7a64d2ca4a..bb2b9c607c71 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -333,7 +333,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, key = (u8)(ibmr->rkey & 0x000000FF); ib_update_fast_reg_key(ibmr, ++key); - reg_wr = &mr->frwr.fr_regwr; + reg_wr = &mr->mr_regwr; reg_wr->mr = ibmr; reg_wr->key = ibmr->rkey; reg_wr->access = writing ? @@ -398,19 +398,15 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) num_wrs = 1; post_wr = send_wr; list_for_each_entry(mr, &req->rl_registered, mr_list) { - struct rpcrdma_frwr *frwr; - trace_xprtrdma_mr_fastreg(mr); - frwr = &mr->frwr; mr->mr_cqe.done = frwr_wc_fastreg; - frwr->fr_regwr.wr.next = post_wr; - frwr->fr_regwr.wr.wr_cqe = &mr->mr_cqe; - frwr->fr_regwr.wr.num_sge = 0; - frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; - frwr->fr_regwr.wr.send_flags = 0; - - post_wr = &frwr->fr_regwr.wr; + mr->mr_regwr.wr.next = post_wr; + mr->mr_regwr.wr.wr_cqe = &mr->mr_cqe; + mr->mr_regwr.wr.num_sge = 0; + mr->mr_regwr.wr.opcode = IB_WR_REG_MR; + mr->mr_regwr.wr.send_flags = 0; + post_wr = &mr->mr_regwr.wr; ++num_wrs; } @@ -506,7 +502,6 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct ib_send_wr *first, **prev, *last; struct rpcrdma_ep *ep = r_xprt->rx_ep; const struct ib_send_wr *bad_wr; - struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; int rc; @@ -515,15 +510,13 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - frwr = NULL; prev = &first; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; - frwr = &mr->frwr; - last = &frwr->fr_invwr; + last = &mr->mr_invwr; last->next = NULL; last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; @@ -608,22 +601,19 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *first, *last, **prev; struct rpcrdma_ep *ep = r_xprt->rx_ep; - struct rpcrdma_frwr *frwr; struct rpcrdma_mr *mr; int rc; /* Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. */ - frwr = NULL; prev = &first; while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { trace_xprtrdma_mr_localinv(mr); r_xprt->rx_stats.local_inv_needed++; - frwr = &mr->frwr; - last = &frwr->fr_invwr; + last = &mr->mr_invwr; last->next = NULL; last->wr_cqe = &mr->mr_cqe; last->sg_list = NULL; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 1374054b73fd..99ef83d673d6 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -231,10 +231,6 @@ struct rpcrdma_sendctx { */ struct rpcrdma_frwr { struct ib_mr *fr_mr; - union { - struct ib_reg_wr fr_regwr; - struct ib_send_wr fr_invwr; - }; }; struct rpcrdma_req; @@ -247,6 +243,10 @@ struct rpcrdma_mr { enum dma_data_direction mr_dir; struct ib_cqe mr_cqe; struct completion mr_linv_done; + union { + struct ib_reg_wr mr_regwr; + struct ib_send_wr mr_invwr; + }; struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; From 13bcf7e32a0181095cd62010579869e87aacb332 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Apr 2021 14:04:21 -0400 Subject: [PATCH 0641/1379] xprtrdma: Move fr_mr field to struct rpcrdma_mr Clean up: The last remaining field in struct rpcrdma_frwr has been removed, so the struct can be eliminated. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/rpcrdma.h | 12 ++++++------ net/sunrpc/xprtrdma/frwr_ops.c | 8 ++++---- net/sunrpc/xprtrdma/xprt_rdma.h | 7 ++----- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index ef6166b840e7..bd55908c1bef 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -379,7 +379,7 @@ DECLARE_EVENT_CLASS(xprtrdma_mr_class, __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; @@ -420,7 +420,7 @@ DECLARE_EVENT_CLASS(xprtrdma_anonymous_mr_class, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; @@ -903,7 +903,7 @@ TRACE_EVENT(xprtrdma_frwr_alloc, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->rc = rc; ), @@ -931,7 +931,7 @@ TRACE_EVENT(xprtrdma_frwr_dereg, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->nents = mr->mr_nents; __entry->handle = mr->mr_handle; __entry->length = mr->mr_length; @@ -964,7 +964,7 @@ TRACE_EVENT(xprtrdma_frwr_sgerr, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->nents = sg_nents; @@ -994,7 +994,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr, ), TP_fast_assign( - __entry->mr_id = mr->frwr.fr_mr->res.id; + __entry->mr_id = mr->mr_ibmr->res.id; __entry->addr = mr->mr_sg->dma_address; __entry->dir = mr->mr_dir; __entry->num_mapped = num_mapped; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index bb2b9c607c71..285d73246fc2 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -55,7 +55,7 @@ static void frwr_cid_init(struct rpcrdma_ep *ep, struct rpc_rdma_cid *cid = &mr->mr_cid; cid->ci_queue_id = ep->re_attr.send_cq->res.id; - cid->ci_completion_id = mr->frwr.fr_mr->res.id; + cid->ci_completion_id = mr->mr_ibmr->res.id; } static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) @@ -79,7 +79,7 @@ void frwr_mr_release(struct rpcrdma_mr *mr) frwr_mr_unmap(mr->mr_xprt, mr); - rc = ib_dereg_mr(mr->frwr.fr_mr); + rc = ib_dereg_mr(mr->mr_ibmr); if (rc) trace_xprtrdma_frwr_dereg(mr, rc); kfree(mr->mr_sg); @@ -139,7 +139,7 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) goto out_list_err; mr->mr_xprt = r_xprt; - mr->frwr.fr_mr = frmr; + mr->mr_ibmr = frmr; mr->mr_device = NULL; INIT_LIST_HEAD(&mr->mr_list); init_completion(&mr->mr_linv_done); @@ -323,7 +323,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, goto out_dmamap_err; mr->mr_device = ep->re_id->device; - ibmr = mr->frwr.fr_mr; + ibmr = mr->mr_ibmr; n = ib_map_mr_sg(ibmr, mr->mr_sg, dma_nents, NULL, PAGE_SIZE); if (n != dma_nents) goto out_mapmr_err; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 99ef83d673d6..436ad7312614 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -229,14 +229,12 @@ struct rpcrdma_sendctx { * An external memory region is any buffer or page that is registered * on the fly (ie, not pre-registered). */ -struct rpcrdma_frwr { - struct ib_mr *fr_mr; -}; - struct rpcrdma_req; struct rpcrdma_mr { struct list_head mr_list; struct rpcrdma_req *mr_req; + + struct ib_mr *mr_ibmr; struct ib_device *mr_device; struct scatterlist *mr_sg; int mr_nents; @@ -247,7 +245,6 @@ struct rpcrdma_mr { struct ib_reg_wr mr_regwr; struct ib_send_wr mr_invwr; }; - struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; u32 mr_length; From d9092b4bb2109502eb8972021a3f74febc931a63 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Thu, 22 Apr 2021 03:37:49 -0400 Subject: [PATCH 0642/1379] NFSv4.2: Remove ifdef CONFIG_NFSD from NFSv4.2 client SSC code. The client SSC code should not depend on any of the CONFIG_NFSD config. This patch removes all CONFIG_NFSD from NFSv4.2 client SSC code and simplifies the config of CONFIG_NFS_V4_2_SSC_HELPER, NFSD_V4_2_INTER_SSC. Signed-off-by: Dai Ngo Signed-off-by: Trond Myklebust --- fs/Kconfig | 4 ++-- fs/nfs/nfs4file.c | 4 ---- fs/nfs/super.c | 4 ---- fs/nfsd/Kconfig | 2 +- 4 files changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index a55bda4233bb..afa585e62332 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -334,8 +334,8 @@ config NFS_COMMON default y config NFS_V4_2_SSC_HELPER - tristate - default y if NFS_V4=y || NFS_FS=y + bool + default y if NFS_V4_2 source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 441a2fa073c8..57b3821d975a 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -420,9 +420,7 @@ static const struct nfs4_ssc_client_ops nfs4_ssc_clnt_ops_tbl = { */ void nfs42_ssc_register_ops(void) { -#ifdef CONFIG_NFSD_V4 nfs42_ssc_register(&nfs4_ssc_clnt_ops_tbl); -#endif } /** @@ -433,9 +431,7 @@ void nfs42_ssc_register_ops(void) */ void nfs42_ssc_unregister_ops(void) { -#ifdef CONFIG_NFSD_V4 nfs42_ssc_unregister(&nfs4_ssc_clnt_ops_tbl); -#endif } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 13a650750f04..5bfcc2557187 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -116,16 +116,12 @@ static void unregister_nfs4_fs(void) #ifdef CONFIG_NFS_V4_2 static void nfs_ssc_register_ops(void) { -#ifdef CONFIG_NFSD_V4 nfs_ssc_register(&nfs_ssc_clnt_ops_tbl); -#endif } static void nfs_ssc_unregister_ops(void) { -#ifdef CONFIG_NFSD_V4 nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl); -#endif } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index d6cff5fbe705..c5346196c46f 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -138,7 +138,7 @@ config NFSD_FLEXFILELAYOUT config NFSD_V4_2_INTER_SSC bool "NFSv4.2 inter server to server COPY" - depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 + depends on NFSD_V4 && NFS_V4_2 help This option enables support for NFSv4.2 inter server to server copy where the destination server calls the NFSv4.2 From 183787c6fcc2c793ec96e946a4fdd8cd0e6d7aa0 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Mar 2021 22:26:02 +0800 Subject: [PATCH 0643/1379] riscv: Add 3 SBI wrapper functions to get cpu manufacturer information Add 3 wrapper functions to get vendor id, architecture id and implement id from M-mode Signed-off-by: Vincent Chen Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 3 +++ arch/riscv/kernel/sbi.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 99895d9c3bdd..dd2329962ceb 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -97,6 +97,9 @@ struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, void sbi_console_putchar(int ch); int sbi_console_getchar(void); +long sbi_get_mvendorid(void); +long sbi_get_marchid(void); +long sbi_get_mimpid(void); void sbi_set_timer(uint64_t stime_value); void sbi_shutdown(void); void sbi_clear_ipi(void); diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index c0dcebdd30ec..fa0dd881fc5e 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -547,6 +547,21 @@ static inline long sbi_get_firmware_version(void) return __sbi_base_ecall(SBI_EXT_BASE_GET_IMP_VERSION); } +long sbi_get_mvendorid(void) +{ + return __sbi_base_ecall(SBI_EXT_BASE_GET_MVENDORID); +} + +long sbi_get_marchid(void) +{ + return __sbi_base_ecall(SBI_EXT_BASE_GET_MARCHID); +} + +long sbi_get_mimpid(void) +{ + return __sbi_base_ecall(SBI_EXT_BASE_GET_MIMPID); +} + static void sbi_send_cpumask_ipi(const struct cpumask *target) { struct cpumask hartid_mask; From 6f4eea90465ad0cd5f3d041b9b2c728426f2b8d4 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Mar 2021 22:26:03 +0800 Subject: [PATCH 0644/1379] riscv: Introduce alternative mechanism to apply errata solution Introduce the "alternative" mechanism from ARM64 and x86 to apply the CPU vendors' errata solution at runtime. The main purpose of this patch is to provide a framework. Therefore, the implementation is quite basic for now so that some scenarios could not use this schemei, such as patching code to a module, relocating the patching code and heterogeneous CPU topology. Users could use the macro ALTERNATIVE to apply an errata to the existing code flow. In the macro ALTERNATIVE, users need to specify the manufacturer information(vendorid, archid, and impid) for this errata. Therefore, kernel will know this errata is suitable for which CPU core. During the booting procedure, kernel will select the errata required by the CPU core and then patch it. It means that the kernel only applies the errata to the specified CPU core. In this case, the vendor's errata does not affect each other at runtime. The above patching procedure only occurs during the booting phase, so we only take the overhead of the "alternative" mechanism once. This "alternative" mechanism is enabled by default to ensure that all required errata will be applied. However, users can disable this feature by the Kconfig "CONFIG_RISCV_ERRATA_ALTERNATIVE". Signed-off-by: Vincent Chen Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/Kconfig.erratas | 12 ++ arch/riscv/Makefile | 1 + arch/riscv/errata/Makefile | 1 + arch/riscv/errata/alternative.c | 69 ++++++++++ arch/riscv/include/asm/alternative-macros.h | 142 ++++++++++++++++++++ arch/riscv/include/asm/alternative.h | 36 +++++ arch/riscv/include/asm/asm.h | 1 + arch/riscv/include/asm/csr.h | 3 + arch/riscv/include/asm/errata_list.h | 12 ++ arch/riscv/include/asm/sections.h | 1 + arch/riscv/include/asm/vendorid_list.h | 10 ++ arch/riscv/kernel/smpboot.c | 4 + arch/riscv/kernel/vmlinux.lds.S | 7 + 14 files changed, 300 insertions(+) create mode 100644 arch/riscv/Kconfig.erratas create mode 100644 arch/riscv/errata/Makefile create mode 100644 arch/riscv/errata/alternative.c create mode 100644 arch/riscv/include/asm/alternative-macros.h create mode 100644 arch/riscv/include/asm/alternative.h create mode 100644 arch/riscv/include/asm/errata_list.h create mode 100644 arch/riscv/include/asm/vendorid_list.h diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 8ea60a0a19ae..82c469b340c2 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -207,6 +207,7 @@ config LOCKDEP_SUPPORT def_bool y source "arch/riscv/Kconfig.socs" +source "arch/riscv/Kconfig.erratas" menu "Platform type" diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas new file mode 100644 index 000000000000..4d0bafc536df --- /dev/null +++ b/arch/riscv/Kconfig.erratas @@ -0,0 +1,12 @@ +menu "CPU errata selection" + +config RISCV_ERRATA_ALTERNATIVE + bool "RISC-V alternative scheme" + default y + help + This Kconfig allows the kernel to automatically patch the + errata required by the execution platform at run time. The + code patching is performed once in the boot stages. It means + that the overhead from this mechanism is just taken once. + +endmenu diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 1368d943f1f3..1f5c03082976 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -87,6 +87,7 @@ KBUILD_IMAGE := $(boot)/Image.gz head-y := arch/riscv/kernel/head.o core-y += arch/riscv/ +core-$(CONFIG_RISCV_ERRATA_ALTERNATIVE) += arch/riscv/errata/ libs-y += arch/riscv/lib/ libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/riscv/errata/Makefile b/arch/riscv/errata/Makefile new file mode 100644 index 000000000000..43e6d5424367 --- /dev/null +++ b/arch/riscv/errata/Makefile @@ -0,0 +1 @@ +obj-y += alternative.o diff --git a/arch/riscv/errata/alternative.c b/arch/riscv/errata/alternative.c new file mode 100644 index 000000000000..8efa60ad69b7 --- /dev/null +++ b/arch/riscv/errata/alternative.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * alternative runtime patching + * inspired by the ARM64 and x86 version + * + * Copyright (C) 2021 Sifive. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct cpu_manufacturer_info_t { + unsigned long vendor_id; + unsigned long arch_id; + unsigned long imp_id; +} cpu_mfr_info; + +static void (*vendor_patch_func)(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid); + +static inline void __init riscv_fill_cpu_mfr_info(void) +{ +#ifdef CONFIG_RISCV_M_MODE + cpu_mfr_info.vendor_id = csr_read(CSR_MVENDORID); + cpu_mfr_info.arch_id = csr_read(CSR_MARCHID); + cpu_mfr_info.imp_id = csr_read(CSR_MIMPID); +#else + cpu_mfr_info.vendor_id = sbi_get_mvendorid(); + cpu_mfr_info.arch_id = sbi_get_marchid(); + cpu_mfr_info.imp_id = sbi_get_mimpid(); +#endif +} + +static void __init init_alternative(void) +{ + riscv_fill_cpu_mfr_info(); + + switch (cpu_mfr_info.vendor_id) { + default: + vendor_patch_func = NULL; + } +} + +/* + * This is called very early in the boot process (directly after we run + * a feature detect on the boot CPU). No need to worry about other CPUs + * here. + */ +void __init apply_boot_alternatives(void) +{ + /* If called on non-boot cpu things could go wrong */ + WARN_ON(smp_processor_id() != 0); + + init_alternative(); + + if (!vendor_patch_func) + return; + + vendor_patch_func((struct alt_entry *)__alt_start, + (struct alt_entry *)__alt_end, + cpu_mfr_info.arch_id, cpu_mfr_info.imp_id); +} + diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h new file mode 100644 index 000000000000..88c08705f64a --- /dev/null +++ b/arch/riscv/include/asm/alternative-macros.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ALTERNATIVE_MACROS_H +#define __ASM_ALTERNATIVE_MACROS_H + +#ifdef CONFIG_RISCV_ERRATA_ALTERNATIVE + +#ifdef __ASSEMBLY__ + +.macro ALT_ENTRY oldptr newptr vendor_id errata_id new_len + RISCV_PTR \oldptr + RISCV_PTR \newptr + REG_ASM \vendor_id + REG_ASM \new_len + .word \errata_id +.endm + +.macro ALT_NEW_CONTENT vendor_id, errata_id, enable = 1, new_c : vararg + .if \enable + .pushsection .alternative, "a" + ALT_ENTRY 886b, 888f, \vendor_id, \errata_id, 889f - 888f + .popsection + .subsection 1 +888 : + \new_c +889 : + .previous + .org . - (889b - 888b) + (887b - 886b) + .org . - (887b - 886b) + (889b - 888b) + .endif +.endm + +.macro __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, enable +886 : + \old_c +887 : + ALT_NEW_CONTENT \vendor_id, \errata_id, \enable, \new_c +.endm + +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ + __ALTERNATIVE_CFG old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k) + +#else /* !__ASSEMBLY__ */ + +#include +#include + +#define ALT_ENTRY(oldptr, newptr, vendor_id, errata_id, newlen) \ + RISCV_PTR " " oldptr "\n" \ + RISCV_PTR " " newptr "\n" \ + REG_ASM " " vendor_id "\n" \ + REG_ASM " " newlen "\n" \ + ".word " errata_id "\n" + +#define ALT_NEW_CONSTENT(vendor_id, errata_id, enable, new_c) \ + ".if " __stringify(enable) " == 1\n" \ + ".pushsection .alternative, \"a\"\n" \ + ALT_ENTRY("886b", "888f", __stringify(vendor_id), __stringify(errata_id), "889f - 888f") \ + ".popsection\n" \ + ".subsection 1\n" \ + "888 :\n" \ + new_c "\n" \ + "889 :\n" \ + ".previous\n" \ + ".org . - (887b - 886b) + (889b - 888b)\n" \ + ".org . - (889b - 888b) + (887b - 886b)\n" \ + ".endif\n" + +#define __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, enable) \ + "886 :\n" \ + old_c "\n" \ + "887 :\n" \ + ALT_NEW_CONSTENT(vendor_id, errata_id, enable, new_c) + +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ + __ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, IS_ENABLED(CONFIG_k)) + +#endif /* __ASSEMBLY__ */ + +#else /* !CONFIG_RISCV_ERRATA_ALTERNATIVE*/ +#ifdef __ASSEMBLY__ + +.macro __ALTERNATIVE_CFG old_c + \old_c +.endm + +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ + __ALTERNATIVE_CFG old_c + +#else /* !__ASSEMBLY__ */ + +#define __ALTERNATIVE_CFG(old_c) \ + old_c "\n" + +#define _ALTERNATIVE_CFG(old_c, new_c, vendor_id, errata_id, CONFIG_k) \ + __ALTERNATIVE_CFG(old_c) + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_RISCV_ERRATA_ALTERNATIVE */ +/* + * Usage: + * ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k) + * in the assembly code. Otherwise, + * asm(ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k)); + * + * old_content: The old content which is probably replaced with new content. + * new_content: The new content. + * vendor_id: The CPU vendor ID. + * errata_id: The errata ID. + * CONFIG_k: The Kconfig of this errata. When Kconfig is disabled, the old + * content will alwyas be executed. + */ +#define ALTERNATIVE(old_content, new_content, vendor_id, errata_id, CONFIG_k) \ + _ALTERNATIVE_CFG(old_content, new_content, vendor_id, errata_id, CONFIG_k) + +/* + * A vendor wants to replace an old_content, but another vendor has used + * ALTERNATIVE() to patch its customized content at the same location. In + * this case, this vendor can create a new macro ALTERNATIVE_2() based + * on the following sample code and then replace ALTERNATIVE() with + * ALTERNATIVE_2() to append its customized content. + * + * .macro __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, enable_1, \ + * new_c_2, vendor_id_2, errata_id_2, enable_2 + * 886 : + * \old_c + * 887 : + * ALT_NEW_CONTENT \vendor_id_1, \errata_id_1, \enable_1, \new_c_1 + * ALT_NEW_CONTENT \vendor_id_2, \errata_id_2, \enable_2, \new_c_2 + * .endm + * + * #define _ALTERNATIVE_CFG_2(old_c, new_c_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ + * new_c_2, vendor_id_2, errata_id_2, CONFIG_k_2) \ + * __ALTERNATIVE_CFG_2 old_c, new_c_1, vendor_id_1, errata_id_1, IS_ENABLED(CONFIG_k_1), \ + * new_c_2, vendor_id_2, errata_id_2, IS_ENABLED(CONFIG_k_2) \ + * + * #define ALTERNATIVE_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ + * new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) \ + * _ALTERNATIVE_CFG_2(old_content, new_content_1, vendor_id_1, errata_id_1, CONFIG_k_1, \ + * new_content_2, vendor_id_2, errata_id_2, CONFIG_k_2) + * + */ +#endif diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h new file mode 100644 index 000000000000..430bc4fea133 --- /dev/null +++ b/arch/riscv/include/asm/alternative.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Sifive. + */ + +#ifndef __ASM_ALTERNATIVE_H +#define __ASM_ALTERNATIVE_H + +#define ERRATA_STRING_LENGTH_MAX 32 + +#include + +#ifndef __ASSEMBLY__ + +#include +#include +#include +#include + +void __init apply_boot_alternatives(void); + +struct alt_entry { + void *old_ptr; /* address of original instruciton or data */ + void *alt_ptr; /* address of replacement instruction or data */ + unsigned long vendor_id; /* cpu vendor id */ + unsigned long alt_len; /* The replacement size */ + unsigned int errata_id; /* The errata id */ +} __packed; + +struct errata_checkfunc_id { + unsigned long vendor_id; + bool (*func)(struct alt_entry *alt); +}; + +#endif +#endif diff --git a/arch/riscv/include/asm/asm.h b/arch/riscv/include/asm/asm.h index 9c992a88d858..618d7c5af1a2 100644 --- a/arch/riscv/include/asm/asm.h +++ b/arch/riscv/include/asm/asm.h @@ -23,6 +23,7 @@ #define REG_L __REG_SEL(ld, lw) #define REG_S __REG_SEL(sd, sw) #define REG_SC __REG_SEL(sc.d, sc.w) +#define REG_ASM __REG_SEL(.dword, .word) #define SZREG __REG_SEL(8, 4) #define LGREG __REG_SEL(3, 2) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index caadfc1d7487..87ac65696871 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -115,6 +115,9 @@ #define CSR_MIP 0x344 #define CSR_PMPCFG0 0x3a0 #define CSR_PMPADDR0 0x3b0 +#define CSR_MVENDORID 0xf11 +#define CSR_MARCHID 0xf12 +#define CSR_MIMPID 0xf13 #define CSR_MHARTID 0xf14 #ifdef CONFIG_RISCV_M_MODE diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h new file mode 100644 index 000000000000..1b56131431c9 --- /dev/null +++ b/arch/riscv/include/asm/errata_list.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Sifive. + */ +#ifndef ASM_ERRATA_LIST_H +#define ASM_ERRATA_LIST_H + +#ifdef CONFIG_ERRATA_SIFIVE +#define ERRATA_SIFIVE_NUMBER 0 +#endif + +#endif diff --git a/arch/riscv/include/asm/sections.h b/arch/riscv/include/asm/sections.h index 1595c5b60cfd..8a303fb1ee3b 100644 --- a/arch/riscv/include/asm/sections.h +++ b/arch/riscv/include/asm/sections.h @@ -11,5 +11,6 @@ extern char _start[]; extern char _start_kernel[]; extern char __init_data_begin[], __init_data_end[]; extern char __init_text_begin[], __init_text_end[]; +extern char __alt_start[], __alt_end[]; #endif /* __ASM_SECTIONS_H */ diff --git a/arch/riscv/include/asm/vendorid_list.h b/arch/riscv/include/asm/vendorid_list.h new file mode 100644 index 000000000000..9d934215b3c8 --- /dev/null +++ b/arch/riscv/include/asm/vendorid_list.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 SiFive + */ +#ifndef ASM_VENDOR_LIST_H +#define ASM_VENDOR_LIST_H + +#define SIFIVE_VENDOR_ID 0x489 + +#endif diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index 5e276c25646f..9a408e2942ac 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "head.h" @@ -40,6 +41,9 @@ static DECLARE_COMPLETION(cpu_running); void __init smp_prepare_boot_cpu(void) { init_cpu_topology(); +#ifdef CONFIG_RISCV_ERRATA_ALTERNATIVE + apply_boot_alternatives(); +#endif } void __init smp_prepare_cpus(unsigned int max_cpus) diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index de03cb22d0e9..7e61bc1dc36e 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -90,6 +90,13 @@ SECTIONS } __init_data_end = .; + + . = ALIGN(8); + .alternative : { + __alt_start = .; + *(.alternative) + __alt_end = .; + } __init_end = .; /* Start of data section */ From 1a0e5dbd3723e1194cc549def69fe7b557d4c72b Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Mar 2021 22:26:04 +0800 Subject: [PATCH 0645/1379] riscv: sifive: Add SiFive alternative ports Add required ports of the Alternative scheme for SiFive. Signed-off-by: Vincent Chen Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.erratas | 10 ++++ arch/riscv/Kconfig.socs | 1 + arch/riscv/errata/Makefile | 1 + arch/riscv/errata/alternative.c | 5 ++ arch/riscv/errata/sifive/Makefile | 1 + arch/riscv/errata/sifive/errata.c | 68 ++++++++++++++++++++++++++++ arch/riscv/include/asm/alternative.h | 3 ++ 7 files changed, 89 insertions(+) create mode 100644 arch/riscv/errata/sifive/Makefile create mode 100644 arch/riscv/errata/sifive/errata.c diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas index 4d0bafc536df..302e7467f302 100644 --- a/arch/riscv/Kconfig.erratas +++ b/arch/riscv/Kconfig.erratas @@ -9,4 +9,14 @@ config RISCV_ERRATA_ALTERNATIVE code patching is performed once in the boot stages. It means that the overhead from this mechanism is just taken once. +config ERRATA_SIFIVE + bool "SiFive errata" + depends on RISCV_ERRATA_ALTERNATIVE + help + All SiFive errata Kconfig depend on this Kconfig. Disabling + this Kconfig will disable all SiFive errata. Please say "Y" + here if your platform uses SiFive CPU cores. + + Otherwise, please say "N" here to avoid unnecessary overhead. + endmenu diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index 7efcece8896c..b9eda857fc87 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -7,6 +7,7 @@ config SOC_SIFIVE select CLK_SIFIVE select CLK_SIFIVE_PRCI select SIFIVE_PLIC + select ERRATA_SIFIVE help This enables support for SiFive SoC platform hardware. diff --git a/arch/riscv/errata/Makefile b/arch/riscv/errata/Makefile index 43e6d5424367..b8f8740a3e44 100644 --- a/arch/riscv/errata/Makefile +++ b/arch/riscv/errata/Makefile @@ -1 +1,2 @@ obj-y += alternative.o +obj-$(CONFIG_ERRATA_SIFIVE) += sifive/ diff --git a/arch/riscv/errata/alternative.c b/arch/riscv/errata/alternative.c index 8efa60ad69b7..3b15885db70b 100644 --- a/arch/riscv/errata/alternative.c +++ b/arch/riscv/errata/alternative.c @@ -42,6 +42,11 @@ static void __init init_alternative(void) riscv_fill_cpu_mfr_info(); switch (cpu_mfr_info.vendor_id) { +#ifdef CONFIG_ERRATA_SIFIVE + case SIFIVE_VENDOR_ID: + vendor_patch_func = sifive_errata_patch_func; + break; +#endif default: vendor_patch_func = NULL; } diff --git a/arch/riscv/errata/sifive/Makefile b/arch/riscv/errata/sifive/Makefile new file mode 100644 index 000000000000..2d644e19caef --- /dev/null +++ b/arch/riscv/errata/sifive/Makefile @@ -0,0 +1 @@ +obj-y += errata.o diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c new file mode 100644 index 000000000000..826cd391fc55 --- /dev/null +++ b/arch/riscv/errata/sifive/errata.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021 Sifive. + */ + +#include +#include +#include +#include +#include +#include +#include + +struct errata_info_t { + char name[ERRATA_STRING_LENGTH_MAX]; + bool (*check_func)(unsigned long arch_id, unsigned long impid); +}; + +static u32 __init sifive_errata_probe(unsigned long archid, unsigned long impid) +{ + int idx; + u32 cpu_req_errata = 0; + + for (idx = 0; idx < ERRATA_SIFIVE_NUMBER; idx++) + if (errata_list[idx].check_func(archid, impid)) + cpu_req_errata |= (1U << idx); + + return cpu_req_errata; +} + +static void __init warn_miss_errata(u32 miss_errata) +{ + int i; + + pr_warn("----------------------------------------------------------------\n"); + pr_warn("WARNING: Missing the following errata may cause potential issues\n"); + for (i = 0; i < ERRATA_SIFIVE_NUMBER; i++) + if (miss_errata & 0x1 << i) + pr_warn("\tSiFive Errata[%d]:%s\n", i, errata_list[i].name); + pr_warn("Please enable the corresponding Kconfig to apply them\n"); + pr_warn("----------------------------------------------------------------\n"); +} + +void __init sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid) +{ + struct alt_entry *alt; + u32 cpu_req_errata = sifive_errata_probe(archid, impid); + u32 cpu_apply_errata = 0; + u32 tmp; + + for (alt = begin; alt < end; alt++) { + if (alt->vendor_id != SIFIVE_VENDOR_ID) + continue; + if (alt->errata_id >= ERRATA_SIFIVE_NUMBER) { + WARN(1, "This errata id:%d is not in kernel errata list", alt->errata_id); + continue; + } + + tmp = (1U << alt->errata_id); + if (cpu_req_errata & tmp) { + patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len); + cpu_apply_errata |= tmp; + } + } + if (cpu_apply_errata != cpu_req_errata) + warn_miss_errata(cpu_req_errata - cpu_apply_errata); +} diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h index 430bc4fea133..e625d3cafbed 100644 --- a/arch/riscv/include/asm/alternative.h +++ b/arch/riscv/include/asm/alternative.h @@ -32,5 +32,8 @@ struct errata_checkfunc_id { bool (*func)(struct alt_entry *alt); }; +void sifive_errata_patch_func(struct alt_entry *begin, struct alt_entry *end, + unsigned long archid, unsigned long impid); + #endif #endif From 800149a77c2cb8746a94457939b1ba1e37d2c14e Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Mar 2021 22:26:05 +0800 Subject: [PATCH 0646/1379] riscv: sifive: Apply errata "cip-453" patch Add sign extension to the $badaddr before addressing the instruction page fault and instruction access fault to workaround the issue "cip-453". To avoid affecting the existing code sequence, this patch will creates two trampolines to add sign extension to the $badaddr. By the "alternative" mechanism, these two trampolines will replace the original exception handler of instruction page fault and instruction access fault in the excp_vect_table. In this case, only the specific SiFive CPU core jumps to the do_page_fault and do_trap_insn_fault through these two trampolines. Other CPUs are not affected. Signed-off-by: Vincent Chen Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.erratas | 11 +++++++ arch/riscv/errata/sifive/Makefile | 1 + arch/riscv/errata/sifive/errata.c | 20 ++++++++++++ arch/riscv/errata/sifive/errata_cip_453.S | 38 +++++++++++++++++++++++ arch/riscv/include/asm/errata_list.h | 21 ++++++++++++- arch/riscv/kernel/entry.S | 6 ++-- 6 files changed, 94 insertions(+), 3 deletions(-) create mode 100644 arch/riscv/errata/sifive/errata_cip_453.S diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas index 302e7467f302..b4146dca50fc 100644 --- a/arch/riscv/Kconfig.erratas +++ b/arch/riscv/Kconfig.erratas @@ -19,4 +19,15 @@ config ERRATA_SIFIVE Otherwise, please say "N" here to avoid unnecessary overhead. +config ERRATA_SIFIVE_CIP_453 + bool "Apply SiFive errata CIP-453" + depends on ERRATA_SIFIVE + default y + help + This will apply the SiFive CIP-453 errata to add sign extension + to the $badaddr when exception type is instruction page fault + and instruction access fault. + + If you don't know what to do here, say "Y". + endmenu diff --git a/arch/riscv/errata/sifive/Makefile b/arch/riscv/errata/sifive/Makefile index 2d644e19caef..bdd5fc843b8e 100644 --- a/arch/riscv/errata/sifive/Makefile +++ b/arch/riscv/errata/sifive/Makefile @@ -1 +1,2 @@ +obj-y += errata_cip_453.o obj-y += errata.o diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c index 826cd391fc55..e27391823f0f 100644 --- a/arch/riscv/errata/sifive/errata.c +++ b/arch/riscv/errata/sifive/errata.c @@ -16,6 +16,26 @@ struct errata_info_t { bool (*check_func)(unsigned long arch_id, unsigned long impid); }; +static bool errata_cip_453_check_func(unsigned long arch_id, unsigned long impid) +{ + /* + * Affected cores: + * Architecture ID: 0x8000000000000007 + * Implement ID: 0x20181004 <= impid <= 0x20191105 + */ + if (arch_id != 0x8000000000000007 || + (impid < 0x20181004 || impid > 0x20191105)) + return false; + return true; +} + +static struct errata_info_t errata_list[ERRATA_SIFIVE_NUMBER] = { + { + .name = "cip-453", + .check_func = errata_cip_453_check_func + }, +}; + static u32 __init sifive_errata_probe(unsigned long archid, unsigned long impid) { int idx; diff --git a/arch/riscv/errata/sifive/errata_cip_453.S b/arch/riscv/errata/sifive/errata_cip_453.S new file mode 100644 index 000000000000..f1b9623fe1de --- /dev/null +++ b/arch/riscv/errata/sifive/errata_cip_453.S @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 SiFive + */ + +#include +#include +#include +#include + +.macro ADD_SIGN_EXT pt_reg badaddr tmp_reg + REG_L \badaddr, PT_BADADDR(\pt_reg) + li \tmp_reg,1 + slli \tmp_reg,\tmp_reg,0x26 + and \tmp_reg,\tmp_reg,\badaddr + beqz \tmp_reg, 1f + li \tmp_reg,-1 + slli \tmp_reg,\tmp_reg,0x27 + or \badaddr,\tmp_reg,\badaddr + REG_S \badaddr, PT_BADADDR(\pt_reg) +1: +.endm + +ENTRY(sifive_cip_453_page_fault_trp) + ADD_SIGN_EXT a0, t0, t1 +#ifdef CONFIG_MMU + la t0, do_page_fault +#else + la t0, do_trap_unknown +#endif + jr t0 +END(sifive_cip_453_page_fault_trp) + +ENTRY(sifive_cip_453_insn_fault_trp) + ADD_SIGN_EXT a0, t0, t1 + la t0, do_trap_insn_fault + jr t0 +END(sifive_cip_453_insn_fault_trp) diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index 1b56131431c9..6148d34d4245 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -5,8 +5,27 @@ #ifndef ASM_ERRATA_LIST_H #define ASM_ERRATA_LIST_H +#include +#include + #ifdef CONFIG_ERRATA_SIFIVE -#define ERRATA_SIFIVE_NUMBER 0 +#define ERRATA_SIFIVE_CIP_453 0 +#define ERRATA_SIFIVE_NUMBER 1 #endif +#ifdef __ASSEMBLY__ + +#define ALT_INSN_FAULT(x) \ +ALTERNATIVE(__stringify(RISCV_PTR do_trap_insn_fault), \ + __stringify(RISCV_PTR sifive_cip_453_insn_fault_trp), \ + SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453, \ + CONFIG_ERRATA_SIFIVE_CIP_453) + +#define ALT_PAGE_FAULT(x) \ +ALTERNATIVE(__stringify(RISCV_PTR do_page_fault), \ + __stringify(RISCV_PTR sifive_cip_453_page_fault_trp), \ + SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453, \ + CONFIG_ERRATA_SIFIVE_CIP_453) +#endif /* __ASSEMBLY__ */ + #endif diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 744f3209c48d..60d0a2f1cd88 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -12,6 +12,7 @@ #include #include #include +#include #if !IS_ENABLED(CONFIG_PREEMPTION) .set resume_kernel, restore_all @@ -450,7 +451,7 @@ ENDPROC(__switch_to) /* Exception vector table */ ENTRY(excp_vect_table) RISCV_PTR do_trap_insn_misaligned - RISCV_PTR do_trap_insn_fault + ALT_INSN_FAULT(RISCV_PTR do_trap_insn_fault) RISCV_PTR do_trap_insn_illegal RISCV_PTR do_trap_break RISCV_PTR do_trap_load_misaligned @@ -461,7 +462,8 @@ ENTRY(excp_vect_table) RISCV_PTR do_trap_ecall_s RISCV_PTR do_trap_unknown RISCV_PTR do_trap_ecall_m - RISCV_PTR do_page_fault /* instruction page fault */ + /* instruciton page fault */ + ALT_PAGE_FAULT(RISCV_PTR do_page_fault) RISCV_PTR do_page_fault /* load page fault */ RISCV_PTR do_trap_unknown RISCV_PTR do_page_fault /* store page fault */ From bff3ff525460b492dca1d1665e821d2b5816ebdb Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Mon, 22 Mar 2021 22:26:06 +0800 Subject: [PATCH 0647/1379] riscv: sifive: Apply errata "cip-1200" patch For certain SiFive CPUs, "sfence.vma addr" cannot exactly flush addr from TLB in the particular cases. The details could be found here: https://sifive.cdn.prismic.io/sifive/167a1a56-03f4-4615-a79e-b2a86153148f_FU740_errata_20210205.pdf In order to ensure the functionality, this patch uses the Alternative scheme to replace all "sfence.vma addr" with "sfence.vma" at runtime. Signed-off-by: Vincent Chen Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.erratas | 11 +++++++++++ arch/riscv/errata/sifive/errata.c | 18 ++++++++++++++++++ arch/riscv/include/asm/errata_list.h | 10 +++++++++- arch/riscv/include/asm/tlbflush.h | 3 ++- 4 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas index b4146dca50fc..d5d03ae8d685 100644 --- a/arch/riscv/Kconfig.erratas +++ b/arch/riscv/Kconfig.erratas @@ -30,4 +30,15 @@ config ERRATA_SIFIVE_CIP_453 If you don't know what to do here, say "Y". +config ERRATA_SIFIVE_CIP_1200 + bool "Apply SiFive errata CIP-1200" + depends on ERRATA_SIFIVE + default y + help + This will apply the SiFive CIP-1200 errata to repalce all + "sfence.vma addr" with "sfence.vma" to ensure that the addr + has been flushed from TLB. + + If you don't know what to do here, say "Y". + endmenu diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c index e27391823f0f..f5e5ae70e829 100644 --- a/arch/riscv/errata/sifive/errata.c +++ b/arch/riscv/errata/sifive/errata.c @@ -29,11 +29,29 @@ static bool errata_cip_453_check_func(unsigned long arch_id, unsigned long impi return true; } +static bool errata_cip_1200_check_func(unsigned long arch_id, unsigned long impid) +{ + /* + * Affected cores: + * Architecture ID: 0x8000000000000007 or 0x1 + * Implement ID: mimpid[23:0] <= 0x200630 and mimpid != 0x01200626 + */ + if (arch_id != 0x8000000000000007 && arch_id != 0x1) + return false; + if ((impid & 0xffffff) > 0x200630 || impid == 0x1200626) + return false; + return true; +} + static struct errata_info_t errata_list[ERRATA_SIFIVE_NUMBER] = { { .name = "cip-453", .check_func = errata_cip_453_check_func }, + { + .name = "cip-1200", + .check_func = errata_cip_1200_check_func + }, }; static u32 __init sifive_errata_probe(unsigned long archid, unsigned long impid) diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h index 6148d34d4245..5f1046e82d9f 100644 --- a/arch/riscv/include/asm/errata_list.h +++ b/arch/riscv/include/asm/errata_list.h @@ -10,7 +10,8 @@ #ifdef CONFIG_ERRATA_SIFIVE #define ERRATA_SIFIVE_CIP_453 0 -#define ERRATA_SIFIVE_NUMBER 1 +#define ERRATA_SIFIVE_CIP_1200 1 +#define ERRATA_SIFIVE_NUMBER 2 #endif #ifdef __ASSEMBLY__ @@ -26,6 +27,13 @@ ALTERNATIVE(__stringify(RISCV_PTR do_page_fault), \ __stringify(RISCV_PTR sifive_cip_453_page_fault_trp), \ SIFIVE_VENDOR_ID, ERRATA_SIFIVE_CIP_453, \ CONFIG_ERRATA_SIFIVE_CIP_453) +#else /* !__ASSEMBLY__ */ + +#define ALT_FLUSH_TLB_PAGE(x) \ +asm(ALTERNATIVE("sfence.vma %0", "sfence.vma", SIFIVE_VENDOR_ID, \ + ERRATA_SIFIVE_CIP_1200, CONFIG_ERRATA_SIFIVE_CIP_1200) \ + : : "r" (addr) : "memory") + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 394cfbccdcd9..c84218ad7afc 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -9,6 +9,7 @@ #include #include +#include #ifdef CONFIG_MMU static inline void local_flush_tlb_all(void) @@ -19,7 +20,7 @@ static inline void local_flush_tlb_all(void) /* Flush one page from local TLB */ static inline void local_flush_tlb_page(unsigned long addr) { - __asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory"); + ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory")); } #else /* CONFIG_MMU */ #define local_flush_tlb_all() do { } while (0) From 7f3d349065d0c643f7f7013fbf9bc9f2c90b675f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Mar 2021 14:51:56 -0700 Subject: [PATCH 0648/1379] riscv: Use $(LD) instead of $(CC) to link vDSO Currently, the VDSO is being linked through $(CC). This does not match how the rest of the kernel links objects, which is through the $(LD) variable. When linking with clang, there are a couple of warnings about flags that will not be used during the link: clang-12: warning: argument unused during compilation: '-no-pie' [-Wunused-command-line-argument] clang-12: warning: argument unused during compilation: '-pg' [-Wunused-command-line-argument] '-no-pie' was added in commit 85602bea297f ("RISC-V: build vdso-dummy.o with -no-pie") to override '-pie' getting added to the ld command from distribution versions of GCC that enable PIE by default. It is technically no longer needed after commit c2c81bb2f691 ("RISC-V: Fix the VDSO symbol generaton for binutils-2.35+"), which removed vdso-dummy.o in favor of generating vdso-syms.S from vdso.so with $(NM) but this also resolves the issue in case it ever comes back due to having full control over the $(LD) command. '-pg' is for function tracing, it is not used during linking as clang states. These flags could be removed/filtered to fix the warnings but it is easier to just match the rest of the kernel and use $(LD) directly for linking. See commits fe00e50b2db8 ("ARM: 8858/1: vdso: use $(LD) instead of $(CC) to link VDSO") 691efbedc60d ("arm64: vdso: use $(LD) instead of $(CC) to link VDSO") 2ff906994b6c ("MIPS: VDSO: Use $(LD) instead of $(CC) to link VDSO") 2b2a25845d53 ("s390/vdso: Use $(LD) instead of $(CC) to link vDSO") for more information. The flags are converted to linker flags and '--eh-frame-hdr' is added to match what is added by GCC implicitly, which can be seen by adding '-v' to GCC's invocation. Additionally, since this area is being modified, use the $(OBJCOPY) variable instead of an open coded $(CROSS_COMPILE)objcopy so that the user's choice of objcopy binary is respected. Link: https://github.com/ClangBuiltLinux/linux/issues/803 Link: https://github.com/ClangBuiltLinux/linux/issues/970 Signed-off-by: Nathan Chancellor Reviewed-by: Fangrui Song Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/Makefile | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 71a315e73cbe..ca2b40dfd24b 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -41,11 +41,10 @@ KASAN_SANITIZE := n $(obj)/vdso.o: $(obj)/vdso.so # link rule for the .so file, .lds has to be first -SYSCFLAGS_vdso.so.dbg = $(c_flags) $(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold) -SYSCFLAGS_vdso.so.dbg = -shared -s -Wl,-soname=linux-vdso.so.1 \ - -Wl,--build-id=sha1 -Wl,--hash-style=both +LDFLAGS_vdso.so.dbg = -shared -s -soname=linux-vdso.so.1 \ + --build-id=sha1 --hash-style=both --eh-frame-hdr # We also create a special relocatable object that should mirror the symbol # table and layout of the linked DSO. With ld --just-symbols we can then @@ -60,13 +59,10 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE # actual build commands # The DSO images are built using a special linker script -# Add -lgcc so rv32 gets static muldi3 and lshrdi3 definitions. # Make sure only to export the intended __vdso_xxx symbol offsets. quiet_cmd_vdsold = VDSOLD $@ - cmd_vdsold = $(CC) $(KBUILD_CFLAGS) $(call cc-option, -no-pie) -nostdlib -nostartfiles $(SYSCFLAGS_$(@F)) \ - -Wl,-T,$(filter-out FORCE,$^) -o $@.tmp && \ - $(CROSS_COMPILE)objcopy \ - $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ + cmd_vdsold = $(LD) $(ld_flags) -T $(filter-out FORCE,$^) -o $@.tmp && \ + $(OBJCOPY) $(patsubst %, -G __vdso_%, $(vdso-syms)) $@.tmp $@ && \ rm $@.tmp # Extracts symbol offsets from the VDSO, converting them into an assembly file From 2f095504f4b9cf75856d6a9cf90299cf75aa46c5 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Mar 2021 15:38:05 -0700 Subject: [PATCH 0649/1379] scripts/recordmcount.pl: Fix RISC-V regex for clang Clang can generate R_RISCV_CALL_PLT relocations to _mcount: $ llvm-objdump -dr build/riscv/init/main.o | rg mcount 000000000000000e: R_RISCV_CALL_PLT _mcount 000000000000004e: R_RISCV_CALL_PLT _mcount After this, the __start_mcount_loc section is properly generated and function tracing still works. Link: https://github.com/ClangBuiltLinux/linux/issues/1331 Signed-off-by: Nathan Chancellor Reviewed-by: Fangrui Song Signed-off-by: Palmer Dabbelt --- scripts/recordmcount.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 867860ea57da..a36df04cfa09 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -392,7 +392,7 @@ if ($arch eq "x86_64") { $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; } elsif ($arch eq "riscv") { $function_regex = "^([0-9a-fA-F]+)\\s+<([^.0-9][0-9a-zA-Z_\\.]+)>:"; - $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL\\s_mcount\$"; + $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_mcount\$"; $type = ".quad"; $alignment = 2; } elsif ($arch eq "nds32") { From 7ce04771503074a7de7f539cc43f5e1b385cb99b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Mar 2021 15:38:06 -0700 Subject: [PATCH 0650/1379] riscv: Workaround mcount name prior to clang-13 Prior to clang 13.0.0, the RISC-V name for the mcount symbol was "mcount", which differs from the GCC version of "_mcount", which results in the following errors: riscv64-linux-gnu-ld: init/main.o: in function `__traceiter_initcall_level': main.c:(.text+0xe): undefined reference to `mcount' riscv64-linux-gnu-ld: init/main.o: in function `__traceiter_initcall_start': main.c:(.text+0x4e): undefined reference to `mcount' riscv64-linux-gnu-ld: init/main.o: in function `__traceiter_initcall_finish': main.c:(.text+0x92): undefined reference to `mcount' riscv64-linux-gnu-ld: init/main.o: in function `.LBB32_28': main.c:(.text+0x30c): undefined reference to `mcount' riscv64-linux-gnu-ld: init/main.o: in function `free_initmem': main.c:(.text+0x54c): undefined reference to `mcount' This has been corrected in https://reviews.llvm.org/D98881 but the minimum supported clang version is 10.0.1. To avoid build errors and to gain a working function tracer, adjust the name of the mcount symbol for older versions of clang in mount.S and recordmcount.pl. Link: https://github.com/ClangBuiltLinux/linux/issues/1331 Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/ftrace.h | 14 ++++++++++++-- arch/riscv/kernel/mcount.S | 10 +++++----- scripts/recordmcount.pl | 2 +- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 845002cc2e57..04dad3380041 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -13,9 +13,19 @@ #endif #define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +/* + * Clang prior to 13 had "mcount" instead of "_mcount": + * https://reviews.llvm.org/D98881 + */ +#if defined(CONFIG_CC_IS_GCC) || CONFIG_CLANG_VERSION >= 130000 +#define MCOUNT_NAME _mcount +#else +#define MCOUNT_NAME mcount +#endif + #define ARCH_SUPPORTS_FTRACE_OPS 1 #ifndef __ASSEMBLY__ -void _mcount(void); +void MCOUNT_NAME(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { return addr; @@ -36,7 +46,7 @@ struct dyn_arch_ftrace { * both auipc and jalr at the same time. */ -#define MCOUNT_ADDR ((unsigned long)_mcount) +#define MCOUNT_ADDR ((unsigned long)MCOUNT_NAME) #define JALR_SIGN_MASK (0x00000800) #define JALR_OFFSET_MASK (0x00000fff) #define AUIPC_OFFSET_MASK (0xfffff000) diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 8a5593ff9ff3..6d462681c9c0 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -47,8 +47,8 @@ ENTRY(ftrace_stub) #ifdef CONFIG_DYNAMIC_FTRACE - .global _mcount - .set _mcount, ftrace_stub + .global MCOUNT_NAME + .set MCOUNT_NAME, ftrace_stub #endif ret ENDPROC(ftrace_stub) @@ -78,7 +78,7 @@ ENDPROC(return_to_handler) #endif #ifndef CONFIG_DYNAMIC_FTRACE -ENTRY(_mcount) +ENTRY(MCOUNT_NAME) la t4, ftrace_stub #ifdef CONFIG_FUNCTION_GRAPH_TRACER la t0, ftrace_graph_return @@ -124,6 +124,6 @@ do_trace: jalr t5 RESTORE_ABI_STATE ret -ENDPROC(_mcount) +ENDPROC(MCOUNT_NAME) #endif -EXPORT_SYMBOL(_mcount) +EXPORT_SYMBOL(MCOUNT_NAME) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index a36df04cfa09..7b83a1aaec98 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -392,7 +392,7 @@ if ($arch eq "x86_64") { $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s_mcount\$"; } elsif ($arch eq "riscv") { $function_regex = "^([0-9a-fA-F]+)\\s+<([^.0-9][0-9a-zA-Z_\\.]+)>:"; - $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_mcount\$"; + $mcount_regex = "^\\s*([0-9a-fA-F]+):\\sR_RISCV_CALL(_PLT)?\\s_?mcount\$"; $type = ".quad"; $alignment = 2; } elsif ($arch eq "nds32") { From adebc8817b5c975d598ac379bbdf67a7a5186ade Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 25 Mar 2021 15:38:07 -0700 Subject: [PATCH 0651/1379] riscv: Select HAVE_DYNAMIC_FTRACE when -fpatchable-function-entry is available clang prior to 13.0.0 does not support -fpatchable-function-entry for RISC-V. clang: error: unsupported option '-fpatchable-function-entry=8' for target 'riscv64-unknown-linux-gnu' To avoid this error, only select HAVE_DYNAMIC_FTRACE when this option is not available. Fixes: afc76b8b8011 ("riscv: Using PATCHABLE_FUNCTION_ENTRY instead of MCOUNT") Link: https://github.com/ClangBuiltLinux/linux/issues/1268 Reported-by: kernel test robot Signed-off-by: Nathan Chancellor Reviewed-by: Fangrui Song Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 82c469b340c2..a840004d2829 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -231,7 +231,7 @@ config ARCH_RV64I bool "RV64I" select 64BIT select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && GCC_VERSION >= 50000 - select HAVE_DYNAMIC_FTRACE if MMU + select HAVE_DYNAMIC_FTRACE if MMU && $(cc-option,-fpatchable-function-entry=8) select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER From 8a07ac39f87d6c762006398029762c40e4d9d075 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:04:16 +0800 Subject: [PATCH 0652/1379] samples/kprobes: Add riscv support Add riscv specific info dump in both handler_pre() and handler_post(). Signed-off-by: Jisheng Zhang Acked-by: Masami Hiramatsu Signed-off-by: Palmer Dabbelt --- samples/kprobes/kprobe_example.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c index 331dcf151532..c495664c0a9b 100644 --- a/samples/kprobes/kprobe_example.c +++ b/samples/kprobes/kprobe_example.c @@ -47,6 +47,10 @@ static int __kprobes handler_pre(struct kprobe *p, struct pt_regs *regs) pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, cpsr = 0x%lx\n", p->symbol_name, p->addr, (long)regs->ARM_pc, (long)regs->ARM_cpsr); #endif +#ifdef CONFIG_RISCV + pr_info("<%s> pre_handler: p->addr = 0x%p, pc = 0x%lx, status = 0x%lx\n", + p->symbol_name, p->addr, regs->epc, regs->status); +#endif #ifdef CONFIG_S390 pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n", p->symbol_name, p->addr, regs->psw.addr, regs->flags); @@ -80,6 +84,10 @@ static void __kprobes handler_post(struct kprobe *p, struct pt_regs *regs, pr_info("<%s> post_handler: p->addr = 0x%p, cpsr = 0x%lx\n", p->symbol_name, p->addr, (long)regs->ARM_cpsr); #endif +#ifdef CONFIG_RISCV + pr_info("<%s> post_handler: p->addr = 0x%p, status = 0x%lx\n", + p->symbol_name, p->addr, regs->status); +#endif #ifdef CONFIG_S390 pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n", p->symbol_name, p->addr, regs->flags); From 2bfc6cd81bd17e4306e24ee47b9554c967bcb499 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sun, 11 Apr 2021 12:41:44 -0400 Subject: [PATCH 0653/1379] riscv: Move kernel mapping outside of linear mapping This is a preparatory patch for relocatable kernel and sv48 support. The kernel used to be linked at PAGE_OFFSET address therefore we could use the linear mapping for the kernel mapping. But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping so that we don't have to copy it at the same physical offset. The kernel mapping is moved to the last 2GB of the address space, BPF is now always after the kernel and modules use the 2GB memory range right before the kernel, so BPF and modules regions do not overlap. KASLR implementation will simply have to move the kernel in the last 2GB range and just take care of leaving enough space for BPF. In addition, by moving the kernel to the end of the address space, both sv39 and sv48 kernels will be exactly the same without needing to be relocated at runtime. Suggested-by: Arnd Bergmann Signed-off-by: Alexandre Ghiti [Palmer: Squash the STRICT_RWX fix, and a !MMU fix] Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h | 26 ++++++- arch/riscv/include/asm/pgtable.h | 39 +++++++--- arch/riscv/include/asm/set_memory.h | 1 + arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 6 +- arch/riscv/kernel/setup.c | 7 +- arch/riscv/kernel/vmlinux.lds.S | 3 +- arch/riscv/mm/fault.c | 13 ++++ arch/riscv/mm/init.c | 106 +++++++++++++++++++++++----- arch/riscv/mm/kasan_init.c | 9 +++ arch/riscv/mm/physaddr.c | 2 +- 12 files changed, 182 insertions(+), 36 deletions(-) diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . = KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index adc9d26f3d75..f64b61296c0c 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,15 +90,37 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +#ifdef CONFIG_64BIT +extern unsigned long va_kernel_pa_offset; +#endif extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #else #define va_pa_offset 0 +#ifdef CONFIG_64BIT +#define va_kernel_pa_offset 0 +#endif #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ -#define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#ifdef CONFIG_64BIT +extern unsigned long kernel_virt_addr; + +#define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + va_pa_offset)) +#define kernel_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + va_kernel_pa_offset)) +#define __pa_to_va_nodebug(x) linear_mapping_pa_to_va(x) + +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) ({ \ + unsigned long _x = x; \ + (_x < kernel_virt_addr) ? \ + linear_mapping_va_to_pa(_x) : kernel_mapping_va_to_pa(_x); \ + }) +#else +#define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) +#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#endif #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index ebf817c1bdf4..5afda75cc2c3 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,38 @@ #include -#ifndef __ASSEMBLY__ +#ifndef CONFIG_MMU +#define KERNEL_LINK_ADDR PAGE_OFFSET +#else -/* Page Upper Directory not used in RISC-V */ -#include -#include -#include -#include +#define ADDRESS_SPACE_END (UL(-1)) -#ifdef CONFIG_MMU +#ifdef CONFIG_64BIT +/* Leave 2GB for kernel and BPF at the end of the address space */ +#define KERNEL_LINK_ADDR (ADDRESS_SPACE_END - SZ_2G + 1) +#else +#define KERNEL_LINK_ADDR PAGE_OFFSET +#endif #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE (SZ_128M) +#ifdef CONFIG_64BIT +/* KASLR should leave at least 128MB for BPF after the kernel */ +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) +#else #define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) #define BPF_JIT_REGION_END (VMALLOC_END) +#endif + +/* Modules always live before the kernel */ +#ifdef CONFIG_64BIT +#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G) +#define MODULES_END (PFN_ALIGN((unsigned long)&_start)) +#endif /* * Roughly size the vmemmap space to be large enough to fit enough @@ -57,9 +72,16 @@ #define FIXADDR_SIZE PGDIR_SIZE #endif #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - #endif +#ifndef __ASSEMBLY__ + +/* Page Upper Directory not used in RISC-V */ +#include +#include +#include +#include + #ifdef CONFIG_64BIT #include #else @@ -484,6 +506,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define kern_addr_valid(addr) (1) /* FIXME */ +extern char _start[]; extern void *dtb_early_va; extern uintptr_t dtb_early_pa; void setup_bootmem(void); diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index 6887b3d9f371..a9c56776fa0e 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -17,6 +17,7 @@ int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); int set_memory_rw_nx(unsigned long addr, int numpages); void protect_kernel_text_data(void); +void protect_kernel_linear_mapping_text_rodata(void); #else static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index f5a9bad86e58..6cb05f22e52a 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -69,7 +69,8 @@ pe_head_start: #ifdef CONFIG_MMU relocate: /* Relocate return address */ - li a1, PAGE_OFFSET + la a1, kernel_virt_addr + REG_L a1, 0(a1) la a2, _start sub a1, a1, a2 add ra, ra, a1 diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 104fba889cf7..ce153771e5e9 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -408,12 +408,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, } #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) -#define VMALLOC_MODULE_START \ - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) void *module_alloc(unsigned long size) { - return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, - VMALLOC_END, GFP_KERNEL, + return __vmalloc_node_range(size, 1, MODULES_VADDR, + MODULES_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index e85bacff1b50..d208abc0b473 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -263,8 +263,13 @@ void __init setup_arch(char **cmdline_p) sbi_init(); - if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { protect_kernel_text_data(); +#if defined(CONFIG_64BIT) && defined(CONFIG_MMU) + protect_kernel_linear_mapping_text_rodata(); +#endif + } + #ifdef CONFIG_SWIOTLB swiotlb_init(1); #endif diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 7e61bc1dc36e..56677137c85b 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -4,7 +4,8 @@ * Copyright (C) 2017 SiFive */ -#define LOAD_OFFSET PAGE_OFFSET +#include +#define LOAD_OFFSET KERNEL_LINK_ADDR #include #include #include diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 8f17519208c7..1b14d523a95c 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -231,6 +231,19 @@ asmlinkage void do_page_fault(struct pt_regs *regs) return; } +#ifdef CONFIG_64BIT + /* + * Modules in 64bit kernels lie in their own virtual region which is not + * in the vmalloc region, but dealing with page faults in this region + * or the vmalloc region amounts to doing the same thing: checking that + * the mapping exists in init_mm.pgd and updating user page table, so + * just use vmalloc_fault. + */ + if (unlikely(addr >= MODULES_VADDR && addr < MODULES_END)) { + vmalloc_fault(regs, code, addr); + return; + } +#endif /* Enable interrupts if they were enabled in the parent context. */ if (likely(regs->status & SR_PIE)) local_irq_enable(); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 7f5036fbee8c..dc9b988e0778 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -25,6 +25,9 @@ #include "../kernel/head.h" +unsigned long kernel_virt_addr = KERNEL_LINK_ADDR; +EXPORT_SYMBOL(kernel_virt_addr); + unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -88,6 +91,10 @@ static void print_vm_layout(void) (unsigned long)VMALLOC_END); print_mlm("lowmem", (unsigned long)PAGE_OFFSET, (unsigned long)high_memory); +#ifdef CONFIG_64BIT + print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, + (unsigned long)ADDRESS_SPACE_END); +#endif } #else static void print_vm_layout(void) { } @@ -116,8 +123,13 @@ void __init setup_bootmem(void) /* The maximal physical memory size is -PAGE_OFFSET. */ memblock_enforce_memory_limit(-PAGE_OFFSET); - /* Reserve from the start of the kernel to the end of the kernel */ - memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); + /* + * Reserve from the start of the kernel to the end of the kernel + * and make sure we align the reservation on PMD_SIZE since we will + * map the kernel in the linear mapping as read-only: we do not want + * any allocation to happen between _end and the next pmd aligned page. + */ + memblock_reserve(vmlinux_start, (vmlinux_end - vmlinux_start + PMD_SIZE - 1) & PMD_MASK); /* * memblock allocator is not aware of the fact that last 4K bytes of @@ -152,8 +164,14 @@ void __init setup_bootmem(void) #ifdef CONFIG_MMU static struct pt_alloc_ops pt_ops; +/* Offset between linear mapping virtual address and kernel load address */ unsigned long va_pa_offset; EXPORT_SYMBOL(va_pa_offset); +#ifdef CONFIG_64BIT +/* Offset between kernel mapping virtual address and kernel load address */ +unsigned long va_kernel_pa_offset; +EXPORT_SYMBOL(va_kernel_pa_offset); +#endif unsigned long pfn_base; EXPORT_SYMBOL(pfn_base); @@ -257,7 +275,7 @@ static pmd_t *get_pmd_virt_late(phys_addr_t pa) static phys_addr_t __init alloc_pmd_early(uintptr_t va) { - BUG_ON((va - PAGE_OFFSET) >> PGDIR_SHIFT); + BUG_ON((va - kernel_virt_addr) >> PGDIR_SHIFT); return (uintptr_t)early_pmd; } @@ -372,17 +390,34 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." #endif +uintptr_t load_pa, load_sz; + +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) +{ + uintptr_t va, end_va; + + end_va = kernel_virt_addr + load_sz; + for (va = kernel_virt_addr; va < end_va; va += map_size) + create_pgd_mapping(pgdir, va, + load_pa + (va - kernel_virt_addr), + map_size, PAGE_KERNEL_EXEC); +} + asmlinkage void __init setup_vm(uintptr_t dtb_pa) { - uintptr_t va, pa, end_va; - uintptr_t load_pa = (uintptr_t)(&_start); - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; + uintptr_t pa; uintptr_t map_size; #ifndef __PAGETABLE_PMD_FOLDED pmd_t fix_bmap_spmd, fix_bmap_epmd; #endif + load_pa = (uintptr_t)(&_start); + load_sz = (uintptr_t)(&_end) - load_pa; va_pa_offset = PAGE_OFFSET - load_pa; +#ifdef CONFIG_64BIT + va_kernel_pa_offset = kernel_virt_addr - load_pa; +#endif + pfn_base = PFN_DOWN(load_pa); /* @@ -410,26 +445,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_pmd_mapping(fixmap_pmd, FIXADDR_START, (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); /* Setup trampoline PGD and PMD */ - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); #else /* Setup trampoline PGD */ - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); #endif /* - * Setup early PGD covering entire kernel which will allows + * Setup early PGD covering entire kernel which will allow * us to reach paging_init(). We map all memory banks later * in setup_vm_final() below. */ - end_va = PAGE_OFFSET + load_sz; - for (va = PAGE_OFFSET; va < end_va; va += map_size) - create_pgd_mapping(early_pg_dir, va, - load_pa + (va - PAGE_OFFSET), - map_size, PAGE_KERNEL_EXEC); + create_kernel_page_table(early_pg_dir, map_size); #ifndef __PAGETABLE_PMD_FOLDED /* Setup early PMD for DTB */ @@ -444,7 +475,16 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pa + PMD_SIZE, PMD_SIZE, PAGE_KERNEL); dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PMD_SIZE - 1)); #else /* CONFIG_BUILTIN_DTB */ +#ifdef CONFIG_64BIT + /* + * __va can't be used since it would return a linear mapping address + * whereas dtb_early_va will be used before setup_vm_final installs + * the linear mapping. + */ + dtb_early_va = kernel_mapping_pa_to_va(dtb_pa); +#else dtb_early_va = __va(dtb_pa); +#endif /* CONFIG_64BIT */ #endif /* CONFIG_BUILTIN_DTB */ #else #ifndef CONFIG_BUILTIN_DTB @@ -456,7 +496,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pa + PGDIR_SIZE, PGDIR_SIZE, PAGE_KERNEL); dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); #else /* CONFIG_BUILTIN_DTB */ +#ifdef CONFIG_64BIT + dtb_early_va = kernel_mapping_pa_to_va(dtb_pa); +#else dtb_early_va = __va(dtb_pa); +#endif /* CONFIG_64BIT */ #endif /* CONFIG_BUILTIN_DTB */ #endif dtb_early_pa = dtb_pa; @@ -492,6 +536,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) #endif } +#ifdef CONFIG_64BIT +void protect_kernel_linear_mapping_text_rodata(void) +{ + unsigned long text_start = (unsigned long)lm_alias(_start); + unsigned long init_text_start = (unsigned long)lm_alias(__init_text_begin); + unsigned long rodata_start = (unsigned long)lm_alias(__start_rodata); + unsigned long data_start = (unsigned long)lm_alias(_data); + + set_memory_ro(text_start, (init_text_start - text_start) >> PAGE_SHIFT); + set_memory_nx(text_start, (init_text_start - text_start) >> PAGE_SHIFT); + + set_memory_ro(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (data_start - rodata_start) >> PAGE_SHIFT); +} +#endif + static void __init setup_vm_final(void) { uintptr_t va, map_size; @@ -513,7 +573,7 @@ static void __init setup_vm_final(void) __pa_symbol(fixmap_pgd_next), PGDIR_SIZE, PAGE_TABLE); - /* Map all memory banks */ + /* Map all memory banks in the linear mapping */ for_each_mem_range(i, &start, &end) { if (start >= end) break; @@ -525,10 +585,22 @@ static void __init setup_vm_final(void) for (pa = start; pa < end; pa += map_size) { va = (uintptr_t)__va(pa); create_pgd_mapping(swapper_pg_dir, va, pa, - map_size, PAGE_KERNEL_EXEC); + map_size, +#ifdef CONFIG_64BIT + PAGE_KERNEL +#else + PAGE_KERNEL_EXEC +#endif + ); + } } +#ifdef CONFIG_64BIT + /* Map the kernel */ + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); +#endif + /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); clear_fixmap(FIX_PMD); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 2c39f0386673..28f4d52cf17e 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -171,6 +171,10 @@ void __init kasan_init(void) phys_addr_t _start, _end; u64 i; + /* + * Populate all kernel virtual address space with kasan_early_shadow_page + * except for the linear mapping and the modules/kernel/BPF mapping. + */ kasan_populate_early_shadow((void *)KASAN_SHADOW_START, (void *)kasan_mem_to_shadow((void *) VMEMMAP_END)); @@ -183,6 +187,7 @@ void __init kasan_init(void) (void *)kasan_mem_to_shadow((void *)VMALLOC_START), (void *)kasan_mem_to_shadow((void *)VMALLOC_END)); + /* Populate the linear mapping */ for_each_mem_range(i, &_start, &_end) { void *start = (void *)__va(_start); void *end = (void *)__va(_end); @@ -193,6 +198,10 @@ void __init kasan_init(void) kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); }; + /* Populate kernel, BPF, modules mapping */ + kasan_populate(kasan_mem_to_shadow((const void *)MODULES_VADDR), + kasan_mem_to_shadow((const void *)BPF_JIT_REGION_END)); + for (i = 0; i < PTRS_PER_PTE; i++) set_pte(&kasan_early_shadow_pte[i], mk_pte(virt_to_page(kasan_early_shadow_page), diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c index e8e4dcd39fed..35703d5ef5fd 100644 --- a/arch/riscv/mm/physaddr.c +++ b/arch/riscv/mm/physaddr.c @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); phys_addr_t __phys_addr_symbol(unsigned long x) { - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; + unsigned long kernel_start = (unsigned long)kernel_virt_addr; unsigned long kernel_end = (unsigned long)_end; /* From 2a433cf8f3cdb26d9e4f137db5d3b31aed6a2ca7 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sun, 11 Apr 2021 12:41:45 -0400 Subject: [PATCH 0654/1379] Documentation: riscv: Add documentation that describes the VM layout This new document presents the RISC-V virtual memory layout and is based one the x86 one: it describes the different limits of the different regions of the virtual address space. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- Documentation/riscv/index.rst | 1 + Documentation/riscv/vm-layout.rst | 63 +++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 Documentation/riscv/vm-layout.rst diff --git a/Documentation/riscv/index.rst b/Documentation/riscv/index.rst index 6e6e39482502..ea915c196048 100644 --- a/Documentation/riscv/index.rst +++ b/Documentation/riscv/index.rst @@ -6,6 +6,7 @@ RISC-V architecture :maxdepth: 1 boot-image-header + vm-layout pmu patch-acceptance diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst new file mode 100644 index 000000000000..329d32098af4 --- /dev/null +++ b/Documentation/riscv/vm-layout.rst @@ -0,0 +1,63 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Virtual Memory Layout on RISC-V Linux +===================================== + +:Author: Alexandre Ghiti +:Date: 12 February 2021 + +This document describes the virtual memory layout used by the RISC-V Linux +Kernel. + +RISC-V Linux Kernel 32bit +========================= + +RISC-V Linux Kernel SV32 +------------------------ + +TODO + +RISC-V Linux Kernel 64bit +========================= + +The RISC-V privileged architecture document states that the 64bit addresses +"must have bits 63–48 all equal to bit 47, or else a page-fault exception will +occur.": that splits the virtual address space into 2 halves separated by a very +big hole, the lower half is where the userspace resides, the upper half is where +the RISC-V Linux Kernel resides. + +RISC-V Linux Kernel SV39 +------------------------ + +:: + + ======================================================================================================================== + Start addr | Offset | End addr | Size | VM area description + ======================================================================================================================== + | | | | + 0000000000000000 | 0 | 0000003fffffffff | 256 GB | user-space virtual memory, different per mm + __________________|____________|__________________|_________|___________________________________________________________ + | | | | + 0000004000000000 | +256 GB | ffffffbfffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical + | | | | virtual memory addresses up to the -256 GB + | | | | starting offset of kernel mappings. + __________________|____________|__________________|_________|___________________________________________________________ + | + | Kernel-space virtual memory, shared between all processes: + ____________________________________________________________|___________________________________________________________ + | | | | + ffffffc000000000 | -256 GB | ffffffc7ffffffff | 32 GB | kasan + ffffffcefee00000 | -196 GB | ffffffcefeffffff | 2 MB | fixmap + ffffffceff000000 | -196 GB | ffffffceffffffff | 16 MB | PCI io + ffffffcf00000000 | -196 GB | ffffffcfffffffff | 4 GB | vmemmap + ffffffd000000000 | -192 GB | ffffffdfffffffff | 64 GB | vmalloc/ioremap space + ffffffe000000000 | -128 GB | ffffffff7fffffff | 124 GB | direct mapping of all physical memory + __________________|____________|__________________|_________|____________________________________________________________ + | + | + ____________________________________________________________|____________________________________________________________ + | | | | + ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | modules + ffffffff80000000 | -2 GB | ffffffffffffffff | 2 GB | kernel, BPF + __________________|____________|__________________|_________|____________________________________________________________ From 0df68ce4c26a48115a9e8d45e24f18d964a10050 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sun, 11 Apr 2021 12:41:46 -0400 Subject: [PATCH 0655/1379] riscv: Prepare ptdump for vm layout dynamic addresses This is a preparatory patch for sv48 support that will introduce dynamic PAGE_OFFSET. Dynamic PAGE_OFFSET implies that all zones (vmalloc, vmemmap, fixaddr...) whose addresses depend on PAGE_OFFSET become dynamic and can't be used to statically initialize the array used by ptdump to identify the different zones of the vm layout. Signed-off-by: Alexandre Ghiti Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/ptdump.c | 73 +++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 12 deletions(-) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index ace74dec7492..0aba4421115c 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -58,29 +58,56 @@ struct ptd_mm_info { unsigned long end; }; +enum address_markers_idx { +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif + FIXMAP_START_NR, + FIXMAP_END_NR, + PCI_IO_START_NR, + PCI_IO_END_NR, +#ifdef CONFIG_SPARSEMEM_VMEMMAP + VMEMMAP_START_NR, + VMEMMAP_END_NR, +#endif + VMALLOC_START_NR, + VMALLOC_END_NR, + PAGE_OFFSET_NR, +#ifdef CONFIG_64BIT + MODULES_MAPPING_NR, +#endif + KERNEL_MAPPING_NR, + END_OF_SPACE_NR +}; + static struct addr_marker address_markers[] = { #ifdef CONFIG_KASAN - {KASAN_SHADOW_START, "Kasan shadow start"}, - {KASAN_SHADOW_END, "Kasan shadow end"}, + {0, "Kasan shadow start"}, + {0, "Kasan shadow end"}, #endif - {FIXADDR_START, "Fixmap start"}, - {FIXADDR_TOP, "Fixmap end"}, - {PCI_IO_START, "PCI I/O start"}, - {PCI_IO_END, "PCI I/O end"}, + {0, "Fixmap start"}, + {0, "Fixmap end"}, + {0, "PCI I/O start"}, + {0, "PCI I/O end"}, #ifdef CONFIG_SPARSEMEM_VMEMMAP - {VMEMMAP_START, "vmemmap start"}, - {VMEMMAP_END, "vmemmap end"}, + {0, "vmemmap start"}, + {0, "vmemmap end"}, #endif - {VMALLOC_START, "vmalloc() area"}, - {VMALLOC_END, "vmalloc() end"}, - {PAGE_OFFSET, "Linear mapping"}, + {0, "vmalloc() area"}, + {0, "vmalloc() end"}, + {0, "Linear mapping"}, +#ifdef CONFIG_64BIT + {0, "Modules mapping"}, +#endif + {0, "Kernel mapping (kernel, BPF)"}, {-1, NULL}, }; static struct ptd_mm_info kernel_ptd_info = { .mm = &init_mm, .markers = address_markers, - .base_addr = KERN_VIRT_START, + .base_addr = 0, .end = ULONG_MAX, }; @@ -335,6 +362,28 @@ static int ptdump_init(void) { unsigned int i, j; +#ifdef CONFIG_KASAN + address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; + address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; +#endif + address_markers[FIXMAP_START_NR].start_address = FIXADDR_START; + address_markers[FIXMAP_END_NR].start_address = FIXADDR_TOP; + address_markers[PCI_IO_START_NR].start_address = PCI_IO_START; + address_markers[PCI_IO_END_NR].start_address = PCI_IO_END; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START; + address_markers[VMEMMAP_END_NR].start_address = VMEMMAP_END; +#endif + address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; + address_markers[PAGE_OFFSET_NR].start_address = PAGE_OFFSET; +#ifdef CONFIG_64BIT + address_markers[MODULES_MAPPING_NR].start_address = MODULES_VADDR; +#endif + address_markers[KERNEL_MAPPING_NR].start_address = kernel_virt_addr; + + kernel_ptd_info.base_addr = KERN_VIRT_START; + for (i = 0; i < ARRAY_SIZE(pg_level); i++) for (j = 0; j < ARRAY_SIZE(pte_bits); j++) pg_level[i].mask |= pte_bits[j].mask; From 1987501b1130c6b4b7e1cef4b9c1dc9a8adae025 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:22:21 +0800 Subject: [PATCH 0656/1379] riscv: add __init section marker to some functions They are not needed after booting, so mark them as __init to move them to the __init section. Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 2 +- arch/riscv/mm/init.c | 6 +++--- arch/riscv/mm/kasan_init.c | 6 +++--- arch/riscv/mm/ptdump.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 3ed2c23601a0..6272da7b19d2 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -195,6 +195,6 @@ int is_valid_bugaddr(unsigned long pc) #endif /* CONFIG_GENERIC_BUG */ /* stvec & scratch is already set from head.S */ -void trap_init(void) +void __init trap_init(void) { } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index dc9b988e0778..3de7ec030342 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -60,7 +60,7 @@ static void __init zone_sizes_init(void) free_area_init(max_zone_pfns); } -static void setup_zero_page(void) +static void __init setup_zero_page(void) { memset((void *)empty_zero_page, 0, PAGE_SIZE); } @@ -78,7 +78,7 @@ static inline void print_mlm(char *name, unsigned long b, unsigned long t) (((t) - (b)) >> 20)); } -static void print_vm_layout(void) +static void __init print_vm_layout(void) { pr_notice("Virtual kernel memory layout:\n"); print_mlk("fixmap", (unsigned long)FIXADDR_START, @@ -630,7 +630,7 @@ static inline void setup_vm_final(void) #endif /* CONFIG_MMU */ #ifdef CONFIG_STRICT_KERNEL_RWX -void protect_kernel_text_data(void) +void __init protect_kernel_text_data(void) { unsigned long text_start = (unsigned long)_start; unsigned long init_text_start = (unsigned long)__init_text_begin; diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 28f4d52cf17e..4fa412a314f9 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -48,7 +48,7 @@ asmlinkage void __init kasan_early_init(void) local_flush_tlb_all(); } -static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pte_t *ptep, *base_pte; @@ -70,7 +70,7 @@ static void kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long en set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); } -static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pmd_t *pmdp, *base_pmd; @@ -105,7 +105,7 @@ static void kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long en set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); } -static void kasan_populate_pgd(unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pgd_t *pgdp = pgd_offset_k(vaddr); diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 0aba4421115c..92179598d8f8 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -358,7 +358,7 @@ static int ptdump_show(struct seq_file *m, void *v) DEFINE_SHOW_ATTRIBUTE(ptdump); -static int ptdump_init(void) +static int __init ptdump_init(void) { unsigned int i, j; From de31ea4a1181a8bb4d32ab74f3434f2bc2b79122 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:22:51 +0800 Subject: [PATCH 0657/1379] riscv: Mark some global variables __ro_after_init All of these are never modified after init, so they can be __ro_after_init. Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sbi.c | 8 ++++---- arch/riscv/kernel/smp.c | 4 ++-- arch/riscv/kernel/time.c | 2 +- arch/riscv/kernel/vdso.c | 4 ++-- arch/riscv/mm/init.c | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index fa0dd881fc5e..a404d648ef79 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -11,14 +11,14 @@ #include /* default SBI version is 0.1 */ -unsigned long sbi_spec_version = SBI_SPEC_VERSION_DEFAULT; +unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT; EXPORT_SYMBOL(sbi_spec_version); -static void (*__sbi_set_timer)(uint64_t stime); -static int (*__sbi_send_ipi)(const unsigned long *hart_mask); +static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init; +static int (*__sbi_send_ipi)(const unsigned long *hart_mask) __ro_after_init; static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask, unsigned long start, unsigned long size, - unsigned long arg4, unsigned long arg5); + unsigned long arg4, unsigned long arg5) __ro_after_init; struct sbiret sbi_ecall(int ext, int fid, unsigned long arg0, unsigned long arg1, unsigned long arg2, diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 8325d33411d8..476e2e4bc5c5 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -32,7 +32,7 @@ enum ipi_message_type { IPI_MAX }; -unsigned long __cpuid_to_hartid_map[NR_CPUS] = { +unsigned long __cpuid_to_hartid_map[NR_CPUS] __ro_after_init = { [0 ... NR_CPUS-1] = INVALID_HARTID }; @@ -87,7 +87,7 @@ static void ipi_stop(void) wait_for_interrupt(); } -static struct riscv_ipi_ops *ipi_ops; +static struct riscv_ipi_ops *ipi_ops __ro_after_init; void riscv_set_ipi_ops(struct riscv_ipi_ops *ops) { diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 8a5cf99c0776..302ceafe0f81 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -10,7 +10,7 @@ #include #include -unsigned long riscv_timebase; +unsigned long riscv_timebase __ro_after_init; EXPORT_SYMBOL_GPL(riscv_timebase); void __init time_init(void) diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 3f1d35e7c98a..25a3b8849599 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -20,8 +20,8 @@ extern char vdso_start[], vdso_end[]; -static unsigned int vdso_pages; -static struct page **vdso_pagelist; +static unsigned int vdso_pages __ro_after_init; +static struct page **vdso_pagelist __ro_after_init; /* * The vDSO data page. diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 3de7ec030342..9914ca0d789b 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -162,17 +162,17 @@ void __init setup_bootmem(void) } #ifdef CONFIG_MMU -static struct pt_alloc_ops pt_ops; +static struct pt_alloc_ops pt_ops __ro_after_init; /* Offset between linear mapping virtual address and kernel load address */ -unsigned long va_pa_offset; +unsigned long va_pa_offset __ro_after_init; EXPORT_SYMBOL(va_pa_offset); #ifdef CONFIG_64BIT /* Offset between kernel mapping virtual address and kernel load address */ unsigned long va_kernel_pa_offset; EXPORT_SYMBOL(va_kernel_pa_offset); #endif -unsigned long pfn_base; +unsigned long pfn_base __ro_after_init; EXPORT_SYMBOL(pfn_base); pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss; From e6a302248cec96c3af4cbfcedc44b0de8a26ebe0 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:23:24 +0800 Subject: [PATCH 0658/1379] riscv: Constify sys_call_table Constify the sys_call_table so that it will be placed in the .rodata section. This will cause attempts to modify the table to fail when strict page permissions are in place. Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/syscall.h | 2 +- arch/riscv/kernel/syscall_table.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index 49350c8bd7b0..b933b1583c9f 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -15,7 +15,7 @@ #include /* The array of function pointers for syscalls. */ -extern void *sys_call_table[]; +extern void * const sys_call_table[]; /* * Only the low 32 bits of orig_r0 are meaningful, so we return int. diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index f1ead9df96ca..a63c667c27b3 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -13,7 +13,7 @@ #undef __SYSCALL #define __SYSCALL(nr, call) [nr] = (call), -void *sys_call_table[__NR_syscalls] = { +void * const sys_call_table[__NR_syscalls] = { [0 ... __NR_syscalls - 1] = sys_ni_syscall, #include }; From 300f62c37d4601e5b7967c6399917dc6880070bc Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:23:54 +0800 Subject: [PATCH 0659/1379] riscv: Constify sbi_ipi_ops Constify the sbi_ipi_ops so that it will be placed in the .rodata section. This will cause attempts to modify it to fail when strict page permissions are in place. Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/smp.h | 4 ++-- arch/riscv/kernel/sbi.c | 2 +- arch/riscv/kernel/smp.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h index df1f7c4cd433..a7d2811f3536 100644 --- a/arch/riscv/include/asm/smp.h +++ b/arch/riscv/include/asm/smp.h @@ -46,7 +46,7 @@ int riscv_hartid_to_cpuid(int hartid); void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out); /* Set custom IPI operations */ -void riscv_set_ipi_ops(struct riscv_ipi_ops *ops); +void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops); /* Clear IPI for current CPU */ void riscv_clear_ipi(void); @@ -92,7 +92,7 @@ static inline void riscv_cpuid_to_hartid_mask(const struct cpumask *in, cpumask_set_cpu(boot_cpu_hartid, out); } -static inline void riscv_set_ipi_ops(struct riscv_ipi_ops *ops) +static inline void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops) { } diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index a404d648ef79..38f94926f2ee 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -571,7 +571,7 @@ static void sbi_send_cpumask_ipi(const struct cpumask *target) sbi_send_ipi(cpumask_bits(&hartid_mask)); } -static struct riscv_ipi_ops sbi_ipi_ops = { +static const struct riscv_ipi_ops sbi_ipi_ops = { .ipi_inject = sbi_send_cpumask_ipi }; diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 476e2e4bc5c5..366cb87c0e2e 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -87,9 +87,9 @@ static void ipi_stop(void) wait_for_interrupt(); } -static struct riscv_ipi_ops *ipi_ops __ro_after_init; +static const struct riscv_ipi_ops *ipi_ops __ro_after_init; -void riscv_set_ipi_ops(struct riscv_ipi_ops *ops) +void riscv_set_ipi_ops(const struct riscv_ipi_ops *ops) { ipi_ops = ops; } From cdd1b2bd358ffda2638fe18ff47191e84e18525f Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:24:21 +0800 Subject: [PATCH 0660/1379] riscv: kprobes: Implement alloc_insn_page() Allocate PAGE_KERNEL_READ_EXEC(read only, executable) page for kprobes insn page. This is to prepare for STRICT_MODULE_RWX. Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/kprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index a2ec18662fee..0b451ff76d55 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -84,6 +84,14 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return 0; } +void *alloc_insn_page(void) +{ + return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_READ_EXEC, + VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, + __builtin_return_address(0)); +} + /* install breakpoint in text */ void __kprobes arch_arm_kprobe(struct kprobe *p) { From 1d27d854425faec98f352cf88ec3e2a8844429a4 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:24:54 +0800 Subject: [PATCH 0661/1379] riscv: bpf: Move bpf_jit_alloc_exec() and bpf_jit_free_exec() to core We will drop the executable permissions of the code pages from the mapping at allocation time soon. Move bpf_jit_alloc_exec() and bpf_jit_free_exec() to bpf_jit_core.c so that they can be shared by both RV64I and RV32I. Signed-off-by: Jisheng Zhang Acked-by: Luke Nelson Signed-off-by: Palmer Dabbelt --- arch/riscv/net/bpf_jit_comp64.c | 13 ------------- arch/riscv/net/bpf_jit_core.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index b44ff52f84a6..87e3bf5b9086 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1148,16 +1148,3 @@ void bpf_jit_build_epilogue(struct rv_jit_context *ctx) { __build_epilogue(false, ctx); } - -void *bpf_jit_alloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, - BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, - __builtin_return_address(0)); -} - -void bpf_jit_free_exec(void *addr) -{ - return vfree(addr); -} diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 3630d447352c..d8da819290b7 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -164,3 +164,16 @@ out: tmp : orig_prog); return prog; } + +void *bpf_jit_alloc_exec(unsigned long size) +{ + return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, + BPF_JIT_REGION_END, GFP_KERNEL, + PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + __builtin_return_address(0)); +} + +void bpf_jit_free_exec(void *addr) +{ + return vfree(addr); +} From fc8504765ec5e812135b8ccafca7101069a0c6d8 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:25:21 +0800 Subject: [PATCH 0662/1379] riscv: bpf: Avoid breaking W^X We allocate Non-executable pages, then call bpf_jit_binary_lock_ro() to enable executable permission after mapping them read-only. This is to prepare for STRICT_MODULE_RWX in following patch. Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/net/bpf_jit_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index d8da819290b7..fed86f42dfbe 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -152,6 +152,7 @@ skip_init_ctx: bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns); if (!prog->is_func || extra_pass) { + bpf_jit_binary_lock_ro(jit_data->header); out_offset: kfree(ctx->offset); kfree(jit_data); @@ -169,7 +170,7 @@ void *bpf_jit_alloc_exec(unsigned long size) { return __vmalloc_node_range(size, PAGE_SIZE, BPF_JIT_REGION_START, BPF_JIT_REGION_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } From 5387054b986e2d0d994b519020d81b8aa64789c5 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:25:51 +0800 Subject: [PATCH 0663/1379] riscv: module: Create module allocations without exec permissions The core code manages the executable permissions of code regions of modules explicitly, it is not necessary to create the module vmalloc regions with RWX permissions. Create them with RW- permissions instead. Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index ce153771e5e9..68a9e3d1fe16 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -412,7 +412,7 @@ void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); } #endif From a9451b8e19716cf8bf420a1d0e58199558ecaeb5 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 30 Mar 2021 02:26:17 +0800 Subject: [PATCH 0664/1379] riscv: Set ARCH_HAS_STRICT_MODULE_RWX if MMU Now we can set ARCH_HAS_STRICT_MODULE_RWX for MMU riscv platforms, this is good from security perspective. Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a840004d2829..847ab3c32fc0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -29,6 +29,7 @@ config RISCV select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX if MMU + select ARCH_HAS_STRICT_MODULE_RWX if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT From b1ebaa0e1318494a7637099a26add50509e37964 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Tue, 30 Mar 2021 16:18:48 +0800 Subject: [PATCH 0665/1379] riscv/kprobe: fix kernel panic when invoking sys_read traced by kprobe The execution of sys_read end up hitting a BUG_ON() in __find_get_block after installing kprobe at sys_read, the BUG message like the following: [ 65.708663] ------------[ cut here ]------------ [ 65.709987] kernel BUG at fs/buffer.c:1251! [ 65.711283] Kernel BUG [#1] [ 65.712032] Modules linked in: [ 65.712925] CPU: 0 PID: 51 Comm: sh Not tainted 5.12.0-rc4 #1 [ 65.714407] Hardware name: riscv-virtio,qemu (DT) [ 65.715696] epc : __find_get_block+0x218/0x2c8 [ 65.716835] ra : __getblk_gfp+0x1c/0x4a [ 65.717831] epc : ffffffe00019f11e ra : ffffffe00019f56a sp : ffffffe002437930 [ 65.719553] gp : ffffffe000f06030 tp : ffffffe0015abc00 t0 : ffffffe00191e038 [ 65.721290] t1 : ffffffe00191e038 t2 : 000000000000000a s0 : ffffffe002437960 [ 65.723051] s1 : ffffffe00160ad00 a0 : ffffffe00160ad00 a1 : 000000000000012a [ 65.724772] a2 : 0000000000000400 a3 : 0000000000000008 a4 : 0000000000000040 [ 65.726545] a5 : 0000000000000000 a6 : ffffffe00191e000 a7 : 0000000000000000 [ 65.728308] s2 : 000000000000012a s3 : 0000000000000400 s4 : 0000000000000008 [ 65.730049] s5 : 000000000000006c s6 : ffffffe00240f800 s7 : ffffffe000f080a8 [ 65.731802] s8 : 0000000000000001 s9 : 000000000000012a s10: 0000000000000008 [ 65.733516] s11: 0000000000000008 t3 : 00000000000003ff t4 : 000000000000000f [ 65.734434] t5 : 00000000000003ff t6 : 0000000000040000 [ 65.734613] status: 0000000000000100 badaddr: 0000000000000000 cause: 0000000000000003 [ 65.734901] Call Trace: [ 65.735076] [] __find_get_block+0x218/0x2c8 [ 65.735417] [] __ext4_get_inode_loc+0xb2/0x2f6 [ 65.735618] [] ext4_get_inode_loc+0x3a/0x8a [ 65.735802] [] ext4_reserve_inode_write+0x2e/0x8c [ 65.735999] [] __ext4_mark_inode_dirty+0x4c/0x18e [ 65.736208] [] ext4_dirty_inode+0x46/0x66 [ 65.736387] [] __mark_inode_dirty+0x12c/0x3da [ 65.736576] [] touch_atime+0x146/0x150 [ 65.736748] [] filemap_read+0x234/0x246 [ 65.736920] [] generic_file_read_iter+0xc0/0x114 [ 65.737114] [] ext4_file_read_iter+0x42/0xea [ 65.737310] [] new_sync_read+0xe2/0x15a [ 65.737483] [] vfs_read+0xca/0xf2 [ 65.737641] [] ksys_read+0x5e/0xc8 [ 65.737816] [] sys_read+0xe/0x16 [ 65.737973] [] ret_from_syscall+0x0/0x2 [ 65.738858] ---[ end trace fe93f985456c935d ]--- A simple reproducer looks like: echo 'p:myprobe sys_read fd=%a0 buf=%a1 count=%a2' > /sys/kernel/debug/tracing/kprobe_events echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable cat /sys/kernel/debug/tracing/trace Here's what happens to hit that BUG_ON(): 1) After installing kprobe at entry of sys_read, the first instruction is replaced by 'ebreak' instruction on riscv64 platform. 2) Once kernel reach the 'ebreak' instruction at the entry of sys_read, it trap into the riscv breakpoint handler, where it do something to setup for coming single-step of origin instruction, including backup the 'sstatus' in pt_regs, followed by disable interrupt during single stepping via clear 'SIE' bit of 'sstatus' in pt_regs. 3) Then kernel restore to the instruction slot contains two instructions, one is original instruction at entry of sys_read, the other is 'ebreak'. Here it trigger a 'Instruction page fault' exception (value at 'scause' is '0xc'), if PF is not filled into PageTabe for that slot yet. 4) Again kernel trap into page fault exception handler, where it choose different policy according to the state of running kprobe. Because afte 2) the state is KPROBE_HIT_SS, so kernel reset the current kprobe and 'pc' points back to the probe address. 5) Because 'epc' point back to 'ebreak' instrution at sys_read probe, kernel trap into breakpoint handler again, and repeat the operations at 2), however 'sstatus' without 'SIE' is keep at 4), it cause the real 'sstatus' saved at 2) is overwritten by the one withou 'SIE'. 6) When kernel cross the probe the 'sstatus' CSR restore with value without 'SIE', and reach __find_get_block where it requires the interrupt must be enabled. Fix this is very trivial, just restore the value of 'sstatus' in pt_regs with backup one at 2) when the instruction being single stepped cause a page fault. Fixes: c22b0bcb1dd02 ("riscv: Add kprobes supported") Signed-off-by: Liao Chang Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/kprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 0b451ff76d55..2610febc5012 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -269,8 +269,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr) if (kcb->kprobe_status == KPROBE_REENTER) restore_previous_kprobe(kcb); - else + else { + kprobes_restore_local_irqflag(kcb, regs); reset_current_kprobe(); + } break; case KPROBE_HIT_ACTIVE: From e75e6bf47a4723ce16f65c7387c20a8c18a1c13b Mon Sep 17 00:00:00 2001 From: zhouchuangao Date: Tue, 30 Mar 2021 06:56:26 -0700 Subject: [PATCH 0666/1379] riscv/mm: Use BUG_ON instead of if condition followed by BUG. BUG_ON() uses unlikely in if(), which can be optimized at compile time. Signed-off-by: zhouchuangao Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 9914ca0d789b..a2f4c148f02e 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -232,8 +232,8 @@ static phys_addr_t alloc_pte_late(uintptr_t va) unsigned long vaddr; vaddr = __get_free_page(GFP_KERNEL); - if (!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr))) - BUG(); + BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page(vaddr))); + return __pa(vaddr); } From 772d7891e8b3b0baae7bb88a294d61fd07ba6d15 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 2 Apr 2021 21:29:08 +0800 Subject: [PATCH 0667/1379] riscv: vdso: fix and clean-up Makefile Running "make" on an already compiled kernel tree will rebuild the kernel even without any modifications: CALL linux/scripts/checksyscalls.sh CALL linux/scripts/atomic/check-atomics.sh CHK include/generated/compile.h SO2S arch/riscv/kernel/vdso/vdso-syms.S AS arch/riscv/kernel/vdso/vdso-syms.o AR arch/riscv/kernel/vdso/built-in.a AR arch/riscv/kernel/built-in.a AR arch/riscv/built-in.a GEN .version CHK include/generated/compile.h UPD include/generated/compile.h CC init/version.o AR init/built-in.a LD vmlinux.o The reason is "Any target that utilizes if_changed must be listed in $(targets), otherwise the command line check will fail, and the target will always be built" as explained by Documentation/kbuild/makefiles.rst Fix this build bug by adding vdso-syms.S to $(targets) At the same time, there are two trivial clean up modifications: - the vdso-dummy.o is not needed any more after so remove it. - vdso.lds is a generated file, so it should be prefixed with $(obj)/ instead of $(src)/ Fixes: c2c81bb2f691 ("RISC-V: Fix the VDSO symbol generaton for binutils-2.35+") Cc: stable@vger.kernel.org Signed-off-by: Jisheng Zhang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index ca2b40dfd24b..24d936c147cd 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -23,7 +23,7 @@ ifneq ($(c-gettimeofday-y),) endif # Build rules -targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-dummy.o +targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.lds vdso-syms.S obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) obj-y += vdso.o vdso-syms.o @@ -41,7 +41,7 @@ KASAN_SANITIZE := n $(obj)/vdso.o: $(obj)/vdso.so # link rule for the .so file, .lds has to be first -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) FORCE +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold) LDFLAGS_vdso.so.dbg = -shared -s -soname=linux-vdso.so.1 \ --build-id=sha1 --hash-style=both --eh-frame-hdr From d83e682e301071313e390e2f5ba2f6ca2ebc1848 Mon Sep 17 00:00:00 2001 From: Nick Kossifidis Date: Mon, 19 Apr 2021 03:55:35 +0300 Subject: [PATCH 0668/1379] RISC-V: Add EM_RISCV to kexec UAPI header Add RISC-V to the list of supported kexec architectures, we need to add the definition early-on so that later patches can use it. EM_RISCV is 243 as per ELF psABI specification here: https://github.com/riscv/riscv-elf-psabi-doc/blob/master/riscv-elf.md Signed-off-by: Nick Kossifidis Signed-off-by: Palmer Dabbelt --- include/uapi/linux/kexec.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 05669c87a0af..778dc191c265 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -42,6 +42,7 @@ #define KEXEC_ARCH_MIPS_LE (10 << 16) #define KEXEC_ARCH_MIPS ( 8 << 16) #define KEXEC_ARCH_AARCH64 (183 << 16) +#define KEXEC_ARCH_RISCV (243 << 16) /* The artificial cap on the number of segments passed to kexec_load. */ #define KEXEC_SEGMENT_MAX 16 From fba8a8674f68a0628abae470dfcfbcb4a0d7a79e Mon Sep 17 00:00:00 2001 From: Nick Kossifidis Date: Mon, 19 Apr 2021 03:55:36 +0300 Subject: [PATCH 0669/1379] RISC-V: Add kexec support This patch adds support for kexec on RISC-V. On SMP systems it depends on HOTPLUG_CPU in order to be able to bring up all harts after kexec. It also needs a recent OpenSBI version that supports the HSM extension. I tested it on riscv64 QEMU on both an smp and a non-smp system. Signed-off-by: Nick Kossifidis Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 15 +++ arch/riscv/include/asm/kexec.h | 49 ++++++++ arch/riscv/kernel/Makefile | 5 + arch/riscv/kernel/kexec_relocate.S | 157 ++++++++++++++++++++++++ arch/riscv/kernel/machine_kexec.c | 186 +++++++++++++++++++++++++++++ 5 files changed, 412 insertions(+) create mode 100644 arch/riscv/include/asm/kexec.h create mode 100644 arch/riscv/kernel/kexec_relocate.S create mode 100644 arch/riscv/kernel/machine_kexec.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 847ab3c32fc0..e38bd044610f 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -391,6 +391,21 @@ config RISCV_SBI_V01 help This config allows kernel to use SBI v0.1 APIs. This will be deprecated in future once legacy M-mode software are no longer in use. + +config KEXEC + bool "Kexec system call" + select KEXEC_CORE + select HOTPLUG_CPU if SMP + depends on MMU + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is independent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similarity to the exec system call. + + endmenu menu "Boot options" diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h new file mode 100644 index 000000000000..86e6e4922d1f --- /dev/null +++ b/arch/riscv/include/asm/kexec.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 FORTH-ICS/CARV + * Nick Kossifidis + */ + +#ifndef _RISCV_KEXEC_H +#define _RISCV_KEXEC_H + +#include /* For PAGE_SIZE */ + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) + +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) + +/* Maximum address we can use for the control code buffer */ +#define KEXEC_CONTROL_MEMORY_LIMIT (-1UL) + +/* Reserve a page for the control code buffer */ +#define KEXEC_CONTROL_PAGE_SIZE PAGE_SIZE + +#define KEXEC_ARCH KEXEC_ARCH_RISCV + +static inline void +crash_setup_regs(struct pt_regs *newregs, + struct pt_regs *oldregs) +{ + /* Dummy implementation for now */ +} + + +#define ARCH_HAS_KIMAGE_ARCH + +struct kimage_arch { + unsigned long fdt_addr; +}; + +const extern unsigned char riscv_kexec_relocate[]; +const extern unsigned int riscv_kexec_relocate_size; + +typedef void (*riscv_kexec_do_relocate)(unsigned long first_ind_entry, + unsigned long jump_addr, + unsigned long fdt_addr, + unsigned long hartid, + unsigned long va_pa_off); + +#endif diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 3dc0abde988a..96650aaed5eb 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -9,6 +9,10 @@ CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) endif +ifdef CONFIG_KEXEC +AFLAGS_kexec_relocate.o := -mcmodel=medany -mno-relax +endif + extra-y += head.o extra-y += vmlinux.lds @@ -54,6 +58,7 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o endif obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o obj-$(CONFIG_KGDB) += kgdb.o +obj-$(CONFIG_KEXEC) += kexec_relocate.o machine_kexec.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/riscv/kernel/kexec_relocate.S b/arch/riscv/kernel/kexec_relocate.S new file mode 100644 index 000000000000..84101d0a23ef --- /dev/null +++ b/arch/riscv/kernel/kexec_relocate.S @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 FORTH-ICS/CARV + * Nick Kossifidis + */ + +#include /* For RISCV_* and REG_* macros */ +#include /* For CSR_* macros */ +#include /* For PAGE_SIZE */ +#include /* For SYM_* macros */ + +.section ".rodata" +SYM_CODE_START(riscv_kexec_relocate) + + /* + * s0: Pointer to the current entry + * s1: (const) Phys address to jump to after relocation + * s2: (const) Phys address of the FDT image + * s3: (const) The hartid of the current hart + * s4: Pointer to the destination address for the relocation + * s5: (const) Number of words per page + * s6: (const) 1, used for subtraction + * s7: (const) va_pa_offset, used when switching MMU off + * s8: (const) Physical address of the main loop + * s9: (debug) indirection page counter + * s10: (debug) entry counter + * s11: (debug) copied words counter + */ + mv s0, a0 + mv s1, a1 + mv s2, a2 + mv s3, a3 + mv s4, zero + li s5, (PAGE_SIZE / RISCV_SZPTR) + li s6, 1 + mv s7, a4 + mv s8, zero + mv s9, zero + mv s10, zero + mv s11, zero + + /* Disable / cleanup interrupts */ + csrw CSR_SIE, zero + csrw CSR_SIP, zero + + /* + * When we switch SATP.MODE to "Bare" we'll only + * play with physical addresses. However the first time + * we try to jump somewhere, the offset on the jump + * will be relative to pc which will still be on VA. To + * deal with this we set stvec to the physical address at + * the start of the loop below so that we jump there in + * any case. + */ + la s8, 1f + sub s8, s8, s7 + csrw CSR_STVEC, s8 + + /* Process entries in a loop */ +.align 2 +1: + addi s10, s10, 1 + REG_L t0, 0(s0) /* t0 = *image->entry */ + addi s0, s0, RISCV_SZPTR /* image->entry++ */ + + /* IND_DESTINATION entry ? -> save destination address */ + andi t1, t0, 0x1 + beqz t1, 2f + andi s4, t0, ~0x1 + j 1b + +2: + /* IND_INDIRECTION entry ? -> update next entry ptr (PA) */ + andi t1, t0, 0x2 + beqz t1, 2f + andi s0, t0, ~0x2 + addi s9, s9, 1 + csrw CSR_SATP, zero + jalr zero, s8, 0 + +2: + /* IND_DONE entry ? -> jump to done label */ + andi t1, t0, 0x4 + beqz t1, 2f + j 4f + +2: + /* + * IND_SOURCE entry ? -> copy page word by word to the + * destination address we got from IND_DESTINATION + */ + andi t1, t0, 0x8 + beqz t1, 1b /* Unknown entry type, ignore it */ + andi t0, t0, ~0x8 + mv t3, s5 /* i = num words per page */ +3: /* copy loop */ + REG_L t1, (t0) /* t1 = *src_ptr */ + REG_S t1, (s4) /* *dst_ptr = *src_ptr */ + addi t0, t0, RISCV_SZPTR /* stc_ptr++ */ + addi s4, s4, RISCV_SZPTR /* dst_ptr++ */ + sub t3, t3, s6 /* i-- */ + addi s11, s11, 1 /* c++ */ + beqz t3, 1b /* copy done ? */ + j 3b + +4: + /* Pass the arguments to the next kernel / Cleanup*/ + mv a0, s3 + mv a1, s2 + mv a2, s1 + + /* Cleanup */ + mv a3, zero + mv a4, zero + mv a5, zero + mv a6, zero + mv a7, zero + + mv s0, zero + mv s1, zero + mv s2, zero + mv s3, zero + mv s4, zero + mv s5, zero + mv s6, zero + mv s7, zero + mv s8, zero + mv s9, zero + mv s10, zero + mv s11, zero + + mv t0, zero + mv t1, zero + mv t2, zero + mv t3, zero + mv t4, zero + mv t5, zero + mv t6, zero + csrw CSR_SEPC, zero + csrw CSR_SCAUSE, zero + csrw CSR_SSCRATCH, zero + + /* + * Make sure the relocated code is visible + * and jump to the new kernel + */ + fence.i + + jalr zero, a2, 0 + +SYM_CODE_END(riscv_kexec_relocate) +riscv_kexec_relocate_end: + + .section ".rodata" +SYM_DATA(riscv_kexec_relocate_size, + .long riscv_kexec_relocate_end - riscv_kexec_relocate) + diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c new file mode 100644 index 000000000000..e165c7874740 --- /dev/null +++ b/arch/riscv/kernel/machine_kexec.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 FORTH-ICS/CARV + * Nick Kossifidis + */ + +#include +#include /* For riscv_kexec_* symbol defines */ +#include /* For smp_send_stop () */ +#include /* For local_flush_icache_all() */ +#include /* For smp_wmb() */ +#include /* For PAGE_MASK */ +#include /* For fdt_check_header() */ +#include /* For set_memory_x() */ +#include /* For unreachable() */ +#include /* For cpu_down() */ + +/** + * kexec_image_info - Print received image details + */ +static void +kexec_image_info(const struct kimage *image) +{ + unsigned long i; + + pr_debug("Kexec image info:\n"); + pr_debug("\ttype: %d\n", image->type); + pr_debug("\tstart: %lx\n", image->start); + pr_debug("\thead: %lx\n", image->head); + pr_debug("\tnr_segments: %lu\n", image->nr_segments); + + for (i = 0; i < image->nr_segments; i++) { + pr_debug("\t segment[%lu]: %016lx - %016lx", i, + image->segment[i].mem, + image->segment[i].mem + image->segment[i].memsz); + pr_debug("\t\t0x%lx bytes, %lu pages\n", + (unsigned long) image->segment[i].memsz, + (unsigned long) image->segment[i].memsz / PAGE_SIZE); + } +} + +/** + * machine_kexec_prepare - Initialize kexec + * + * This function is called from do_kexec_load, when the user has + * provided us with an image to be loaded. Its goal is to validate + * the image and prepare the control code buffer as needed. + * Note that kimage_alloc_init has already been called and the + * control buffer has already been allocated. + */ +int +machine_kexec_prepare(struct kimage *image) +{ + struct kimage_arch *internal = &image->arch; + struct fdt_header fdt = {0}; + void *control_code_buffer = NULL; + unsigned int control_code_buffer_sz = 0; + int i = 0; + + kexec_image_info(image); + + if (image->type == KEXEC_TYPE_CRASH) { + pr_warn("Loading a crash kernel is unsupported for now.\n"); + return -EINVAL; + } + + /* Find the Flattened Device Tree and save its physical address */ + for (i = 0; i < image->nr_segments; i++) { + if (image->segment[i].memsz <= sizeof(fdt)) + continue; + + if (copy_from_user(&fdt, image->segment[i].buf, sizeof(fdt))) + continue; + + if (fdt_check_header(&fdt)) + continue; + + internal->fdt_addr = (unsigned long) image->segment[i].mem; + break; + } + + if (!internal->fdt_addr) { + pr_err("Device tree not included in the provided image\n"); + return -EINVAL; + } + + /* Copy the assembler code for relocation to the control page */ + control_code_buffer = page_address(image->control_code_page); + control_code_buffer_sz = page_size(image->control_code_page); + if (unlikely(riscv_kexec_relocate_size > control_code_buffer_sz)) { + pr_err("Relocation code doesn't fit within a control page\n"); + return -EINVAL; + } + memcpy(control_code_buffer, riscv_kexec_relocate, + riscv_kexec_relocate_size); + + /* Mark the control page executable */ + set_memory_x((unsigned long) control_code_buffer, 1); + + return 0; +} + + +/** + * machine_kexec_cleanup - Cleanup any leftovers from + * machine_kexec_prepare + * + * This function is called by kimage_free to handle any arch-specific + * allocations done on machine_kexec_prepare. Since we didn't do any + * allocations there, this is just an empty function. Note that the + * control buffer is freed by kimage_free. + */ +void +machine_kexec_cleanup(struct kimage *image) +{ +} + + +/* + * machine_shutdown - Prepare for a kexec reboot + * + * This function is called by kernel_kexec just before machine_kexec + * below. Its goal is to prepare the rest of the system (the other + * harts and possibly devices etc) for a kexec reboot. + */ +void machine_shutdown(void) +{ + /* + * No more interrupts on this hart + * until we are back up. + */ + local_irq_disable(); + +#if defined(CONFIG_HOTPLUG_CPU) + smp_shutdown_nonboot_cpus(smp_processor_id()); +#endif +} + +/** + * machine_crash_shutdown - Prepare to kexec after a kernel crash + * + * This function is called by crash_kexec just before machine_kexec + * below and its goal is similar to machine_shutdown, but in case of + * a kernel crash. Since we don't handle such cases yet, this function + * is empty. + */ +void +machine_crash_shutdown(struct pt_regs *regs) +{ +} + +/** + * machine_kexec - Jump to the loaded kimage + * + * This function is called by kernel_kexec which is called by the + * reboot system call when the reboot cmd is LINUX_REBOOT_CMD_KEXEC, + * or by crash_kernel which is called by the kernel's arch-specific + * trap handler in case of a kernel panic. It's the final stage of + * the kexec process where the pre-loaded kimage is ready to be + * executed. We assume at this point that all other harts are + * suspended and this hart will be the new boot hart. + */ +void __noreturn +machine_kexec(struct kimage *image) +{ + struct kimage_arch *internal = &image->arch; + unsigned long jump_addr = (unsigned long) image->start; + unsigned long first_ind_entry = (unsigned long) &image->head; + unsigned long this_hart_id = raw_smp_processor_id(); + unsigned long fdt_addr = internal->fdt_addr; + void *control_code_buffer = page_address(image->control_code_page); + riscv_kexec_do_relocate do_relocate = control_code_buffer; + + pr_notice("Will call new kernel at %08lx from hart id %lx\n", + jump_addr, this_hart_id); + pr_notice("FDT image at %08lx\n", fdt_addr); + + /* Make sure the relocation code is visible to the hart */ + local_flush_icache_all(); + + /* Jump to the relocation code */ + pr_notice("Bye...\n"); + do_relocate(first_ind_entry, jump_addr, fdt_addr, + this_hart_id, va_pa_offset); + unreachable(); +} From ffe0e526126884cf036a6f724220f1f9b4094fd2 Mon Sep 17 00:00:00 2001 From: Nick Kossifidis Date: Mon, 19 Apr 2021 03:55:37 +0300 Subject: [PATCH 0670/1379] RISC-V: Improve init_resources() The kernel region is always present and we know where it is, no need to look for it inside the loop, just ignore it like the rest of the reserved regions within system's memory. Additionally, we don't need to call memblock_free inside the loop, as if called it'll split the region of pre-allocated resources in two parts, messing things up, just re-use the previous pre-allocated resource and free any unused resources after both loops finish. Signed-off-by: Nick Kossifidis [Palmer: commit text] Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 110 +++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index d208abc0b473..9193ba3a006b 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -60,6 +60,7 @@ static DEFINE_PER_CPU(struct cpu, cpu_devices); * also add "System RAM" regions for compatibility with other * archs, and the rest of the known regions for completeness. */ +static struct resource kimage_res = { .name = "Kernel image", }; static struct resource code_res = { .name = "Kernel code", }; static struct resource data_res = { .name = "Kernel data", }; static struct resource rodata_res = { .name = "Kernel rodata", }; @@ -80,56 +81,16 @@ static int __init add_resource(struct resource *parent, return 1; } -static int __init add_kernel_resources(struct resource *res) +static int __init add_kernel_resources(void) { int ret = 0; /* * The memory region of the kernel image is continuous and - * was reserved on setup_bootmem, find it here and register - * it as a resource, then register the various segments of - * the image as child nodes + * was reserved on setup_bootmem, register it here as a + * resource, with the various segments of the image as + * child nodes. */ - if (!(res->start <= code_res.start && res->end >= data_res.end)) - return 0; - - res->name = "Kernel image"; - res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - - /* - * We removed a part of this region on setup_bootmem so - * we need to expand the resource for the bss to fit in. - */ - res->end = bss_res.end; - - ret = add_resource(&iomem_resource, res); - if (ret < 0) - return ret; - - ret = add_resource(res, &code_res); - if (ret < 0) - return ret; - - ret = add_resource(res, &rodata_res); - if (ret < 0) - return ret; - - ret = add_resource(res, &data_res); - if (ret < 0) - return ret; - - ret = add_resource(res, &bss_res); - - return ret; -} - -static void __init init_resources(void) -{ - struct memblock_region *region = NULL; - struct resource *res = NULL; - struct resource *mem_res = NULL; - size_t mem_res_sz = 0; - int ret = 0, i = 0; code_res.start = __pa_symbol(_text); code_res.end = __pa_symbol(_etext) - 1; @@ -147,35 +108,73 @@ static void __init init_resources(void) bss_res.end = __pa_symbol(__bss_stop) - 1; bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt) * sizeof(*mem_res); + kimage_res.start = code_res.start; + kimage_res.end = bss_res.end; + kimage_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + + ret = add_resource(&iomem_resource, &kimage_res); + if (ret < 0) + return ret; + + ret = add_resource(&kimage_res, &code_res); + if (ret < 0) + return ret; + + ret = add_resource(&kimage_res, &rodata_res); + if (ret < 0) + return ret; + + ret = add_resource(&kimage_res, &data_res); + if (ret < 0) + return ret; + + ret = add_resource(&kimage_res, &bss_res); + + return ret; +} + +static void __init init_resources(void) +{ + struct memblock_region *region = NULL; + struct resource *res = NULL; + struct resource *mem_res = NULL; + size_t mem_res_sz = 0; + int num_resources = 0, res_idx = 0; + int ret = 0; + + /* + 1 as memblock_alloc() might increase memblock.reserved.cnt */ + num_resources = memblock.memory.cnt + memblock.reserved.cnt + 1; + res_idx = num_resources - 1; + + mem_res_sz = num_resources * sizeof(*mem_res); mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES); if (!mem_res) panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz); + /* * Start by adding the reserved regions, if they overlap * with /memory regions, insert_resource later on will take * care of it. */ + ret = add_kernel_resources(); + if (ret < 0) + goto error; + for_each_reserved_mem_region(region) { - res = &mem_res[i++]; + res = &mem_res[res_idx--]; res->name = "Reserved"; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region)); res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1; - ret = add_kernel_resources(res); - if (ret < 0) - goto error; - else if (ret) - continue; - /* * Ignore any other reserved regions within * system memory. */ if (memblock_is_memory(res->start)) { - memblock_free((phys_addr_t) res, sizeof(struct resource)); + /* Re-use this pre-allocated resource */ + res_idx++; continue; } @@ -186,7 +185,7 @@ static void __init init_resources(void) /* Add /memory regions to the resource tree */ for_each_mem_region(region) { - res = &mem_res[i++]; + res = &mem_res[res_idx--]; if (unlikely(memblock_is_nomap(region))) { res->name = "Reserved"; @@ -204,6 +203,9 @@ static void __init init_resources(void) goto error; } + /* Clean-up any unused pre-allocated resources */ + mem_res_sz = (num_resources - res_idx + 1) * sizeof(*mem_res); + memblock_free((phys_addr_t) mem_res, mem_res_sz); return; error: From e53d28180d4d0fd12b6d2bde49cb87aa775b6ba8 Mon Sep 17 00:00:00 2001 From: Nick Kossifidis Date: Mon, 19 Apr 2021 03:55:38 +0300 Subject: [PATCH 0671/1379] RISC-V: Add kdump support This patch adds support for kdump, the kernel will reserve a region for the crash kernel and jump there on panic. In order for userspace tools (kexec-tools) to prepare the crash kernel kexec image, we also need to expose some information on /proc/iomem for the memory regions used by the kernel and for the region reserved for crash kernel. Note that on userspace the device tree is used to determine the system's memory layout so the "System RAM" on /proc/iomem is ignored. I tested this on riscv64 qemu and works as expected, you may test it by triggering a crash through /proc/sysrq_trigger: echo c > /proc/sysrq_trigger Signed-off-by: Nick Kossifidis Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/elf.h | 6 +++ arch/riscv/include/asm/kexec.h | 19 +++++--- arch/riscv/kernel/Makefile | 2 +- arch/riscv/kernel/crash_save_regs.S | 56 +++++++++++++++++++++++ arch/riscv/kernel/kexec_relocate.S | 68 ++++++++++++++++++++++++++- arch/riscv/kernel/machine_kexec.c | 43 +++++++++-------- arch/riscv/kernel/setup.c | 11 ++++- arch/riscv/mm/init.c | 71 +++++++++++++++++++++++++++++ 8 files changed, 249 insertions(+), 27 deletions(-) create mode 100644 arch/riscv/kernel/crash_save_regs.S diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index 5c725e1df58b..f4b490cd0e5d 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -81,4 +81,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #endif /* CONFIG_MMU */ +#define ELF_CORE_COPY_REGS(dest, regs) \ +do { \ + *(struct user_regs_struct *)&(dest) = \ + *(struct user_regs_struct *)regs; \ +} while (0); + #endif /* _ASM_RISCV_ELF_H */ diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h index 86e6e4922d1f..1e954101906a 100644 --- a/arch/riscv/include/asm/kexec.h +++ b/arch/riscv/include/asm/kexec.h @@ -23,11 +23,16 @@ #define KEXEC_ARCH KEXEC_ARCH_RISCV +extern void riscv_crash_save_regs(struct pt_regs *newregs); + static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { - /* Dummy implementation for now */ + if (oldregs) + memcpy(newregs, oldregs, sizeof(struct pt_regs)); + else + riscv_crash_save_regs(newregs); } @@ -40,10 +45,12 @@ struct kimage_arch { const extern unsigned char riscv_kexec_relocate[]; const extern unsigned int riscv_kexec_relocate_size; -typedef void (*riscv_kexec_do_relocate)(unsigned long first_ind_entry, - unsigned long jump_addr, - unsigned long fdt_addr, - unsigned long hartid, - unsigned long va_pa_off); +typedef void (*riscv_kexec_method)(unsigned long first_ind_entry, + unsigned long jump_addr, + unsigned long fdt_addr, + unsigned long hartid, + unsigned long va_pa_off); + +extern riscv_kexec_method riscv_kexec_norelocate; #endif diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 96650aaed5eb..3ee07bf0cea7 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -58,7 +58,7 @@ obj-$(CONFIG_SMP) += cpu_ops_sbi.o endif obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o obj-$(CONFIG_KGDB) += kgdb.o -obj-$(CONFIG_KEXEC) += kexec_relocate.o machine_kexec.o +obj-$(CONFIG_KEXEC) += kexec_relocate.o crash_save_regs.o machine_kexec.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/riscv/kernel/crash_save_regs.S b/arch/riscv/kernel/crash_save_regs.S new file mode 100644 index 000000000000..7832fb763aba --- /dev/null +++ b/arch/riscv/kernel/crash_save_regs.S @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 FORTH-ICS/CARV + * Nick Kossifidis + */ + +#include /* For RISCV_* and REG_* macros */ +#include /* For CSR_* macros */ +#include /* For offsets on pt_regs */ +#include /* For SYM_* macros */ + +.section ".text" +SYM_CODE_START(riscv_crash_save_regs) + REG_S ra, PT_RA(a0) /* x1 */ + REG_S sp, PT_SP(a0) /* x2 */ + REG_S gp, PT_GP(a0) /* x3 */ + REG_S tp, PT_TP(a0) /* x4 */ + REG_S t0, PT_T0(a0) /* x5 */ + REG_S t1, PT_T1(a0) /* x6 */ + REG_S t2, PT_T2(a0) /* x7 */ + REG_S s0, PT_S0(a0) /* x8/fp */ + REG_S s1, PT_S1(a0) /* x9 */ + REG_S a0, PT_A0(a0) /* x10 */ + REG_S a1, PT_A1(a0) /* x11 */ + REG_S a2, PT_A2(a0) /* x12 */ + REG_S a3, PT_A3(a0) /* x13 */ + REG_S a4, PT_A4(a0) /* x14 */ + REG_S a5, PT_A5(a0) /* x15 */ + REG_S a6, PT_A6(a0) /* x16 */ + REG_S a7, PT_A7(a0) /* x17 */ + REG_S s2, PT_S2(a0) /* x18 */ + REG_S s3, PT_S3(a0) /* x19 */ + REG_S s4, PT_S4(a0) /* x20 */ + REG_S s5, PT_S5(a0) /* x21 */ + REG_S s6, PT_S6(a0) /* x22 */ + REG_S s7, PT_S7(a0) /* x23 */ + REG_S s8, PT_S8(a0) /* x24 */ + REG_S s9, PT_S9(a0) /* x25 */ + REG_S s10, PT_S10(a0) /* x26 */ + REG_S s11, PT_S11(a0) /* x27 */ + REG_S t3, PT_T3(a0) /* x28 */ + REG_S t4, PT_T4(a0) /* x29 */ + REG_S t5, PT_T5(a0) /* x30 */ + REG_S t6, PT_T6(a0) /* x31 */ + + csrr t1, CSR_STATUS + csrr t2, CSR_EPC + csrr t3, CSR_TVAL + csrr t4, CSR_CAUSE + + REG_S t1, PT_STATUS(a0) + REG_S t2, PT_EPC(a0) + REG_S t3, PT_BADADDR(a0) + REG_S t4, PT_CAUSE(a0) + ret +SYM_CODE_END(riscv_crash_save_regs) diff --git a/arch/riscv/kernel/kexec_relocate.S b/arch/riscv/kernel/kexec_relocate.S index 84101d0a23ef..88c3beabe9b4 100644 --- a/arch/riscv/kernel/kexec_relocate.S +++ b/arch/riscv/kernel/kexec_relocate.S @@ -151,7 +151,73 @@ SYM_CODE_START(riscv_kexec_relocate) SYM_CODE_END(riscv_kexec_relocate) riscv_kexec_relocate_end: - .section ".rodata" + +/* Used for jumping to crashkernel */ +.section ".text" +SYM_CODE_START(riscv_kexec_norelocate) + /* + * s0: (const) Phys address to jump to + * s1: (const) Phys address of the FDT image + * s2: (const) The hartid of the current hart + * s3: (const) va_pa_offset, used when switching MMU off + */ + mv s0, a1 + mv s1, a2 + mv s2, a3 + mv s3, a4 + + /* Disable / cleanup interrupts */ + csrw CSR_SIE, zero + csrw CSR_SIP, zero + + /* Switch to physical addressing */ + la s4, 1f + sub s4, s4, s3 + csrw CSR_STVEC, s4 + csrw CSR_SATP, zero + +.align 2 +1: + /* Pass the arguments to the next kernel / Cleanup*/ + mv a0, s2 + mv a1, s1 + mv a2, s0 + + /* Cleanup */ + mv a3, zero + mv a4, zero + mv a5, zero + mv a6, zero + mv a7, zero + + mv s0, zero + mv s1, zero + mv s2, zero + mv s3, zero + mv s4, zero + mv s5, zero + mv s6, zero + mv s7, zero + mv s8, zero + mv s9, zero + mv s10, zero + mv s11, zero + + mv t0, zero + mv t1, zero + mv t2, zero + mv t3, zero + mv t4, zero + mv t5, zero + mv t6, zero + csrw CSR_SEPC, zero + csrw CSR_SCAUSE, zero + csrw CSR_SSCRATCH, zero + + jalr zero, a2, 0 +SYM_CODE_END(riscv_kexec_norelocate) + +.section ".rodata" SYM_DATA(riscv_kexec_relocate_size, .long riscv_kexec_relocate_end - riscv_kexec_relocate) diff --git a/arch/riscv/kernel/machine_kexec.c b/arch/riscv/kernel/machine_kexec.c index e165c7874740..cc048143fba5 100644 --- a/arch/riscv/kernel/machine_kexec.c +++ b/arch/riscv/kernel/machine_kexec.c @@ -59,11 +59,6 @@ machine_kexec_prepare(struct kimage *image) kexec_image_info(image); - if (image->type == KEXEC_TYPE_CRASH) { - pr_warn("Loading a crash kernel is unsupported for now.\n"); - return -EINVAL; - } - /* Find the Flattened Device Tree and save its physical address */ for (i = 0; i < image->nr_segments; i++) { if (image->segment[i].memsz <= sizeof(fdt)) @@ -85,17 +80,21 @@ machine_kexec_prepare(struct kimage *image) } /* Copy the assembler code for relocation to the control page */ - control_code_buffer = page_address(image->control_code_page); - control_code_buffer_sz = page_size(image->control_code_page); - if (unlikely(riscv_kexec_relocate_size > control_code_buffer_sz)) { - pr_err("Relocation code doesn't fit within a control page\n"); - return -EINVAL; - } - memcpy(control_code_buffer, riscv_kexec_relocate, - riscv_kexec_relocate_size); + if (image->type != KEXEC_TYPE_CRASH) { + control_code_buffer = page_address(image->control_code_page); + control_code_buffer_sz = page_size(image->control_code_page); - /* Mark the control page executable */ - set_memory_x((unsigned long) control_code_buffer, 1); + if (unlikely(riscv_kexec_relocate_size > control_code_buffer_sz)) { + pr_err("Relocation code doesn't fit within a control page\n"); + return -EINVAL; + } + + memcpy(control_code_buffer, riscv_kexec_relocate, + riscv_kexec_relocate_size); + + /* Mark the control page executable */ + set_memory_x((unsigned long) control_code_buffer, 1); + } return 0; } @@ -147,6 +146,9 @@ void machine_shutdown(void) void machine_crash_shutdown(struct pt_regs *regs) { + crash_save_cpu(regs, smp_processor_id()); + machine_shutdown(); + pr_info("Starting crashdump kernel...\n"); } /** @@ -169,7 +171,12 @@ machine_kexec(struct kimage *image) unsigned long this_hart_id = raw_smp_processor_id(); unsigned long fdt_addr = internal->fdt_addr; void *control_code_buffer = page_address(image->control_code_page); - riscv_kexec_do_relocate do_relocate = control_code_buffer; + riscv_kexec_method kexec_method = NULL; + + if (image->type != KEXEC_TYPE_CRASH) + kexec_method = control_code_buffer; + else + kexec_method = (riscv_kexec_method) &riscv_kexec_norelocate; pr_notice("Will call new kernel at %08lx from hart id %lx\n", jump_addr, this_hart_id); @@ -180,7 +187,7 @@ machine_kexec(struct kimage *image) /* Jump to the relocation code */ pr_notice("Bye...\n"); - do_relocate(first_ind_entry, jump_addr, fdt_addr, - this_hart_id, va_pa_offset); + kexec_method(first_ind_entry, jump_addr, fdt_addr, + this_hart_id, va_pa_offset); unreachable(); } diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 9193ba3a006b..0aa9f8340115 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -160,6 +161,14 @@ static void __init init_resources(void) if (ret < 0) goto error; +#ifdef CONFIG_KEXEC_CORE + if (crashk_res.start != crashk_res.end) { + ret = add_resource(&iomem_resource, &crashk_res); + if (ret < 0) + goto error; + } +#endif + for_each_reserved_mem_region(region) { res = &mem_res[res_idx--]; @@ -252,7 +261,6 @@ void __init setup_arch(char **cmdline_p) efi_init(); setup_bootmem(); paging_init(); - init_resources(); #if IS_ENABLED(CONFIG_BUILTIN_DTB) unflatten_and_copy_device_tree(); #else @@ -263,6 +271,7 @@ void __init setup_arch(char **cmdline_p) #endif misc_mem_init(); + init_resources(); sbi_init(); if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index a2f4c148f02e..63c94d33e18a 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -2,6 +2,8 @@ /* * Copyright (C) 2012 Regents of the University of California * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * Copyright (C) 2020 FORTH-ICS/CARV + * Nick Kossifidis */ #include @@ -14,6 +16,7 @@ #include #include #include +#include #include #include @@ -658,6 +661,71 @@ void mark_rodata_ro(void) } #endif +#ifdef CONFIG_KEXEC_CORE +/* + * reserve_crashkernel() - reserves memory for crash kernel + * + * This function reserves memory area given in "crashkernel=" kernel command + * line parameter. The memory reserved is used by dump capture kernel when + * primary kernel is crashing. + */ +static void __init reserve_crashkernel(void) +{ + unsigned long long crash_base = 0; + unsigned long long crash_size = 0; + unsigned long search_start = memblock_start_of_DRAM(); + unsigned long search_end = memblock_end_of_DRAM(); + + int ret = 0; + + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &crash_size, &crash_base); + if (ret || !crash_size) + return; + + crash_size = PAGE_ALIGN(crash_size); + + if (crash_base == 0) { + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 (hugepage size) + */ + crash_base = memblock_find_in_range(search_start, search_end, + crash_size, PMD_SIZE); + + if (crash_base == 0) { + pr_warn("crashkernel: couldn't allocate %lldKB\n", + crash_size >> 10); + return; + } + } else { + /* User specifies base address explicitly. */ + if (!memblock_is_region_memory(crash_base, crash_size)) { + pr_warn("crashkernel: requested region is not memory\n"); + return; + } + + if (memblock_is_region_reserved(crash_base, crash_size)) { + pr_warn("crashkernel: requested region is reserved\n"); + return; + } + + + if (!IS_ALIGNED(crash_base, PMD_SIZE)) { + pr_warn("crashkernel: requested region is misaligned\n"); + return; + } + } + memblock_reserve(crash_base, crash_size); + + pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} +#endif /* CONFIG_KEXEC_CORE */ + void __init paging_init(void) { setup_vm_final(); @@ -670,6 +738,9 @@ void __init misc_mem_init(void) arch_numa_init(); sparse_init(); zone_sizes_init(); +#ifdef CONFIG_KEXEC_CORE + reserve_crashkernel(); +#endif memblock_dump_all(); } From 5640975003d0234da08559677e22ec25b9cb3267 Mon Sep 17 00:00:00 2001 From: Nick Kossifidis Date: Mon, 19 Apr 2021 03:55:39 +0300 Subject: [PATCH 0672/1379] RISC-V: Add crash kernel support This patch allows Linux to act as a crash kernel for use with kdump. Userspace will let the crash kernel know about the memory region it can use through linux,usable-memory property on the /memory node (overriding its reg property), and about the memory region where the elf core header of the previous kernel is saved, through a reserved-memory node with a compatible string of "linux,elfcorehdr". This approach is the least invasive and re-uses functionality already present. I tested this on riscv64 qemu and it works as expected, you may test it by retrieving the dmesg of the previous kernel through /proc/vmcore, using the vmcore-dmesg utility from kexec-tools. Signed-off-by: Nick Kossifidis Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 10 ++++++++ arch/riscv/kernel/Makefile | 1 + arch/riscv/kernel/crash_dump.c | 46 ++++++++++++++++++++++++++++++++++ arch/riscv/kernel/setup.c | 12 +++++++++ arch/riscv/mm/init.c | 33 ++++++++++++++++++++++++ 5 files changed, 102 insertions(+) create mode 100644 arch/riscv/kernel/crash_dump.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index e38bd044610f..61514e0d268e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -405,6 +405,16 @@ config KEXEC The name comes from the similarity to the exec system call. +config CRASH_DUMP + bool "Build kdump crash kernel" + help + Generate crash dump after being started by kexec. This should + be normally only set in special crash dump kernels which are + loaded in the main kernel with kexec-tools into a specially + reserved region and then later executed after a crash by + kdump/kexec. + + For more details see Documentation/admin-guide/kdump/kdump.rst endmenu diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 3ee07bf0cea7..56d5cd2a5982 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -59,6 +59,7 @@ endif obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC) += kexec_relocate.o crash_save_regs.o machine_kexec.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o diff --git a/arch/riscv/kernel/crash_dump.c b/arch/riscv/kernel/crash_dump.c new file mode 100644 index 000000000000..86cc0ada5752 --- /dev/null +++ b/arch/riscv/kernel/crash_dump.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code comes from arch/arm64/kernel/crash_dump.c + * Created by: AKASHI Takahiro + * Copyright (C) 2017 Linaro Limited + */ + +#include +#include + +/** + * copy_oldmem_page() - copy one page from old kernel memory + * @pfn: page frame number to be copied + * @buf: buffer where the copied page is placed + * @csize: number of bytes to copy + * @offset: offset in bytes into the page + * @userbuf: if set, @buf is in a user address space + * + * This function copies one page from old kernel memory into buffer pointed by + * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes + * copied or negative error in case of failure. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, + int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB); + if (!vaddr) + return -ENOMEM; + + if (userbuf) { + if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { + memunmap(vaddr); + return -EFAULT; + } + } else + memcpy(buf, vaddr + offset, csize); + + memunmap(vaddr); + return csize; +} diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 0aa9f8340115..932ef73cf622 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -66,6 +66,9 @@ static struct resource code_res = { .name = "Kernel code", }; static struct resource data_res = { .name = "Kernel data", }; static struct resource rodata_res = { .name = "Kernel rodata", }; static struct resource bss_res = { .name = "Kernel bss", }; +#ifdef CONFIG_CRASH_DUMP +static struct resource elfcorehdr_res = { .name = "ELF Core hdr", }; +#endif static int __init add_resource(struct resource *parent, struct resource *res) @@ -169,6 +172,15 @@ static void __init init_resources(void) } #endif +#ifdef CONFIG_CRASH_DUMP + if (elfcorehdr_size > 0) { + elfcorehdr_res.start = elfcorehdr_addr; + elfcorehdr_res.end = elfcorehdr_addr + elfcorehdr_size - 1; + elfcorehdr_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + add_resource(&iomem_resource, &elfcorehdr_res); + } +#endif + for_each_reserved_mem_region(region) { res = &mem_res[res_idx--]; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 63c94d33e18a..3cb4c1b6cee1 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -678,6 +679,18 @@ static void __init reserve_crashkernel(void) int ret = 0; + /* + * Don't reserve a region for a crash kernel on a crash kernel + * since it doesn't make much sense and we have limited memory + * resources. + */ +#ifdef CONFIG_CRASH_DUMP + if (is_kdump_kernel()) { + pr_info("crashkernel: ignoring reservation request\n"); + return; + } +#endif + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); if (ret || !crash_size) @@ -726,6 +739,26 @@ static void __init reserve_crashkernel(void) } #endif /* CONFIG_KEXEC_CORE */ +#ifdef CONFIG_CRASH_DUMP +/* + * We keep track of the ELF core header of the crashed + * kernel with a reserved-memory region with compatible + * string "linux,elfcorehdr". Here we register a callback + * to populate elfcorehdr_addr/size when this region is + * present. Note that this region will be marked as + * reserved once we call early_init_fdt_scan_reserved_mem() + * later on. + */ +static int elfcore_hdr_setup(struct reserved_mem *rmem) +{ + elfcorehdr_addr = rmem->base; + elfcorehdr_size = rmem->size; + return 0; +} + +RESERVEDMEM_OF_DECLARE(elfcorehdr, "linux,elfcorehdr", elfcore_hdr_setup); +#endif + void __init paging_init(void) { setup_vm_final(); From 44c922572952d89a1ed15764f2b373ba62692865 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Tue, 13 Apr 2021 02:35:14 -0400 Subject: [PATCH 0673/1379] RISC-V: enable XIP Introduce XIP (eXecute In Place) support for RISC-V platforms. It allows code to be executed directly from non-volatile storage directly addressable by the CPU, such as QSPI NOR flash which can be found on many RISC-V platforms. This makes way for significant optimization of RAM footprint. The XIP kernel is not compressed since it has to run directly from flash, so it will occupy more space on the non-volatile storage. The physical flash address used to link the kernel object files and for storing it has to be known at compile time and is represented by a Kconfig option. XIP on RISC-V will for the time being only work on MMU-enabled kernels. Signed-off-by: Vitaly Wool [Alex: Rebase on top of "Move kernel mapping outside the linear mapping" ] Signed-off-by: Alexandre Ghiti [Palmer: disable XIP for allyesconfig] Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 60 ++++++++++++- arch/riscv/Makefile | 8 +- arch/riscv/boot/Makefile | 13 +++ arch/riscv/include/asm/page.h | 21 +++++ arch/riscv/include/asm/pgtable.h | 25 +++++- arch/riscv/kernel/head.S | 46 +++++++++- arch/riscv/kernel/head.h | 3 + arch/riscv/kernel/setup.c | 11 ++- arch/riscv/kernel/vmlinux-xip.lds.S | 133 ++++++++++++++++++++++++++++ arch/riscv/kernel/vmlinux.lds.S | 6 ++ arch/riscv/mm/init.c | 117 ++++++++++++++++++++++-- 11 files changed, 424 insertions(+), 19 deletions(-) create mode 100644 arch/riscv/kernel/vmlinux-xip.lds.S diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 61514e0d268e..6ddf5a2eb84d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -28,8 +28,8 @@ config RISCV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY - select ARCH_HAS_STRICT_KERNEL_RWX if MMU - select ARCH_HAS_STRICT_MODULE_RWX if MMU + select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL + select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT @@ -468,7 +468,7 @@ config EFI_STUB config EFI bool "UEFI runtime support" - depends on OF + depends on OF && !XIP_KERNEL select LIBFDT select UCS2_STRING select EFI_PARAMS_FROM_FDT @@ -492,11 +492,63 @@ config STACKPROTECTOR_PER_TASK def_bool y depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_TLS +config PHYS_RAM_BASE_FIXED + bool "Explicitly specified physical RAM address" + default n + +config PHYS_RAM_BASE + hex "Platform Physical RAM address" + depends on PHYS_RAM_BASE_FIXED + default "0x80000000" + help + This is the physical address of RAM in the system. It has to be + explicitly specified to run early relocations of read-write data + from flash to RAM. + +config XIP_KERNEL + bool "Kernel Execute-In-Place from ROM" + depends on MMU && SPARSEMEM + # This prevents XIP from being enabled by all{yes,mod}config, which + # fail to build since XIP doesn't support large kernels. + depends on !COMPILE_TEST + select PHYS_RAM_BASE_FIXED + help + Execute-In-Place allows the kernel to run from non-volatile storage + directly addressable by the CPU, such as NOR flash. This saves RAM + space since the text section of the kernel is not loaded from flash + to RAM. Read-write sections, such as the data section and stack, + are still copied to RAM. The XIP kernel is not compressed since + it has to run directly from flash, so it will take more space to + store it. The flash address used to link the kernel object files, + and for storing it, is configuration dependent. Therefore, if you + say Y here, you must know the proper physical address where to + store the kernel image depending on your own flash memory usage. + + Also note that the make target becomes "make xipImage" rather than + "make zImage" or "make Image". The final kernel binary to put in + ROM memory will be arch/riscv/boot/xipImage. + + SPARSEMEM is required because the kernel text and rodata that are + flash resident are not backed by memmap, then any attempt to get + a struct page on those regions will trigger a fault. + + If unsure, say N. + +config XIP_PHYS_ADDR + hex "XIP Kernel Physical Location" + depends on XIP_KERNEL + default "0x21000000" + help + This is the physical address in your flash memory the kernel will + be linked for and stored to. This address is dependent on your + own flash usage. + endmenu config BUILTIN_DTB - def_bool n + bool depends on OF + default y if XIP_KERNEL menu "Power management options" diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 1f5c03082976..3eb9590a0775 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -82,7 +82,11 @@ CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS) # Default target when executing plain make boot := arch/riscv/boot +ifeq ($(CONFIG_XIP_KERNEL),y) +KBUILD_IMAGE := $(boot)/xipImage +else KBUILD_IMAGE := $(boot)/Image.gz +endif head-y := arch/riscv/kernel/head.o @@ -96,12 +100,14 @@ PHONY += vdso_install vdso_install: $(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@ +ifneq ($(CONFIG_XIP_KERNEL),y) ifeq ($(CONFIG_RISCV_M_MODE)$(CONFIG_SOC_CANAAN),yy) KBUILD_IMAGE := $(boot)/loader.bin else KBUILD_IMAGE := $(boot)/Image.gz endif -BOOT_TARGETS := Image Image.gz loader loader.bin +endif +BOOT_TARGETS := Image Image.gz loader loader.bin xipImage all: $(notdir $(KBUILD_IMAGE)) diff --git a/arch/riscv/boot/Makefile b/arch/riscv/boot/Makefile index 03404c84f971..6bf299f70c27 100644 --- a/arch/riscv/boot/Makefile +++ b/arch/riscv/boot/Makefile @@ -17,8 +17,21 @@ KCOV_INSTRUMENT := n OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S +OBJCOPYFLAGS_xipImage :=-O binary -R .note -R .note.gnu.build-id -R .comment -S targets := Image Image.* loader loader.o loader.lds loader.bin +targets := Image Image.* loader loader.o loader.lds loader.bin xipImage + +ifeq ($(CONFIG_XIP_KERNEL),y) + +quiet_cmd_mkxip = $(quiet_cmd_objcopy) +cmd_mkxip = $(cmd_objcopy) + +$(obj)/xipImage: vmlinux FORCE + $(call if_changed,mkxip) + @$(kecho) ' Physical Address of xipImage: $(CONFIG_XIP_PHYS_ADDR)' + +endif $(obj)/Image: vmlinux FORCE $(call if_changed,objcopy) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index f64b61296c0c..e280ba60cb34 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -93,6 +93,9 @@ extern unsigned long va_pa_offset; #ifdef CONFIG_64BIT extern unsigned long va_kernel_pa_offset; #endif +#ifdef CONFIG_XIP_KERNEL +extern unsigned long va_kernel_xip_pa_offset; +#endif extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #else @@ -107,11 +110,29 @@ extern unsigned long pfn_base; extern unsigned long kernel_virt_addr; #define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + va_pa_offset)) +#ifdef CONFIG_XIP_KERNEL +#define kernel_mapping_pa_to_va(y) ({ \ + unsigned long _y = y; \ + (_y >= CONFIG_PHYS_RAM_BASE) ? \ + (void *)((unsigned long)(_y) + va_kernel_pa_offset + XIP_OFFSET) : \ + (void *)((unsigned long)(_y) + va_kernel_xip_pa_offset); \ + }) +#else #define kernel_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + va_kernel_pa_offset)) +#endif #define __pa_to_va_nodebug(x) linear_mapping_pa_to_va(x) #define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#ifdef CONFIG_XIP_KERNEL +#define kernel_mapping_va_to_pa(y) ({ \ + unsigned long _y = y; \ + (_y < kernel_virt_addr + XIP_OFFSET) ? \ + ((unsigned long)(_y) - va_kernel_xip_pa_offset) : \ + ((unsigned long)(_y) - va_kernel_pa_offset - XIP_OFFSET); \ + }) +#else #define kernel_mapping_va_to_pa(x) ((unsigned long)(x) - va_kernel_pa_offset) +#endif #define __va_to_pa_nodebug(x) ({ \ unsigned long _x = x; \ (_x < kernel_virt_addr) ? \ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 5afda75cc2c3..2f1384e14e31 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -72,6 +72,19 @@ #define FIXADDR_SIZE PGDIR_SIZE #endif #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + +#ifdef CONFIG_XIP_KERNEL +#define XIP_OFFSET SZ_8M +#define XIP_FIXUP(addr) ({ \ + uintptr_t __a = (uintptr_t)(addr); \ + (__a >= CONFIG_XIP_PHYS_ADDR && __a < CONFIG_XIP_PHYS_ADDR + SZ_16M) ? \ + __a - CONFIG_XIP_PHYS_ADDR + CONFIG_PHYS_RAM_BASE - XIP_OFFSET :\ + __a; \ + }) +#else +#define XIP_FIXUP(addr) (addr) +#endif /* CONFIG_XIP_KERNEL */ + #endif #ifndef __ASSEMBLY__ @@ -507,8 +520,16 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define kern_addr_valid(addr) (1) /* FIXME */ extern char _start[]; -extern void *dtb_early_va; -extern uintptr_t dtb_early_pa; +extern void *_dtb_early_va; +extern uintptr_t _dtb_early_pa; +#if defined(CONFIG_XIP_KERNEL) && defined(CONFIG_MMU) +#define dtb_early_va (*(void **)XIP_FIXUP(&_dtb_early_va)) +#define dtb_early_pa (*(uintptr_t *)XIP_FIXUP(&_dtb_early_pa)) +#else +#define dtb_early_va _dtb_early_va +#define dtb_early_pa _dtb_early_pa +#endif /* CONFIG_XIP_KERNEL */ + void setup_bootmem(void); void paging_init(void); void misc_mem_init(void); diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 6cb05f22e52a..89cc58ab52b4 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -9,11 +9,23 @@ #include #include #include +#include #include #include #include #include "efi-header.S" +#ifdef CONFIG_XIP_KERNEL +.macro XIP_FIXUP_OFFSET reg + REG_L t0, _xip_fixup + add \reg, \reg, t0 +.endm +_xip_fixup: .dword CONFIG_PHYS_RAM_BASE - CONFIG_XIP_PHYS_ADDR - XIP_OFFSET +#else +.macro XIP_FIXUP_OFFSET reg +.endm +#endif /* CONFIG_XIP_KERNEL */ + __HEAD ENTRY(_start) /* @@ -70,6 +82,7 @@ pe_head_start: relocate: /* Relocate return address */ la a1, kernel_virt_addr + XIP_FIXUP_OFFSET a1 REG_L a1, 0(a1) la a2, _start sub a1, a1, a2 @@ -92,6 +105,7 @@ relocate: * to ensure the new translations are in use. */ la a0, trampoline_pg_dir + XIP_FIXUP_OFFSET a0 srl a0, a0, PAGE_SHIFT or a0, a0, a1 sfence.vma @@ -145,7 +159,9 @@ secondary_start_sbi: slli a3, a0, LGREG la a4, __cpu_up_stack_pointer + XIP_FIXUP_OFFSET a4 la a5, __cpu_up_task_pointer + XIP_FIXUP_OFFSET a5 add a4, a3, a4 add a5, a3, a5 REG_L sp, (a4) @@ -157,6 +173,7 @@ secondary_start_common: #ifdef CONFIG_MMU /* Enable virtual memory and relocate to virtual address */ la a0, swapper_pg_dir + XIP_FIXUP_OFFSET a0 call relocate #endif call setup_trap_vector @@ -237,12 +254,33 @@ pmp_done: .Lgood_cores: #endif +#ifndef CONFIG_XIP_KERNEL /* Pick one hart to run the main boot sequence */ la a3, hart_lottery li a2, 1 amoadd.w a3, a2, (a3) bnez a3, .Lsecondary_start +#else + /* hart_lottery in flash contains a magic number */ + la a3, hart_lottery + mv a2, a3 + XIP_FIXUP_OFFSET a2 + lw t1, (a3) + amoswap.w t0, t1, (a2) + /* first time here if hart_lottery in RAM is not set */ + beq t0, t1, .Lsecondary_start + + la sp, _end + THREAD_SIZE + XIP_FIXUP_OFFSET sp + mv s0, a0 + call __copy_data + + /* Restore a0 copy */ + mv a0, s0 +#endif + +#ifndef CONFIG_XIP_KERNEL /* Clear BSS for flat non-ELF images */ la a3, __bss_start la a4, __bss_stop @@ -252,15 +290,18 @@ clear_bss: add a3, a3, RISCV_SZPTR blt a3, a4, clear_bss clear_bss_done: - +#endif /* Save hart ID and DTB physical address */ mv s0, a0 mv s1, a1 + la a2, boot_cpu_hartid + XIP_FIXUP_OFFSET a2 REG_S a0, (a2) /* Initialize page tables and relocate to virtual addresses */ la sp, init_thread_union + THREAD_SIZE + XIP_FIXUP_OFFSET sp #ifdef CONFIG_BUILTIN_DTB la a0, __dtb_start #else @@ -269,6 +310,7 @@ clear_bss_done: call setup_vm #ifdef CONFIG_MMU la a0, early_pg_dir + XIP_FIXUP_OFFSET a0 call relocate #endif /* CONFIG_MMU */ @@ -293,7 +335,9 @@ clear_bss_done: slli a3, a0, LGREG la a1, __cpu_up_stack_pointer + XIP_FIXUP_OFFSET a1 la a2, __cpu_up_task_pointer + XIP_FIXUP_OFFSET a2 add a1, a3, a1 add a2, a3, a2 diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index b48dda3d04f6..aabbc3ac3e48 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -12,6 +12,9 @@ extern atomic_t hart_lottery; asmlinkage void do_page_fault(struct pt_regs *regs); asmlinkage void __init setup_vm(uintptr_t dtb_pa); +#ifdef CONFIG_XIP_KERNEL +asmlinkage void __init __copy_data(void); +#endif extern void *__cpu_up_stack_pointer[]; extern void *__cpu_up_task_pointer[]; diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 932ef73cf622..7b31779101f6 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -51,7 +52,11 @@ struct screen_info screen_info __section(".data") = { * This is used before the kernel initializes the BSS so it can't be in the * BSS. */ -atomic_t hart_lottery __section(".sdata"); +atomic_t hart_lottery __section(".sdata") +#ifdef CONFIG_XIP_KERNEL += ATOMIC_INIT(0xC001BEEF) +#endif +; unsigned long boot_cpu_hartid; static DEFINE_PER_CPU(struct cpu, cpu_devices); @@ -276,7 +281,7 @@ void __init setup_arch(char **cmdline_p) #if IS_ENABLED(CONFIG_BUILTIN_DTB) unflatten_and_copy_device_tree(); #else - if (early_init_dt_verify(__va(dtb_early_pa))) + if (early_init_dt_verify(__va(XIP_FIXUP(dtb_early_pa)))) unflatten_device_tree(); else pr_err("No DTB found in kernel mappings\n"); @@ -288,7 +293,7 @@ void __init setup_arch(char **cmdline_p) if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { protect_kernel_text_data(); -#if defined(CONFIG_64BIT) && defined(CONFIG_MMU) +#if defined(CONFIG_64BIT) && defined(CONFIG_MMU) && !defined(CONFIG_XIP_KERNEL) protect_kernel_linear_mapping_text_rodata(); #endif } diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S new file mode 100644 index 000000000000..4b29b9917f99 --- /dev/null +++ b/arch/riscv/kernel/vmlinux-xip.lds.S @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 Regents of the University of California + * Copyright (C) 2017 SiFive + * Copyright (C) 2020 Vitaly Wool, Konsulko AB + */ + +#include +#define LOAD_OFFSET KERNEL_LINK_ADDR +/* No __ro_after_init data in the .rodata section - which will always be ro */ +#define RO_AFTER_INIT_DATA + +#include +#include +#include +#include +#include + +OUTPUT_ARCH(riscv) +ENTRY(_start) + +jiffies = jiffies_64; + +SECTIONS +{ + /* Beginning of code and text segment */ + . = LOAD_OFFSET; + _xiprom = .; + _start = .; + HEAD_TEXT_SECTION + INIT_TEXT_SECTION(PAGE_SIZE) + /* we have to discard exit text and such at runtime, not link time */ + .exit.text : + { + EXIT_TEXT + } + + .text : { + _text = .; + _stext = .; + TEXT_TEXT + SCHED_TEXT + CPUIDLE_TEXT + LOCK_TEXT + KPROBES_TEXT + ENTRY_TEXT + IRQENTRY_TEXT + SOFTIRQENTRY_TEXT + *(.fixup) + _etext = .; + } + RO_DATA(L1_CACHE_BYTES) + .srodata : { + *(.srodata*) + } + .init.rodata : { + INIT_SETUP(16) + INIT_CALLS + CON_INITCALL + INIT_RAM_FS + } + _exiprom = .; /* End of XIP ROM area */ + + +/* + * From this point, stuff is considered writable and will be copied to RAM + */ + __data_loc = ALIGN(16); /* location in file */ + . = LOAD_OFFSET + XIP_OFFSET; /* location in memory */ + + _sdata = .; /* Start of data section */ + _data = .; + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + _edata = .; + __start_ro_after_init = .; + .data.ro_after_init : AT(ADDR(.data.ro_after_init) - LOAD_OFFSET) { + *(.data..ro_after_init) + } + __end_ro_after_init = .; + + . = ALIGN(PAGE_SIZE); + __init_begin = .; + .init.data : { + INIT_DATA + } + .exit.data : { + EXIT_DATA + } + . = ALIGN(8); + __soc_early_init_table : { + __soc_early_init_table_start = .; + KEEP(*(__soc_early_init_table)) + __soc_early_init_table_end = .; + } + __soc_builtin_dtb_table : { + __soc_builtin_dtb_table_start = .; + KEEP(*(__soc_builtin_dtb_table)) + __soc_builtin_dtb_table_end = .; + } + PERCPU_SECTION(L1_CACHE_BYTES) + + . = ALIGN(PAGE_SIZE); + __init_end = .; + + .sdata : { + __global_pointer$ = . + 0x800; + *(.sdata*) + *(.sbss*) + } + + BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) + EXCEPTION_TABLE(0x10) + + .rel.dyn : AT(ADDR(.rel.dyn) - LOAD_OFFSET) { + *(.rel.dyn*) + } + + /* + * End of copied data. We need a dummy section to get its LMA. + * Also located before final ALIGN() as trailing padding is not stored + * in the resulting binary file and useless to copy. + */ + .data.endmark : AT(ADDR(.data.endmark) - LOAD_OFFSET) { } + _edata_loc = LOADADDR(.data.endmark); + + . = ALIGN(PAGE_SIZE); + _end = .; + + STABS_DEBUG + DWARF_DEBUG + + DISCARDS +} diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 56677137c85b..891742ff75a7 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -4,8 +4,13 @@ * Copyright (C) 2017 SiFive */ +#ifdef CONFIG_XIP_KERNEL +#include "vmlinux-xip.lds.S" +#else + #include #define LOAD_OFFSET KERNEL_LINK_ADDR + #include #include #include @@ -140,3 +145,4 @@ SECTIONS DISCARDS } +#endif /* CONFIG_XIP_KERNEL */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 3cb4c1b6cee1..788eb222deac 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -31,6 +31,9 @@ unsigned long kernel_virt_addr = KERNEL_LINK_ADDR; EXPORT_SYMBOL(kernel_virt_addr); +#ifdef CONFIG_XIP_KERNEL +#define kernel_virt_addr (*((unsigned long *)XIP_FIXUP(&kernel_virt_addr))) +#endif unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; @@ -38,8 +41,8 @@ EXPORT_SYMBOL(empty_zero_page); extern char _start[]; #define DTB_EARLY_BASE_VA PGDIR_SIZE -void *dtb_early_va __initdata; -uintptr_t dtb_early_pa __initdata; +void *_dtb_early_va __initdata; +uintptr_t _dtb_early_pa __initdata; struct pt_alloc_ops { pte_t *(*get_pte_virt)(phys_addr_t pa); @@ -124,6 +127,10 @@ void __init setup_bootmem(void) phys_addr_t dram_end = memblock_end_of_DRAM(); phys_addr_t max_mapped_addr = __pa(~(ulong)0); +#ifdef CONFIG_XIP_KERNEL + vmlinux_start = __pa_symbol(&_sdata); +#endif + /* The maximal physical memory size is -PAGE_OFFSET. */ memblock_enforce_memory_limit(-PAGE_OFFSET); @@ -165,17 +172,41 @@ void __init setup_bootmem(void) memblock_allow_resize(); } +#ifdef CONFIG_XIP_KERNEL + +extern char _xiprom[], _exiprom[]; +extern char _sdata[], _edata[]; + +#endif /* CONFIG_XIP_KERNEL */ + #ifdef CONFIG_MMU -static struct pt_alloc_ops pt_ops __ro_after_init; +static struct pt_alloc_ops _pt_ops __ro_after_init; + +#ifdef CONFIG_XIP_KERNEL +#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) +#else +#define pt_ops _pt_ops +#endif /* Offset between linear mapping virtual address and kernel load address */ unsigned long va_pa_offset __ro_after_init; EXPORT_SYMBOL(va_pa_offset); -#ifdef CONFIG_64BIT +#ifdef CONFIG_XIP_KERNEL +#define va_pa_offset (*((unsigned long *)XIP_FIXUP(&va_pa_offset))) +#endif /* Offset between kernel mapping virtual address and kernel load address */ +#ifdef CONFIG_64BIT unsigned long va_kernel_pa_offset; EXPORT_SYMBOL(va_kernel_pa_offset); #endif +#ifdef CONFIG_XIP_KERNEL +#define va_kernel_pa_offset (*((unsigned long *)XIP_FIXUP(&va_kernel_pa_offset))) +#endif +unsigned long va_kernel_xip_pa_offset; +EXPORT_SYMBOL(va_kernel_xip_pa_offset); +#ifdef CONFIG_XIP_KERNEL +#define va_kernel_xip_pa_offset (*((unsigned long *)XIP_FIXUP(&va_kernel_xip_pa_offset))) +#endif unsigned long pfn_base __ro_after_init; EXPORT_SYMBOL(pfn_base); @@ -185,6 +216,12 @@ pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) +#define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte)) +#define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) +#endif /* CONFIG_XIP_KERNEL */ + void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) { unsigned long addr = __fix_to_virt(idx); @@ -260,6 +297,12 @@ pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss; pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); pmd_t early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pmd ((pmd_t *)XIP_FIXUP(trampoline_pmd)) +#define fixmap_pmd ((pmd_t *)XIP_FIXUP(fixmap_pmd)) +#define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) +#endif /* CONFIG_XIP_KERNEL */ + static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { /* Before MMU is enabled */ @@ -376,6 +419,19 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) return PMD_SIZE; } +#ifdef CONFIG_XIP_KERNEL +/* called from head.S with MMU off */ +asmlinkage void __init __copy_data(void) +{ + void *from = (void *)(&_sdata); + void *end = (void *)(&_end); + void *to = (void *)CONFIG_PHYS_RAM_BASE; + size_t sz = (size_t)(end - from + 1); + + memcpy(to, from, sz); +} +#endif + /* * setup_vm() is called from head.S with MMU-off. * @@ -395,7 +451,35 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) #endif uintptr_t load_pa, load_sz; +#ifdef CONFIG_XIP_KERNEL +#define load_pa (*((uintptr_t *)XIP_FIXUP(&load_pa))) +#define load_sz (*((uintptr_t *)XIP_FIXUP(&load_sz))) +#endif +#ifdef CONFIG_XIP_KERNEL +uintptr_t xiprom, xiprom_sz; +#define xiprom_sz (*((uintptr_t *)XIP_FIXUP(&xiprom_sz))) +#define xiprom (*((uintptr_t *)XIP_FIXUP(&xiprom))) + +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) +{ + uintptr_t va, end_va; + + /* Map the flash resident part */ + end_va = kernel_virt_addr + xiprom_sz; + for (va = kernel_virt_addr; va < end_va; va += map_size) + create_pgd_mapping(pgdir, va, + xiprom + (va - kernel_virt_addr), + map_size, PAGE_KERNEL_EXEC); + + /* Map the data in RAM */ + end_va = kernel_virt_addr + XIP_OFFSET + load_sz; + for (va = kernel_virt_addr + XIP_OFFSET; va < end_va; va += map_size) + create_pgd_mapping(pgdir, va, + load_pa + (va - (kernel_virt_addr + XIP_OFFSET)), + map_size, PAGE_KERNEL); +} +#else static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) { uintptr_t va, end_va; @@ -406,16 +490,28 @@ static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) load_pa + (va - kernel_virt_addr), map_size, PAGE_KERNEL_EXEC); } +#endif asmlinkage void __init setup_vm(uintptr_t dtb_pa) { - uintptr_t pa; + uintptr_t __maybe_unused pa; uintptr_t map_size; #ifndef __PAGETABLE_PMD_FOLDED pmd_t fix_bmap_spmd, fix_bmap_epmd; #endif + +#ifdef CONFIG_XIP_KERNEL + xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; + xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom); + + load_pa = (uintptr_t)CONFIG_PHYS_RAM_BASE; + load_sz = (uintptr_t)(&_end) - (uintptr_t)(&_sdata); + + va_kernel_xip_pa_offset = kernel_virt_addr - xiprom; +#else load_pa = (uintptr_t)(&_start); load_sz = (uintptr_t)(&_end) - load_pa; +#endif va_pa_offset = PAGE_OFFSET - load_pa; #ifdef CONFIG_64BIT @@ -451,8 +547,13 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) /* Setup trampoline PGD and PMD */ create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); +#ifdef CONFIG_XIP_KERNEL + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, + xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); +#else create_pmd_mapping(trampoline_pmd, kernel_virt_addr, load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); +#endif #else /* Setup trampoline PGD */ create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, @@ -485,7 +586,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) * whereas dtb_early_va will be used before setup_vm_final installs * the linear mapping. */ - dtb_early_va = kernel_mapping_pa_to_va(dtb_pa); + dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); #else dtb_early_va = __va(dtb_pa); #endif /* CONFIG_64BIT */ @@ -501,7 +602,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) dtb_early_va = (void *)DTB_EARLY_BASE_VA + (dtb_pa & (PGDIR_SIZE - 1)); #else /* CONFIG_BUILTIN_DTB */ #ifdef CONFIG_64BIT - dtb_early_va = kernel_mapping_pa_to_va(dtb_pa); + dtb_early_va = kernel_mapping_pa_to_va(XIP_FIXUP(dtb_pa)); #else dtb_early_va = __va(dtb_pa); #endif /* CONFIG_64BIT */ @@ -540,7 +641,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) #endif } -#ifdef CONFIG_64BIT +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) void protect_kernel_linear_mapping_text_rodata(void) { unsigned long text_start = (unsigned long)lm_alias(_start); From 99b3e3d41a034d9b3993800287d023ea063da293 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 3 Mar 2021 12:02:49 -0800 Subject: [PATCH 0674/1379] RISC-V: Add Microchip PolarFire SoC kconfig option Add Microchip PolarFire kconfig option which selects SoC specific and common drivers that is required for this SoC. Signed-off-by: Atish Patra Reviewed-by: Bin Meng Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.socs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs index b9eda857fc87..00c2b205654c 100644 --- a/arch/riscv/Kconfig.socs +++ b/arch/riscv/Kconfig.socs @@ -1,5 +1,12 @@ menu "SoC selection" +config SOC_MICROCHIP_POLARFIRE + bool "Microchip PolarFire SoCs" + select MCHP_CLK_MPFS + select SIFIVE_PLIC + help + This enables support for Microchip PolarFire SoC platforms. + config SOC_SIFIVE bool "SiFive SoCs" select SERIAL_SIFIVE if TTY From d53b0244c84c4e2721bede258e6a229ef56a138e Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 3 Mar 2021 12:02:50 -0800 Subject: [PATCH 0675/1379] dt-bindings: riscv: microchip: Add YAML documentation for the PolarFire SoC Add YAML DT binding documentation for the Microchip PolarFire SoC. It is documented at: https://www.microsemi.com/products/fpga-soc/polarfire-soc-icicle-quick-start-guide Signed-off-by: Atish Patra Reviewed-by: Rob Herring Signed-off-by: Palmer Dabbelt --- .../devicetree/bindings/riscv/microchip.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 Documentation/devicetree/bindings/riscv/microchip.yaml diff --git a/Documentation/devicetree/bindings/riscv/microchip.yaml b/Documentation/devicetree/bindings/riscv/microchip.yaml new file mode 100644 index 000000000000..3f981e897126 --- /dev/null +++ b/Documentation/devicetree/bindings/riscv/microchip.yaml @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/riscv/microchip.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Microchip PolarFire SoC-based boards device tree bindings + +maintainers: + - Cyril Jean + - Lewis Hanly + +description: + Microchip PolarFire SoC-based boards + +properties: + $nodename: + const: '/' + compatible: + items: + - enum: + - microchip,mpfs-icicle-kit + - const: microchip,mpfs + +additionalProperties: true + +... From 0fa6107eca4186adc6adda3b54c8b942477066c1 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 3 Mar 2021 12:02:51 -0800 Subject: [PATCH 0676/1379] RISC-V: Initial DTS for Microchip ICICLE board Add initial DTS for Microchip ICICLE board having only essential devices (clocks, sdhci, ethernet, serial, etc). The device tree is based on the U-Boot patch. https://patchwork.ozlabs.org/project/uboot/patch/20201110103414.10142-6-padmarao.begari@microchip.com/ Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/Makefile | 1 + arch/riscv/boot/dts/microchip/Makefile | 2 + .../microchip/microchip-mpfs-icicle-kit.dts | 72 ++++ .../boot/dts/microchip/microchip-mpfs.dtsi | 329 ++++++++++++++++++ 4 files changed, 404 insertions(+) create mode 100644 arch/riscv/boot/dts/microchip/Makefile create mode 100644 arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts create mode 100644 arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi diff --git a/arch/riscv/boot/dts/Makefile b/arch/riscv/boot/dts/Makefile index 7ffd502e3e7b..fe996b88319e 100644 --- a/arch/riscv/boot/dts/Makefile +++ b/arch/riscv/boot/dts/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 subdir-y += sifive subdir-$(CONFIG_SOC_CANAAN_K210_DTB_BUILTIN) += canaan +subdir-y += microchip obj-$(CONFIG_BUILTIN_DTB) := $(addsuffix /, $(subdir-y)) diff --git a/arch/riscv/boot/dts/microchip/Makefile b/arch/riscv/boot/dts/microchip/Makefile new file mode 100644 index 000000000000..622b12771fd3 --- /dev/null +++ b/arch/riscv/boot/dts/microchip/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +dtb-$(CONFIG_SOC_MICROCHIP_POLARFIRE) += microchip-mpfs-icicle-kit.dtb diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts new file mode 100644 index 000000000000..ec79944065c9 --- /dev/null +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2020 Microchip Technology Inc */ + +/dts-v1/; + +#include "microchip-mpfs.dtsi" + +/* Clock frequency (in Hz) of the rtcclk */ +#define RTCCLK_FREQ 1000000 + +/ { + #address-cells = <2>; + #size-cells = <2>; + model = "Microchip PolarFire-SoC Icicle Kit"; + compatible = "microchip,mpfs-icicle-kit"; + + chosen { + stdout-path = &serial0; + }; + + cpus { + timebase-frequency = ; + }; + + memory@80000000 { + device_type = "memory"; + reg = <0x0 0x80000000 0x0 0x40000000>; + clocks = <&clkcfg 26>; + }; + + soc { + }; +}; + +&serial0 { + status = "okay"; +}; + +&serial1 { + status = "okay"; +}; + +&serial2 { + status = "okay"; +}; + +&serial3 { + status = "okay"; +}; + +&sdcard { + status = "okay"; +}; + +&emac0 { + phy-mode = "sgmii"; + phy-handle = <&phy0>; + phy0: ethernet-phy@8 { + reg = <8>; + ti,fifo-depth = <0x01>; + }; +}; + +&emac1 { + status = "okay"; + phy-mode = "sgmii"; + phy-handle = <&phy1>; + phy1: ethernet-phy@9 { + reg = <9>; + ti,fifo-depth = <0x01>; + }; +}; diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi new file mode 100644 index 000000000000..b9819570a7d1 --- /dev/null +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Copyright (c) 2020 Microchip Technology Inc */ + +/dts-v1/; + +/ { + #address-cells = <2>; + #size-cells = <2>; + model = "Microchip MPFS Icicle Kit"; + compatible = "microchip,mpfs-icicle-kit"; + + chosen { + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + clock-frequency = <0>; + compatible = "sifive,e51", "sifive,rocket0", "riscv"; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <128>; + i-cache-size = <16384>; + reg = <0>; + riscv,isa = "rv64imac"; + status = "disabled"; + + cpu0_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + + cpu@1 { + clock-frequency = <0>; + compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <32>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <64>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <32>; + mmu-type = "riscv,sv39"; + reg = <1>; + riscv,isa = "rv64imafdc"; + tlb-split; + status = "okay"; + + cpu1_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + + cpu@2 { + clock-frequency = <0>; + compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <32>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <64>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <32>; + mmu-type = "riscv,sv39"; + reg = <2>; + riscv,isa = "rv64imafdc"; + tlb-split; + status = "okay"; + + cpu2_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + + cpu@3 { + clock-frequency = <0>; + compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <32>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <64>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <32>; + mmu-type = "riscv,sv39"; + reg = <3>; + riscv,isa = "rv64imafdc"; + tlb-split; + status = "okay"; + + cpu3_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + + cpu@4 { + clock-frequency = <0>; + compatible = "sifive,u54-mc", "sifive,rocket0", "riscv"; + d-cache-block-size = <64>; + d-cache-sets = <64>; + d-cache-size = <32768>; + d-tlb-sets = <1>; + d-tlb-size = <32>; + device_type = "cpu"; + i-cache-block-size = <64>; + i-cache-sets = <64>; + i-cache-size = <32768>; + i-tlb-sets = <1>; + i-tlb-size = <32>; + mmu-type = "riscv,sv39"; + reg = <4>; + riscv,isa = "rv64imafdc"; + tlb-split; + status = "okay"; + cpu4_intc: interrupt-controller { + #interrupt-cells = <1>; + compatible = "riscv,cpu-intc"; + interrupt-controller; + }; + }; + }; + + soc { + #address-cells = <2>; + #size-cells = <2>; + compatible = "simple-bus"; + ranges; + + cache-controller@2010000 { + compatible = "sifive,fu540-c000-ccache", "cache"; + cache-block-size = <64>; + cache-level = <2>; + cache-sets = <1024>; + cache-size = <2097152>; + cache-unified; + interrupt-parent = <&plic>; + interrupts = <1 2 3>; + reg = <0x0 0x2010000 0x0 0x1000>; + }; + + clint@2000000 { + compatible = "sifive,clint0"; + reg = <0x0 0x2000000 0x0 0xC000>; + interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7 + &cpu1_intc 3 &cpu1_intc 7 + &cpu2_intc 3 &cpu2_intc 7 + &cpu3_intc 3 &cpu3_intc 7 + &cpu4_intc 3 &cpu4_intc 7>; + }; + + plic: interrupt-controller@c000000 { + #interrupt-cells = <1>; + compatible = "sifive,plic-1.0.0"; + reg = <0x0 0xc000000 0x0 0x4000000>; + riscv,ndev = <186>; + interrupt-controller; + interrupts-extended = <&cpu0_intc 11 + &cpu1_intc 11 &cpu1_intc 9 + &cpu2_intc 11 &cpu2_intc 9 + &cpu3_intc 11 &cpu3_intc 9 + &cpu4_intc 11 &cpu4_intc 9>; + }; + + dma@3000000 { + compatible = "sifive,fu540-c000-pdma"; + reg = <0x0 0x3000000 0x0 0x8000>; + interrupt-parent = <&plic>; + interrupts = <23 24 25 26 27 28 29 30>; + #dma-cells = <1>; + }; + + refclk: refclk { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <600000000>; + clock-output-names = "msspllclk"; + }; + + clkcfg: clkcfg@20002000 { + compatible = "microchip,mpfs-clkcfg"; + reg = <0x0 0x20002000 0x0 0x1000>; + reg-names = "mss_sysreg"; + clocks = <&refclk>; + #clock-cells = <1>; + clock-output-names = "cpu", "axi", "ahb", "envm", /* 0-3 */ + "mac0", "mac1", "mmc", "timer", /* 4-7 */ + "mmuart0", "mmuart1", "mmuart2", "mmuart3", /* 8-11 */ + "mmuart4", "spi0", "spi1", "i2c0", /* 12-15 */ + "i2c1", "can0", "can1", "usb", /* 16-19 */ + "rsvd", "rtc", "qspi", "gpio0", /* 20-23 */ + "gpio1", "gpio2", "ddrc", "fic0", /* 24-27 */ + "fic1", "fic2", "fic3", "athena", "cfm"; /* 28-32 */ + }; + + serial0: serial@20000000 { + compatible = "ns16550a"; + reg = <0x0 0x20000000 0x0 0x400>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&plic>; + interrupts = <90>; + current-speed = <115200>; + clocks = <&clkcfg 8>; + status = "disabled"; + }; + + serial1: serial@20100000 { + compatible = "ns16550a"; + reg = <0x0 0x20100000 0x0 0x400>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&plic>; + interrupts = <91>; + current-speed = <115200>; + clocks = <&clkcfg 9>; + status = "disabled"; + }; + + serial2: serial@20102000 { + compatible = "ns16550a"; + reg = <0x0 0x20102000 0x0 0x400>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&plic>; + interrupts = <92>; + current-speed = <115200>; + clocks = <&clkcfg 10>; + status = "disabled"; + }; + + serial3: serial@20104000 { + compatible = "ns16550a"; + reg = <0x0 0x20104000 0x0 0x400>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&plic>; + interrupts = <93>; + current-speed = <115200>; + clocks = <&clkcfg 11>; + status = "disabled"; + }; + + emmc: mmc@20008000 { + compatible = "cdns,sd4hc"; + reg = <0x0 0x20008000 0x0 0x1000>; + interrupt-parent = <&plic>; + interrupts = <88 89>; + pinctrl-names = "default"; + clocks = <&clkcfg 6>; + bus-width = <4>; + cap-mmc-highspeed; + mmc-ddr-3_3v; + max-frequency = <200000000>; + non-removable; + no-sd; + no-sdio; + voltage-ranges = <3300 3300>; + status = "disabled"; + }; + + sdcard: sdhc@20008000 { + compatible = "cdns,sd4hc"; + reg = <0x0 0x20008000 0x0 0x1000>; + interrupt-parent = <&plic>; + interrupts = <88>; + pinctrl-names = "default"; + clocks = <&clkcfg 6>; + bus-width = <4>; + disable-wp; + cap-sd-highspeed; + card-detect-delay = <200>; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; + sd-uhs-sdr104; + max-frequency = <200000000>; + status = "disabled"; + }; + + emac0: ethernet@20110000 { + compatible = "cdns,macb"; + reg = <0x0 0x20110000 0x0 0x2000>; + interrupt-parent = <&plic>; + interrupts = <64 65 66 67>; + local-mac-address = [00 00 00 00 00 00]; + clocks = <&clkcfg 4>, <&clkcfg 2>; + clock-names = "pclk", "hclk"; + status = "disabled"; + #address-cells = <1>; + #size-cells = <0>; + }; + + emac1: ethernet@20112000 { + compatible = "cdns,macb"; + reg = <0x0 0x20112000 0x0 0x2000>; + interrupt-parent = <&plic>; + interrupts = <70 71 72 73>; + mac-address = [00 00 00 00 00 00]; + clocks = <&clkcfg 5>, <&clkcfg 2>; + status = "disabled"; + clock-names = "pclk", "hclk"; + #address-cells = <1>; + #size-cells = <0>; + }; + + }; +}; From 2951162094e61f574b0ddf886c783ace65049450 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Wed, 3 Mar 2021 12:02:52 -0800 Subject: [PATCH 0677/1379] RISC-V: Enable Microchip PolarFire ICICLE SoC Enable Microchip PolarFire ICICLE soc config in defconfig. It allows the default upstream kernel to boot on PolarFire ICICLE board. Signed-off-by: Atish Patra Reviewed-by: Anup Patel Reviewed-by: Bin Meng Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 6c0625aa96c7..1f2be234b11c 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -16,6 +16,7 @@ CONFIG_EXPERT=y CONFIG_BPF_SYSCALL=y CONFIG_SOC_SIFIVE=y CONFIG_SOC_VIRT=y +CONFIG_SOC_MICROCHIP_POLARFIRE=y CONFIG_SMP=y CONFIG_HOTPLUG_CPU=y CONFIG_JUMP_LABEL=y @@ -82,6 +83,9 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_HCD_PLATFORM=y CONFIG_USB_STORAGE=y CONFIG_USB_UAS=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y +CONFIG_MMC_SDHCI_CADENCE=y CONFIG_MMC=y CONFIG_MMC_SPI=y CONFIG_RTC_CLASS=y From 8af85f712fce319dd9fe3d41046b5163e7eb0f93 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 21 Apr 2021 16:39:41 +0800 Subject: [PATCH 0678/1379] f2fs: compress: remove unneed check condition In only call path of __cluster_may_compress(), __f2fs_write_data_pages() has checked SBI_POR_DOING condition, and also cluster_may_compress() has checked CP_ERROR_FLAG condition, so remove redundant check condition in __cluster_may_compress() for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6e46a00c1930..53b13787eb2c 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -890,7 +890,6 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) static bool __cluster_may_compress(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); int i; @@ -898,12 +897,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc) for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; - f2fs_bug_on(sbi, !page); - - if (unlikely(f2fs_cp_error(sbi))) - return false; - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - return false; + f2fs_bug_on(F2FS_I_SB(cc->inode), !page); /* beyond EOF */ if (page->index >= nr_pages) From 9557727876674893d35940fddbd03d3b505e7ed8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 22 Apr 2021 18:19:25 +0800 Subject: [PATCH 0679/1379] f2fs: drop inplace IO if fs status is abnormal If filesystem has cp_error or need_fsck status, let's drop inplace IO to avoid further corruption of fs data. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index efd1e57384b9..efac388d2468 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3552,7 +3552,13 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto drop_bio; + } + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) { + err = -EIO; + goto drop_bio; } stat_inc_inplace_blocks(fio->sbi); @@ -3566,6 +3572,15 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } + return err; +drop_bio: + if (fio->bio) { + struct bio *bio = *(fio->bio); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + fio->bio = NULL; + } return err; } From 1ca86ac1ec8d201478e9616565d4df5d51595cfc Mon Sep 17 00:00:00 2001 From: Yanwei Gao Date: Wed, 10 Mar 2021 06:52:12 +0000 Subject: [PATCH 0680/1379] LSM: SafeSetID: Fix code specification by scripts/checkpatch.pl First, the code is found to be irregular through checkpatch.pl. Then I found break is really useless here. Signed-off-by: Yanwei Gao Signed-off-by: Micah Morton --- security/safesetid/lsm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c index 8a176b6adbe5..1079c6d54784 100644 --- a/security/safesetid/lsm.c +++ b/security/safesetid/lsm.c @@ -125,7 +125,6 @@ static int safesetid_security_capable(const struct cred *cred, pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n", __kuid_val(cred->uid)); return -EPERM; - break; case CAP_SETGID: /* * If no policy applies to this task, allow the use of CAP_SETGID for @@ -140,11 +139,9 @@ static int safesetid_security_capable(const struct cred *cred, pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n", __kuid_val(cred->uid)); return -EPERM; - break; default: /* Error, the only capabilities were checking for is CAP_SETUID/GID */ return 0; - break; } return 0; } From e1364711359f3ced054bda9920477c8bf93b74c5 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 21 Apr 2021 16:44:33 -0700 Subject: [PATCH 0681/1379] scsi: lpfc: Fix illegal memory access on Abort IOCBs In devloss timer handler and in backend calls to terminate remote port I/O, there is logic to walk through all active IOCBs and validate them to potentially trigger an abort request. This logic is causing illegal memory accesses which leads to a crash. Abort IOCBs, which may be on the list, do not have an associated lpfc_io_buf struct. The driver is trying to map an lpfc_io_buf struct on the IOCB and which results in a bogus address thus the issue. Fix by skipping over ABORT IOCBs (CLOSE IOCBs are ABORTS that don't send ABTS) in the IOCB scan logic. Link: https://lore.kernel.org/r/20210421234433.102079-1-jsmart2021@gmail.com Co-developed-by: Justin Tee Signed-off-by: Justin Tee Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_sli.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 06ccc0157bd8..579ac75dfe79 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -11804,13 +11804,20 @@ lpfc_sli_validate_fcp_iocb(struct lpfc_iocbq *iocbq, struct lpfc_vport *vport, lpfc_ctx_cmd ctx_cmd) { struct lpfc_io_buf *lpfc_cmd; + IOCB_t *icmd = NULL; int rc = 1; if (!iocbq || iocbq->vport != vport) return rc; - if (!(iocbq->iocb_flag & LPFC_IO_FCP) || - !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ)) + if (!(iocbq->iocb_flag & LPFC_IO_FCP) || + !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ) || + iocbq->iocb_flag & LPFC_DRIVER_ABORTED) + return rc; + + icmd = &iocbq->iocb; + if (icmd->ulpCommand == CMD_ABORT_XRI_CN || + icmd->ulpCommand == CMD_CLOSE_XRI_CN) return rc; lpfc_cmd = container_of(iocbq, struct lpfc_io_buf, cur_iocbq); From 83adbba746d1c8b6e3b07d73ae7815044804c96e Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 21 Apr 2021 16:44:48 -0700 Subject: [PATCH 0682/1379] scsi: lpfc: Fix DMA virtual address ptr assignment in bsg lpfc_bsg_ct_unsol_event() routine acts assigns a ct_request to the wrong structure address, resulting in a bad address that results in bsg related timeouts. Correct the ct_request assignment to use the kernel virtual buffer address (not the control structure address). Link: https://lore.kernel.org/r/20210421234448.102132-1-jsmart2021@gmail.com Co-developed-by: Justin Tee Signed-off-by: Justin Tee Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_bsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c index c2776b88d493..38cfe1bc6a4d 100644 --- a/drivers/scsi/lpfc/lpfc_bsg.c +++ b/drivers/scsi/lpfc/lpfc_bsg.c @@ -934,7 +934,7 @@ lpfc_bsg_ct_unsol_event(struct lpfc_hba *phba, struct lpfc_sli_ring *pring, INIT_LIST_HEAD(&head); list_add_tail(&head, &piocbq->list); - ct_req = (struct lpfc_sli_ct_request *)bdeBuf1; + ct_req = (struct lpfc_sli_ct_request *)bdeBuf1->virt; evt_req_id = ct_req->FsType; cmd = ct_req->CommandResponse.bits.CmdRsp; From e4ec10228fdf09b88ba018009f14a696fb50d3f2 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 21 Apr 2021 16:45:11 -0700 Subject: [PATCH 0683/1379] scsi: lpfc: Fix bad memory access during VPD DUMP mailbox command The dump command for reading a region passes a requested read length specified in words (4-byte units). The response overwrites the same field with the actual number of bytes read. The mailbox handler for DUMP which reads VPD data (region 23) is treating the response field as if it were still a word_cnt, thus multiplying it by 4 to set the read's "length". Given the read value was calculated based on the size of the read buffer, the longer response length runs off the end of the buffer. Fix by reworking the code to use the response field as a byte count. Link: https://lore.kernel.org/r/20210421234511.102206-1-jsmart2021@gmail.com Co-developed-by: Justin Tee Signed-off-by: Justin Tee Signed-off-by: James Smart Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 12 ++++++------ drivers/scsi/lpfc/lpfc_sli.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 1e4c792bb660..5f018d02bf56 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -254,13 +254,13 @@ lpfc_config_port_prep(struct lpfc_hba *phba) if (mb->un.varDmp.word_cnt == 0) break; - i = mb->un.varDmp.word_cnt * sizeof(uint32_t); - if (offset + i > DMP_VPD_SIZE) - i = DMP_VPD_SIZE - offset; + if (mb->un.varDmp.word_cnt > DMP_VPD_SIZE - offset) + mb->un.varDmp.word_cnt = DMP_VPD_SIZE - offset; lpfc_sli_pcimem_bcopy(((uint8_t *)mb) + DMP_RSP_OFFSET, - lpfc_vpd_data + offset, i); - offset += i; - } while (offset < DMP_VPD_SIZE); + lpfc_vpd_data + offset, + mb->un.varDmp.word_cnt); + offset += mb->un.varDmp.word_cnt; + } while (mb->un.varDmp.word_cnt && offset < DMP_VPD_SIZE); lpfc_parse_vpd(phba, lpfc_vpd_data, offset); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 579ac75dfe79..573c8599d71c 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -19777,7 +19777,7 @@ lpfc_sli_get_config_region23(struct lpfc_hba *phba, char *rgn23_data) LPFC_MBOXQ_t *pmb = NULL; MAILBOX_t *mb; uint32_t offset = 0; - int i, rc; + int rc; if (!rgn23_data) return 0; @@ -19808,13 +19808,14 @@ lpfc_sli_get_config_region23(struct lpfc_hba *phba, char *rgn23_data) if (mb->un.varDmp.word_cnt == 0) break; - i = mb->un.varDmp.word_cnt * sizeof(uint32_t); - if (offset + i > DMP_RGN23_SIZE) - i = DMP_RGN23_SIZE - offset; + if (mb->un.varDmp.word_cnt > DMP_RGN23_SIZE - offset) + mb->un.varDmp.word_cnt = DMP_RGN23_SIZE - offset; + lpfc_sli_pcimem_bcopy(((uint8_t *)mb) + DMP_RSP_OFFSET, - rgn23_data + offset, i); - offset += i; - } while (offset < DMP_RGN23_SIZE); + rgn23_data + offset, + mb->un.varDmp.word_cnt); + offset += mb->un.varDmp.word_cnt; + } while (mb->un.varDmp.word_cnt && offset < DMP_RGN23_SIZE); mempool_free(pmb, phba->mbox_mem_pool); return offset; From df86ddbb9189d4fe6fe2c143d244e1121b57eb50 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Mar 2021 23:58:22 +0900 Subject: [PATCH 0684/1379] parisc: syscalls: switch to generic syscalltbl.sh Many architectures duplicate similar shell scripts. This commit converts parisc to use scripts/syscalltbl.sh. This also unifies syscall_table_64.h and syscall_table_c32.h. Signed-off-by: Masahiro Yamada Acked-by: Helge Deller Signed-off-by: Helge Deller --- arch/parisc/include/asm/Kbuild | 1 - arch/parisc/kernel/syscall.S | 16 +++++----- arch/parisc/kernel/syscalls/Makefile | 19 ++++-------- arch/parisc/kernel/syscalls/syscalltbl.sh | 36 ----------------------- 4 files changed, 12 insertions(+), 60 deletions(-) delete mode 100644 arch/parisc/kernel/syscalls/syscalltbl.sh diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 4406475a2304..e6e7f74c8ac9 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generated-y += syscall_table_32.h generated-y += syscall_table_64.h -generated-y += syscall_table_c32.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += user.h diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index 322503780db6..3f24a0af1e04 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -919,24 +919,24 @@ ENTRY(lws_table) END(lws_table) /* End of lws table */ +#ifdef CONFIG_64BIT +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) +#else +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) +#endif #define __SYSCALL(nr, entry) ASM_ULONG_INSN entry .align 8 ENTRY(sys_call_table) .export sys_call_table,data -#ifdef CONFIG_64BIT -#include /* Compat syscalls */ -#else -#include /* 32-bit native syscalls */ -#endif +#include /* 32-bit syscalls */ END(sys_call_table) #ifdef CONFIG_64BIT .align 8 ENTRY(sys_call_table64) -#include /* 64-bit native syscalls */ +#include /* 64-bit syscalls */ END(sys_call_table64) #endif -#undef __SYSCALL /* All light-weight-syscall atomic operations @@ -961,5 +961,3 @@ END(lws_lock_start) .previous .end - - diff --git a/arch/parisc/kernel/syscalls/Makefile b/arch/parisc/kernel/syscalls/Makefile index 283f64407b07..11424f1c8d9e 100644 --- a/arch/parisc/kernel/syscalls/Makefile +++ b/arch/parisc/kernel/syscalls/Makefile @@ -7,7 +7,7 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ @@ -16,10 +16,7 @@ quiet_cmd_syshdr = SYSHDR $@ '$(syshdr_offset_$(basetarget))' quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ - '$(systbl_abis_$(basetarget))' \ - '$(systbl_abi_$(basetarget))' \ - '$(systbl_offset_$(basetarget))' + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ syshdr_abis_unistd_32 := common,32 $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE @@ -29,23 +26,17 @@ syshdr_abis_unistd_64 := common,64 $(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -systbl_abis_syscall_table_32 := common,32 +$(kapi)/syscall_table_32.h: abis := common,32 $(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_64 := common,64 +$(kapi)/syscall_table_64.h: abis := common,64 $(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_c32 := common,32 -systbl_abi_syscall_table_c32 := c32 -$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) FORCE - $(call if_changed,systbl) - uapisyshdr-y += unistd_32.h unistd_64.h kapisyshdr-y += syscall_table_32.h \ - syscall_table_64.h \ - syscall_table_c32.h + syscall_table_64.h uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) diff --git a/arch/parisc/kernel/syscalls/syscalltbl.sh b/arch/parisc/kernel/syscalls/syscalltbl.sh deleted file mode 100644 index f7393a7b18aa..000000000000 --- a/arch/parisc/kernel/syscalls/syscalltbl.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -my_abi="$4" -offset="$5" - -emit() { - t_nxt="$1" - t_nr="$2" - t_entry="$3" - - while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" - t_nxt=$((t_nxt+1)) - done - printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" -} - -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - nxt=0 - if [ -z "$offset" ]; then - offset=0 - fi - - while read nr abi name entry compat ; do - if [ "$my_abi" = "c32" ] && [ ! -z "$compat" ]; then - emit $((nxt+offset)) $((nr+offset)) $compat - else - emit $((nxt+offset)) $((nr+offset)) $entry - fi - nxt=$((nr+1)) - done -) > "$out" From adf27404e8a02cbcca9610bc51e41986c880b5aa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Mar 2021 23:58:23 +0900 Subject: [PATCH 0685/1379] parisc: syscalls: switch to generic syscallhdr.sh Many architectures duplicate similar shell scripts. This commit converts parisc to use scripts/syscallhdr.sh. Signed-off-by: Masahiro Yamada Signed-off-by: Helge Deller --- arch/parisc/kernel/syscalls/Makefile | 11 +++---- arch/parisc/kernel/syscalls/syscallhdr.sh | 36 ----------------------- 2 files changed, 4 insertions(+), 43 deletions(-) delete mode 100644 arch/parisc/kernel/syscalls/syscallhdr.sh diff --git a/arch/parisc/kernel/syscalls/Makefile b/arch/parisc/kernel/syscalls/Makefile index 11424f1c8d9e..0f2ea5bcb0d7 100644 --- a/arch/parisc/kernel/syscalls/Makefile +++ b/arch/parisc/kernel/syscalls/Makefile @@ -6,23 +6,20 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') syscall := $(src)/syscall.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh +syshdr := $(srctree)/scripts/syscallhdr.sh systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abis_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis $(abis) $< $@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ -syshdr_abis_unistd_32 := common,32 +$(uapi)/unistd_32.h: abis := common,32 $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abis_unistd_64 := common,64 +$(uapi)/unistd_64.h: abis := common,64 $(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) diff --git a/arch/parisc/kernel/syscalls/syscallhdr.sh b/arch/parisc/kernel/syscalls/syscallhdr.sh deleted file mode 100644 index 730db288fe54..000000000000 --- a/arch/parisc/kernel/syscalls/syscallhdr.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_UAPI_ASM_PARISC_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - printf "#ifndef %s\n" "${fileguard}" - printf "#define %s\n" "${fileguard}" - printf "\n" - - nxt=0 - while read nr abi name entry compat ; do - if [ -z "$offset" ]; then - printf "#define __NR_%s%s\t%s\n" \ - "${prefix}" "${name}" "${nr}" - else - printf "#define __NR_%s%s\t(%s + %s)\n" \ - "${prefix}" "${name}" "${offset}" "${nr}" - fi - nxt=$((nr+1)) - done - - printf "\n" - printf "#ifdef __KERNEL__\n" - printf "#define __NR_syscalls\t%s\n" "${nxt}" - printf "#endif\n" - printf "\n" - printf "#endif /* %s */\n" "${fileguard}" -) > "$out" From 80342d484afceec491bcc85ff1e32c5491c1182f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 27 Apr 2021 12:48:28 +0100 Subject: [PATCH 0686/1379] kernel-doc: Add support for __deprecated The current linux-next tree has a new error: ./Documentation/gpu/drm-mm:445: ./drivers/gpu/drm/drm_prime.c:994: WARNING: Error in declarator or parameters Invalid C declaration: Expecting "(" in parameters. [error at 17] int __deprecated drm_prime_sg_to_page_array (struct sg_table *sgt, struct page **pages, int max_entries) -----------------^ While we might consider that documenting a deprecated interface is not necessarily best practice, removing the error is easy. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20210427114828.GY235567@casper.infradead.org Signed-off-by: Jonathan Corbet --- scripts/kernel-doc | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 2a85d34fdcd0..4840e748fca8 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1777,6 +1777,7 @@ sub dump_function($$) { $prototype =~ s/^noinline +//; $prototype =~ s/__init +//; $prototype =~ s/__init_or_module +//; + $prototype =~ s/__deprecated +//; $prototype =~ s/__flatten +//; $prototype =~ s/__meminit +//; $prototype =~ s/__must_check +//; From 95b079d8215b83b37fa59341fda92fcb9392f14a Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Thu, 22 Apr 2021 16:14:53 +0800 Subject: [PATCH 0687/1379] swiotlb: Fix the type of index Fix the type of index from unsigned int to int since find_slots() might return -1. Fixes: 26a7e094783d ("swiotlb: refactor swiotlb_tbl_map_single") Reviewed-by: Christoph Hellwig Signed-off-by: Claire Chang Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 0a5b6f7e75bc..8635a57f88e9 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -499,7 +499,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, { struct io_tlb_mem *mem = io_tlb_default_mem; unsigned int offset = swiotlb_align_offset(dev, orig_addr); - unsigned int index, i; + unsigned int i; + int index; phys_addr_t tlb_addr; if (!mem) From 6a79162fe5d5e0eb55bc48e99450982b0daf0a0f Mon Sep 17 00:00:00 2001 From: "bilbao@vt.edu" Date: Tue, 27 Apr 2021 13:28:29 -0400 Subject: [PATCH 0688/1379] docs: Fix typo in Documentation/x86/x86_64/5level-paging.rst fix two typos in the documentation (Documentation/x86/x86_64/5level-paging.rst), changing 'paing' for 'paging' and using the right verbal form for plural on 'some vendors offer'. Signed-off-by: Carlos Bilbao Link: https://lore.kernel.org/r/2599991.mvXUDI8C0e@iron-maiden Signed-off-by: Jonathan Corbet --- Documentation/x86/x86_64/5level-paging.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/x86/x86_64/5level-paging.rst b/Documentation/x86/x86_64/5level-paging.rst index 44856417e6a5..b792bbdc0b01 100644 --- a/Documentation/x86/x86_64/5level-paging.rst +++ b/Documentation/x86/x86_64/5level-paging.rst @@ -6,9 +6,9 @@ Overview ======== -Original x86-64 was limited by 4-level paing to 256 TiB of virtual address +Original x86-64 was limited by 4-level paging to 256 TiB of virtual address space and 64 TiB of physical address space. We are already bumping into -this limit: some vendors offers servers with 64 TiB of memory today. +this limit: some vendors offer servers with 64 TiB of memory today. To overcome the limitation upcoming hardware will introduce support for 5-level paging. It is a straight-forward extension of the current page From e7df4524cd9a6a006f9e12f3d908e5af69dfa145 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 21 Jan 2021 12:32:05 -0500 Subject: [PATCH 0689/1379] ceph: rip out old fscache readpage handling With the new netfs read helper functions, we won't need a lot of this infrastructure as it handles the pagecache pages itself. Rip out the read handling for now, and much of the old infrastructure that deals in individual pages. The cookie handling is mostly unchanged, however. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 31 +----------- fs/ceph/cache.c | 125 ------------------------------------------------ fs/ceph/cache.h | 91 +---------------------------------- fs/ceph/caps.c | 9 ---- fs/ceph/super.h | 1 - 5 files changed, 3 insertions(+), 254 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 26e66436f005..e267aa60c8b6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -155,8 +155,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, return; } - ceph_invalidate_fscache_page(inode, page); - WARN_ON(!PageLocked(page)); if (!PagePrivate(page)) return; @@ -175,10 +173,6 @@ static int ceph_releasepage(struct page *page, gfp_t g) dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, page, page->index, PageDirty(page) ? "" : "not "); - /* Can we release the page from the cache? */ - if (!ceph_release_fscache_page(page, g)) - return 0; - return !PagePrivate(page); } @@ -213,10 +207,6 @@ static int ceph_do_readpage(struct file *filp, struct page *page) return 0; } - err = ceph_readpage_from_fscache(inode, page); - if (err == 0) - return -EINPROGRESS; - dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", vino.ino, vino.snap, filp, off, len, page, page->index); req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1, @@ -241,7 +231,6 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err == -ENOENT) err = 0; if (err < 0) { - ceph_fscache_readpage_cancel(inode, page); if (err == -EBLOCKLISTED) fsc->blocklisted = true; goto out; @@ -253,8 +242,6 @@ static int ceph_do_readpage(struct file *filp, struct page *page) flush_dcache_page(page); SetPageUptodate(page); - ceph_readpage_to_fscache(inode, page); - out: return err < 0 ? err : 0; } @@ -294,10 +281,8 @@ static void finish_read(struct ceph_osd_request *req) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; - if (rc < 0 && rc != -ENOENT) { - ceph_fscache_readpage_cancel(inode, page); + if (rc < 0 && rc != -ENOENT) goto unlock; - } if (bytes < (int)PAGE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; @@ -307,7 +292,6 @@ static void finish_read(struct ceph_osd_request *req) page->index); flush_dcache_page(page); SetPageUptodate(page); - ceph_readpage_to_fscache(inode, page); unlock: unlock_page(page); put_page(page); @@ -408,7 +392,6 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, page->index); if (add_to_page_cache_lru(page, &inode->i_data, page->index, GFP_KERNEL)) { - ceph_fscache_uncache_page(inode, page); put_page(page); dout("start_read %p add_to_page_cache failed %p\n", inode, page); @@ -440,10 +423,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, return nr_pages; out_pages: - for (i = 0; i < nr_pages; ++i) { - ceph_fscache_readpage_cancel(inode, pages[i]); + for (i = 0; i < nr_pages; ++i) unlock_page(pages[i]); - } ceph_put_page_vector(pages, nr_pages, false); out_put: ceph_osdc_put_request(req); @@ -471,12 +452,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) return -EINVAL; - rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, - &nr_pages); - - if (rc == 0) - goto out; - rw_ctx = ceph_find_rw_context(fi); max = fsc->mount_options->rsize >> PAGE_SHIFT; dout("readpages %p file %p ctx %p nr_pages %d max %d\n", @@ -487,8 +462,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, goto out; } out: - ceph_fscache_readpages_cancel(inode, page_list); - dout("readpages %p file %p ret %d\n", inode, file, rc); return rc; } diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 2f5cb6bc78e1..9cfadbb86568 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -173,7 +173,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) ci->fscache = NULL; - fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); fscache_relinquish_cookie(cookie, &ci->i_vino, false); } @@ -194,7 +193,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) dout("fscache_file_set_cookie %p %p disabling cache\n", inode, filp); fscache_disable_cookie(ci->fscache, &ci->i_vino, false); - fscache_uncache_all_inode_pages(ci->fscache, inode); } else { fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), ceph_fscache_can_enable, inode); @@ -205,108 +203,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) } } -static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) -{ - if (!error) - SetPageUptodate(page); - - unlock_page(page); -} - -static inline bool cache_valid(struct ceph_inode_info *ci) -{ - return ci->i_fscache_gen == ci->i_rdcache_gen; -} - - -/* Atempt to read from the fscache, - * - * This function is called from the readpage_nounlock context. DO NOT attempt to - * unlock the page here (or in the callback). - */ -int ceph_readpage_from_fscache(struct inode *inode, struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - - if (!cache_valid(ci)) - return -ENOBUFS; - - ret = fscache_read_or_alloc_page(ci->fscache, page, - ceph_readpage_from_fscache_complete, NULL, - GFP_KERNEL); - - switch (ret) { - case 0: /* Page found */ - dout("page read submitted\n"); - return 0; - case -ENOBUFS: /* Pages were not found, and can't be */ - case -ENODATA: /* Pages were not found */ - dout("page/inode not in cache\n"); - return ret; - default: - dout("%s: unknown error ret = %i\n", __func__, ret); - return ret; - } -} - -int ceph_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - - if (!cache_valid(ci)) - return -ENOBUFS; - - ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, - ceph_readpage_from_fscache_complete, - NULL, mapping_gfp_mask(mapping)); - - switch (ret) { - case 0: /* All pages found */ - dout("all-page read submitted\n"); - return 0; - case -ENOBUFS: /* Some pages were not found, and can't be */ - case -ENODATA: /* some pages were not found */ - dout("page/inode not in cache\n"); - return ret; - default: - dout("%s: unknown error ret = %i\n", __func__, ret); - return ret; - } -} - -void ceph_readpage_to_fscache(struct inode *inode, struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - - if (!PageFsCache(page)) - return; - - if (!cache_valid(ci)) - return; - - ret = fscache_write_page(ci->fscache, page, i_size_read(inode), - GFP_KERNEL); - if (ret) - fscache_uncache_page(ci->fscache, page); -} - -void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - - if (!PageFsCache(page)) - return; - - fscache_wait_on_page_write(ci->fscache, page); - fscache_uncache_page(ci->fscache, page); -} - void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { if (fscache_cookie_valid(fsc->fscache)) { @@ -329,24 +225,3 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) } fsc->fscache = NULL; } - -/* - * caller should hold CEPH_CAP_FILE_{RD,CACHE} - */ -void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) -{ - if (cache_valid(ci)) - return; - - /* resue i_truncate_mutex. There should be no pending - * truncate while the caller holds CEPH_CAP_FILE_RD */ - mutex_lock(&ci->i_truncate_mutex); - if (!cache_valid(ci)) { - if (fscache_check_consistency(ci->fscache, &ci->i_vino)) - fscache_invalidate(ci->fscache); - spin_lock(&ci->i_ceph_lock); - ci->i_fscache_gen = ci->i_rdcache_gen; - spin_unlock(&ci->i_ceph_lock); - } - mutex_unlock(&ci->i_truncate_mutex); -} diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 89dbdd1eb14a..10c21317b62f 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -29,13 +29,10 @@ int ceph_readpages_from_fscache(struct inode *inode, struct address_space *mapping, struct list_head *pages, unsigned *nr_pages); -void ceph_readpage_to_fscache(struct inode *inode, struct page *page); -void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { ci->fscache = NULL; - ci->i_fscache_gen = 0; } static inline void ceph_fscache_invalidate(struct inode *inode) @@ -43,40 +40,6 @@ static inline void ceph_fscache_invalidate(struct inode *inode) fscache_invalidate(ceph_inode(inode)->fscache); } -static inline void ceph_fscache_uncache_page(struct inode *inode, - struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - return fscache_uncache_page(ci->fscache, page); -} - -static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) -{ - struct inode* inode = page->mapping->host; - struct ceph_inode_info *ci = ceph_inode(inode); - return fscache_maybe_release_page(ci->fscache, page, gfp); -} - -static inline void ceph_fscache_readpage_cancel(struct inode *inode, - struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) - __fscache_uncache_page(ci->fscache, page); -} - -static inline void ceph_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - return fscache_readpages_cancel(ci->fscache, pages); -} - -static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) -{ - ci->i_fscache_gen = ci->i_rdcache_gen - 1; -} - #else static inline int ceph_fscache_register(void) @@ -115,62 +78,10 @@ static inline void ceph_fscache_file_set_cookie(struct inode *inode, { } -static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) -{ -} - -static inline void ceph_fscache_uncache_page(struct inode *inode, - struct page *pages) -{ -} - -static inline int ceph_readpage_from_fscache(struct inode* inode, - struct page *page) -{ - return -ENOBUFS; -} - -static inline int ceph_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - return -ENOBUFS; -} - -static inline void ceph_readpage_to_fscache(struct inode *inode, - struct page *page) -{ -} - static inline void ceph_fscache_invalidate(struct inode *inode) { } -static inline void ceph_invalidate_fscache_page(struct inode *inode, - struct page *page) -{ -} - -static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) -{ - return 1; -} - -static inline void ceph_fscache_readpage_cancel(struct inode *inode, - struct page *page) -{ -} - -static inline void ceph_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) -{ -} - -static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) -{ -} - #endif -#endif +#endif /* _CEPH_CACHE_H */ diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 570731c4d019..b256fc8c68d0 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2730,10 +2730,6 @@ again: *got = need | want; else *got = need; - if (S_ISREG(inode->i_mode) && - (need & CEPH_CAP_FILE_RD) && - !(*got & CEPH_CAP_FILE_CACHE)) - ceph_disable_fscache_readpage(ci); ceph_take_cap_refs(ci, *got, true); ret = 1; } @@ -2983,11 +2979,6 @@ int ceph_get_caps(struct file *filp, int need, int want, } break; } - - if (S_ISREG(ci->vfs_inode.i_mode) && - (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) - ceph_fscache_revalidate_cookie(ci); - *got = _got; return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c48bb30c8d70..6185c59954e3 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -427,7 +427,6 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - u32 i_fscache_gen; #endif errseq_t i_meta_err; From 7c46b31809337df12a538239e6caa41df7c7deec Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 21 Jan 2021 16:27:14 -0500 Subject: [PATCH 0690/1379] ceph: rework PageFsCache handling With the new fscache API, the PageFsCache bit now indicates that the page is being written to the cache and shouldn't be modified or released until it's finished. Change releasepage and invalidatepage to wait on that bit before returning. Also define FSCACHE_USE_NEW_IO_API so that we opt into the new fscache API. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 9 ++++++++- fs/ceph/super.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e267aa60c8b6..3936ae3e8dcd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -146,6 +146,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, struct ceph_inode_info *ci; struct ceph_snap_context *snapc = page_snap_context(page); + wait_on_page_fscache(page); + inode = page->mapping->host; ci = ceph_inode(inode); @@ -168,11 +170,16 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, ClearPagePrivate(page); } -static int ceph_releasepage(struct page *page, gfp_t g) +static int ceph_releasepage(struct page *page, gfp_t gfp) { dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, page, page->index, PageDirty(page) ? "" : "not "); + if (PageFsCache(page)) { + if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) + return 0; + wait_on_page_fscache(page); + } return !PagePrivate(page); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6185c59954e3..188565d806b2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -21,6 +21,7 @@ #include #ifdef CONFIG_CEPH_FSCACHE +#define FSCACHE_USE_NEW_IO_API #include #endif From 10a7052c7868bc7bc72d947f5aac6f768928db87 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 21 Jan 2021 18:05:37 -0500 Subject: [PATCH 0691/1379] ceph: fix fscache invalidation Ensure that we invalidate the fscache whenever we invalidate the pagecache. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 1 + fs/ceph/inode.c | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b256fc8c68d0..e12e4cdefac1 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1867,6 +1867,7 @@ static int try_nonblocking_invalidate(struct inode *inode) u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); + ceph_fscache_invalidate(inode); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 156f849f5385..4418d4be2907 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1863,6 +1863,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); + ceph_fscache_invalidate(inode); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_pages %p fails\n", inode); } From f0702876e152f0443911514aec8b2bf563a2432b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Jun 2020 10:10:21 -0400 Subject: [PATCH 0692/1379] ceph: convert ceph_readpage to netfs_readpage Have the ceph KConfig select NETFS_SUPPORT. Add a new netfs ops structure and the operations for it. Convert ceph_readpage to use the new netfs_readpage helper. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/Kconfig | 1 + fs/ceph/addr.c | 168 +++++++++++++++++++++++++++++++++++++++++++++--- fs/ceph/cache.h | 36 +++++++++++ 3 files changed, 195 insertions(+), 10 deletions(-) diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 471e40156065..94df854147d3 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -6,6 +6,7 @@ config CEPH_FS select LIBCRC32C select CRYPTO_AES select CRYPTO + select NETFS_SUPPORT default n help Choose Y or M here to include support for mounting the diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 3936ae3e8dcd..c848e8a4e539 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "super.h" #include "mds_client.h" @@ -183,6 +184,163 @@ static int ceph_releasepage(struct page *page, gfp_t gfp) return !PagePrivate(page); } +static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) +{ + struct inode *inode = rreq->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_file_layout *lo = &ci->i_layout; + u32 blockoff; + u64 blockno; + + /* Expand the start downward */ + blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff); + rreq->start = blockno * lo->stripe_unit; + rreq->len += blockoff; + + /* Now, round up the length to the next block */ + rreq->len = roundup(rreq->len, lo->stripe_unit); +} + +static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) +{ + struct inode *inode = subreq->rreq->mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objno, objoff; + u32 xlen; + + /* Truncate the extent at the end of the current block */ + ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, + &objno, &objoff, &xlen); + subreq->len = min(xlen, fsc->mount_options->rsize); + return true; +} + +static void finish_netfs_read(struct ceph_osd_request *req) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct netfs_read_subrequest *subreq = req->r_priv; + int num_pages; + int err = req->r_result; + + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + + dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result, + subreq->len, i_size_read(req->r_inode)); + + /* no object means success but no data */ + if (err == -ENOENT) + err = 0; + else if (err == -EBLOCKLISTED) + fsc->blocklisted = true; + + if (err >= 0 && err < subreq->len) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + + netfs_subreq_terminated(subreq, err, true); + + num_pages = calc_pages_for(osd_data->alignment, osd_data->length); + ceph_put_page_vector(osd_data->pages, num_pages, false); + iput(req->r_inode); +} + +static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) +{ + struct netfs_read_request *rreq = subreq->rreq; + struct inode *inode = rreq->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct ceph_vino vino = ceph_vino(inode); + struct iov_iter iter; + struct page **pages; + size_t page_off; + int err = 0; + u64 len = subreq->len; + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, + 0, 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, + NULL, ci->i_truncate_seq, ci->i_truncate_size, false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + + dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off); + if (err < 0) { + dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); + goto out; + } + + /* should always give us a page-aligned read */ + WARN_ON_ONCE(page_off); + len = err; + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); + req->r_callback = finish_netfs_read; + req->r_priv = subreq; + req->r_inode = inode; + ihold(inode); + + err = ceph_osdc_start_request(req->r_osdc, req, false); + if (err) + iput(inode); +out: + ceph_osdc_put_request(req); + if (err) + netfs_subreq_terminated(subreq, err, false); + dout("%s: result %d\n", __func__, err); +} + +static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file) +{ +} + +const struct netfs_read_request_ops ceph_netfs_read_ops = { + .init_rreq = ceph_init_rreq, + .is_cache_enabled = ceph_is_cache_enabled, + .begin_cache_operation = ceph_begin_cache_operation, + .issue_op = ceph_netfs_issue_op, + .expand_readahead = ceph_netfs_expand_readahead, + .clamp_length = ceph_netfs_clamp_length, +}; + +/* read a single page, without unlocking it. */ +static int ceph_readpage(struct file *file, struct page *page) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vino vino = ceph_vino(inode); + u64 off = page_offset(page); + u64 len = PAGE_SIZE; + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + /* + * Uptodate inline data should have been added + * into page cache while getting Fcr caps. + */ + if (off == 0) { + unlock_page(page); + return -EINVAL; + } + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", + vino.ino, vino.snap, file, off, len, page, page->index); + + return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL); +} + /* read a single page, without unlocking it. */ static int ceph_do_readpage(struct file *filp, struct page *page) { @@ -253,16 +411,6 @@ out: return err < 0 ? err : 0; } -static int ceph_readpage(struct file *filp, struct page *page) -{ - int r = ceph_do_readpage(filp, page); - if (r != -EINPROGRESS) - unlock_page(page); - else - r = 0; - return r; -} - /* * Finish an async read(ahead) op. */ diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 10c21317b62f..1409d6149281 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -9,6 +9,8 @@ #ifndef _CEPH_CACHE_H #define _CEPH_CACHE_H +#include + #ifdef CONFIG_CEPH_FSCACHE extern struct fscache_netfs ceph_cache_netfs; @@ -35,11 +37,31 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) ci->fscache = NULL; } +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +{ + return ci->fscache; +} + static inline void ceph_fscache_invalidate(struct inode *inode) { fscache_invalidate(ceph_inode(inode)->fscache); } +static inline bool ceph_is_cache_enabled(struct inode *inode) +{ + struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode)); + + if (!cookie) + return false; + return fscache_cookie_enabled(cookie); +} + +static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) +{ + struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); + + return fscache_begin_read_operation(rreq, cookie); +} #else static inline int ceph_fscache_register(void) @@ -65,6 +87,11 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +{ + return NULL; +} + static inline void ceph_fscache_register_inode_cookie(struct inode *inode) { } @@ -82,6 +109,15 @@ static inline void ceph_fscache_invalidate(struct inode *inode) { } +static inline bool ceph_is_cache_enabled(struct inode *inode) +{ + return false; +} + +static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) +{ + return -ENOBUFS; +} #endif #endif /* _CEPH_CACHE_H */ From d801327d9500c74628b65121eedbdb31441c58c9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Jun 2020 10:43:21 -0400 Subject: [PATCH 0693/1379] ceph: convert ceph_write_begin to netfs_write_begin Convert ceph_write_begin to use the netfs_write_begin helper. Most of the ops we need for it are already in place from the readpage conversion but we do add a new check_write_begin op since ceph needs to be able to vet whether there is an incompatible writeback already in flight before reading in the page. With this, we can also remove the old ceph_do_readpage helper. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 188 +++++++++++++++---------------------------------- 1 file changed, 58 insertions(+), 130 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c848e8a4e539..8a214f11e349 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -62,6 +62,9 @@ (CONGESTION_ON_THRESH(congestion_kb) - \ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) +static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, + struct page *page, void **_fsdata); + static inline struct ceph_snap_context *page_snap_context(struct page *page) { if (PagePrivate(page)) @@ -309,6 +312,7 @@ const struct netfs_read_request_ops ceph_netfs_read_ops = { .issue_op = ceph_netfs_issue_op, .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, + .check_write_begin = ceph_netfs_check_write_begin, }; /* read a single page, without unlocking it. */ @@ -341,76 +345,6 @@ static int ceph_readpage(struct file *file, struct page *page) return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL); } -/* read a single page, without unlocking it. */ -static int ceph_do_readpage(struct file *filp, struct page *page) -{ - struct inode *inode = file_inode(filp); - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_client *osdc = &fsc->client->osdc; - struct ceph_osd_request *req; - struct ceph_vino vino = ceph_vino(inode); - int err = 0; - u64 off = page_offset(page); - u64 len = PAGE_SIZE; - - if (off >= i_size_read(inode)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - return 0; - } - - if (ci->i_inline_version != CEPH_INLINE_NONE) { - /* - * Uptodate inline data should have been added - * into page cache while getting Fcr caps. - */ - if (off == 0) - return -EINVAL; - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - return 0; - } - - dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", - vino.ino, vino.snap, filp, off, len, page, page->index); - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); - - osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); - - err = ceph_osdc_start_request(osdc, req, false); - if (!err) - err = ceph_osdc_wait_request(osdc, req); - - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, err); - - ceph_osdc_put_request(req); - dout("readpage result %d\n", err); - - if (err == -ENOENT) - err = 0; - if (err < 0) { - if (err == -EBLOCKLISTED) - fsc->blocklisted = true; - goto out; - } - if (err < PAGE_SIZE) - /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_SIZE); - else - flush_dcache_page(page); - - SetPageUptodate(page); -out: - return err < 0 ? err : 0; -} - /* * Finish an async read(ahead) op. */ @@ -1430,6 +1364,31 @@ ceph_find_incompatible(struct page *page) return NULL; } +static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, + struct page *page, void **_fsdata) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + + snapc = ceph_find_incompatible(page); + if (snapc) { + int r; + + unlock_page(page); + put_page(page); + if (IS_ERR(snapc)) + return PTR_ERR(snapc); + + ceph_queue_writeback(inode); + r = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + return r == 0 ? -EAGAIN : r; + } + return 0; +} + /* * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. @@ -1440,75 +1399,47 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_snap_context *snapc; struct page *page = NULL; pgoff_t index = pos >> PAGE_SHIFT; - int pos_in_page = pos & ~PAGE_MASK; - int r = 0; + int r; - dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); - - for (;;) { + /* + * Uninlining should have already been done and everything updated, EXCEPT + * for inline_version sent to the MDS. + */ + if (ci->i_inline_version != CEPH_INLINE_NONE) { page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - r = -ENOMEM; - break; - } + if (!page) + return -ENOMEM; - snapc = ceph_find_incompatible(page); - if (snapc) { - if (IS_ERR(snapc)) { - r = PTR_ERR(snapc); - break; + /* + * The inline_version on a new inode is set to 1. If that's the + * case, then the page is brand new and isn't yet Uptodate. + */ + r = 0; + if (index == 0 && ci->i_inline_version != 1) { + if (!PageUptodate(page)) { + WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", + ci->i_inline_version); + r = -EINVAL; } - unlock_page(page); - put_page(page); - page = NULL; - ceph_queue_writeback(inode); - r = wait_event_killable(ci->i_cap_wq, - context_is_writeable_or_written(inode, snapc)); - ceph_put_snap_context(snapc); - if (r != 0) - break; - continue; + goto out; } - - if (PageUptodate(page)) { - dout(" page %p already uptodate\n", page); - break; - } - - /* - * In some cases we don't need to read at all: - * - full page write - * - write that lies completely beyond EOF - * - write that covers the the page from start to EOF or beyond it - */ - if ((pos_in_page == 0 && len == PAGE_SIZE) || - (pos >= i_size_read(inode)) || - (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) { - zero_user_segments(page, 0, pos_in_page, - pos_in_page + len, PAGE_SIZE); - break; - } - - /* - * We need to read it. If we get back -EINPROGRESS, then the page was - * handed off to fscache and it will be unlocked when the read completes. - * Refind the page in that case so we can reacquire the page lock. Otherwise - * we got a hard error or the read was completed synchronously. - */ - r = ceph_do_readpage(file, page); - if (r != -EINPROGRESS) - break; + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + goto out; } + r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL, + &ceph_netfs_read_ops, NULL); +out: + if (r == 0) + wait_on_page_fscache(page); if (r < 0) { - if (page) { - unlock_page(page); + if (page) put_page(page); - } } else { + WARN_ON_ONCE(!PageLocked(page)); *pagep = page; } return r; @@ -1681,9 +1612,6 @@ out_restore: return ret; } -/* - * Reuse write_begin here for simplicity. - */ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; From 49870056005ca9387e5ee31451991491f99cc45f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 9 Jul 2020 14:43:23 -0400 Subject: [PATCH 0694/1379] ceph: convert ceph_readpages to ceph_readahead Convert ceph_readpages to ceph_readahead and make it use netfs_readahead. With this we can rip out a lot of the old readpage/readpages infrastructure. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 234 +++++++------------------------------------------ 1 file changed, 33 insertions(+), 201 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8a214f11e349..cb88ba91b671 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -305,6 +305,16 @@ static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file) { } +static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + int got = (uintptr_t)priv; + + if (got) + ceph_put_cap_refs(ci, got); +} + const struct netfs_read_request_ops ceph_netfs_read_ops = { .init_rreq = ceph_init_rreq, .is_cache_enabled = ceph_is_cache_enabled, @@ -313,6 +323,7 @@ const struct netfs_read_request_ops ceph_netfs_read_ops = { .expand_readahead = ceph_netfs_expand_readahead, .clamp_length = ceph_netfs_clamp_length, .check_write_begin = ceph_netfs_check_write_begin, + .cleanup = ceph_readahead_cleanup, }; /* read a single page, without unlocking it. */ @@ -345,214 +356,35 @@ static int ceph_readpage(struct file *file, struct page *page) return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL); } -/* - * Finish an async read(ahead) op. - */ -static void finish_read(struct ceph_osd_request *req) +static void ceph_readahead(struct readahead_control *ractl) { - struct inode *inode = req->r_inode; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_data *osd_data; - int rc = req->r_result <= 0 ? req->r_result : 0; - int bytes = req->r_result >= 0 ? req->r_result : 0; - int num_pages; - int i; - - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); - if (rc == -EBLOCKLISTED) - ceph_inode_to_client(inode)->blocklisted = true; - - /* unlock all pages, zeroing any data we didn't read */ - osd_data = osd_req_op_extent_osd_data(req, 0); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); - num_pages = calc_pages_for((u64)osd_data->alignment, - (u64)osd_data->length); - for (i = 0; i < num_pages; i++) { - struct page *page = osd_data->pages[i]; - - if (rc < 0 && rc != -ENOENT) - goto unlock; - if (bytes < (int)PAGE_SIZE) { - /* zero (remainder of) page */ - int s = bytes < 0 ? 0 : bytes; - zero_user_segment(page, s, PAGE_SIZE); - } - dout("finish_read %p uptodate %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); -unlock: - unlock_page(page); - put_page(page); - bytes -= PAGE_SIZE; - } - - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); - - kfree(osd_data->pages); -} - -/* - * start an async read(ahead) operation. return nr_pages we submitted - * a read for on success, or negative error code. - */ -static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, - struct list_head *page_list, int max) -{ - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; - struct ceph_inode_info *ci = ceph_inode(inode); - struct page *page = lru_to_page(page_list); - struct ceph_vino vino; - struct ceph_osd_request *req; - u64 off; - u64 len; - int i; - struct page **pages; - pgoff_t next_index; - int nr_pages = 0; + struct inode *inode = file_inode(ractl->file); + struct ceph_file_info *fi = ractl->file->private_data; + struct ceph_rw_context *rw_ctx; int got = 0; int ret = 0; - if (!rw_ctx) { - /* caller of readpages does not hold buffer and read caps - * (fadvise, madvise and readahead cases) */ - int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, - true, &got); - if (ret < 0) { - dout("start_read %p, error getting cap\n", inode); - } else if (!(got & want)) { - dout("start_read %p, no cache cap\n", inode); - ret = 0; - } - if (ret <= 0) { - if (got) - ceph_put_cap_refs(ci, got); - while (!list_empty(page_list)) { - page = lru_to_page(page_list); - list_del(&page->lru); - put_page(page); - } - return ret; - } - } - - off = (u64) page_offset(page); - - /* count pages */ - next_index = page->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index != next_index) - break; - nr_pages++; - next_index++; - if (max && nr_pages == max) - break; - } - len = nr_pages << PAGE_SHIFT; - dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, - off, len); - vino = ceph_vino(inode); - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, - 0, 1, CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ, NULL, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - goto out; - } - - /* build page vector */ - nr_pages = calc_pages_for(0, len); - pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out_put; - } - for (i = 0; i < nr_pages; ++i) { - page = list_entry(page_list->prev, struct page, lru); - BUG_ON(PageLocked(page)); - list_del(&page->lru); - - dout("start_read %p adding %p idx %lu\n", inode, page, - page->index); - if (add_to_page_cache_lru(page, &inode->i_data, page->index, - GFP_KERNEL)) { - put_page(page); - dout("start_read %p add_to_page_cache failed %p\n", - inode, page); - nr_pages = i; - if (nr_pages > 0) { - len = nr_pages << PAGE_SHIFT; - osd_req_op_extent_update(req, 0, len); - break; - } - goto out_pages; - } - pages[i] = page; - } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); - req->r_callback = finish_read; - req->r_inode = inode; - - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); - ret = ceph_osdc_start_request(osdc, req, false); - if (ret < 0) - goto out_pages; - ceph_osdc_put_request(req); - - /* After adding locked pages to page cache, the inode holds cache cap. - * So we can drop our cap refs. */ - if (got) - ceph_put_cap_refs(ci, got); - - return nr_pages; - -out_pages: - for (i = 0; i < nr_pages; ++i) - unlock_page(pages[i]); - ceph_put_page_vector(pages, nr_pages, false); -out_put: - ceph_osdc_put_request(req); -out: - if (got) - ceph_put_cap_refs(ci, got); - return ret; -} - - -/* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. - */ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) -{ - struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_file_info *fi = file->private_data; - struct ceph_rw_context *rw_ctx; - int rc = 0; - int max = 0; - if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) - return -EINVAL; + return; rw_ctx = ceph_find_rw_context(fi); - max = fsc->mount_options->rsize >> PAGE_SHIFT; - dout("readpages %p file %p ctx %p nr_pages %d max %d\n", - inode, file, rw_ctx, nr_pages, max); - while (!list_empty(page_list)) { - rc = start_read(inode, rw_ctx, page_list, max); - if (rc < 0) - goto out; + if (!rw_ctx) { + /* + * readahead callers do not necessarily hold Fcb caps + * (e.g. fadvise, madvise). + */ + int want = CEPH_CAP_FILE_CACHE; + + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); + if (ret < 0) + dout("start_read %p, error getting cap\n", inode); + else if (!(got & want)) + dout("start_read %p, no cache cap\n", inode); + + if (ret <= 0) + return; } -out: - dout("readpages %p file %p ret %d\n", inode, file, rc); - return rc; + netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got); } struct ceph_writeback_ctl @@ -1497,7 +1329,7 @@ static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations ceph_aops = { .readpage = ceph_readpage, - .readpages = ceph_readpages, + .readahead = ceph_readahead, .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, From fcaddb1d851bf69c94b3046227341d9684e276b1 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 5 Mar 2021 03:59:23 -0600 Subject: [PATCH 0695/1379] ceph: fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a couple of warnings by explicitly adding a break and a goto statements instead of just letting the code fall through to the next case. URL: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 83d9358854fb..3e575656713e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -631,10 +631,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_CUR: offset += file->f_pos; + break; case SEEK_SET: break; case SEEK_END: retval = -EOPNOTSUPP; + goto out; default: goto out; } From d3c51ae1b8cce5bdaf91a1ce32b33cf5626075dc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Mar 2021 07:38:01 -0500 Subject: [PATCH 0696/1379] ceph: don't clobber i_snap_caps on non-I_NEW inode We want the snapdir to mirror the non-snapped directory's attributes for most things, but i_snap_caps represents the caps granted on the snapshot directory by the MDS itself. A misbehaving MDS could issue different caps for the snapdir and we lose them here. Only reset i_snap_caps when the inode is I_NEW. Also, move the setting of i_op and i_fop inside the if block since they should never change anyway. Reported-by: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4418d4be2907..2fd1c48ac5d7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -87,14 +87,15 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_mtime = parent->i_mtime; inode->i_ctime = parent->i_ctime; inode->i_atime = parent->i_atime; - inode->i_op = &ceph_snapdir_iops; - inode->i_fop = &ceph_snapdir_fops; - ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; ci->i_btime = ceph_inode(parent)->i_btime; - if (inode->i_state & I_NEW) + if (inode->i_state & I_NEW) { + inode->i_op = &ceph_snapdir_iops; + inode->i_fop = &ceph_snapdir_fops; + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ unlock_new_inode(inode); + } return inode; } From aa60cfc3f7ee32766766f71e6bfbea963b4f94bc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Mar 2021 08:01:54 -0500 Subject: [PATCH 0697/1379] ceph: don't use d_add in ceph_handle_snapdir It's possible ceph_get_snapdir could end up finding a (disconnected) inode that already exists in the cache. Change the prototype for ceph_handle_snapdir to return a dentry pointer and have it use d_splice_alias so we don't end up with an aliased dentry in the cache. Reported-by: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 30 +++++++++++++++++++----------- fs/ceph/file.c | 7 +++++-- fs/ceph/super.h | 2 +- 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3e575656713e..5624fae7a603 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -667,8 +667,8 @@ out: /* * Handle lookups for the hidden .snap directory. */ -int ceph_handle_snapdir(struct ceph_mds_request *req, - struct dentry *dentry, int err) +struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ @@ -676,16 +676,17 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, /* .snap dir? */ if (err == -ENOENT && ceph_snap(parent) == CEPH_NOSNAP && - strcmp(dentry->d_name.name, - fsc->mount_options->snapdir_name) == 0) { + strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { + struct dentry *res; struct inode *inode = ceph_get_snapdir(parent); - dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n", - dentry, dentry, inode); - BUG_ON(!d_unhashed(dentry)); - d_add(dentry, inode); - err = 0; + + res = d_splice_alias(inode, dentry); + dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n", + dentry, dentry, inode, res); + if (res) + dentry = res; } - return err; + return dentry; } /* @@ -741,6 +742,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; + struct dentry *res; int op; int mask; int err; @@ -791,7 +793,13 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); - err = ceph_handle_snapdir(req, dentry, err); + res = ceph_handle_snapdir(req, dentry, err); + if (IS_ERR(res)) { + err = PTR_ERR(res); + } else { + dentry = res; + err = 0; + } dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ dout("lookup result=%p\n", dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 209535d5b8d3..a6ef1d143308 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -739,9 +739,12 @@ retry: err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); - err = ceph_handle_snapdir(req, dentry, err); - if (err) + dentry = ceph_handle_snapdir(req, dentry, err); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); goto out_req; + } + err = 0; if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 188565d806b2..07a3fb52ae30 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1193,7 +1193,7 @@ extern const struct dentry_operations ceph_dentry_ops; extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); -extern int ceph_handle_snapdir(struct ceph_mds_request *req, +extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); From 379fc7fad0ae6ed5ceefd39b8a7a37e83a63c25e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 23 Mar 2021 15:16:52 -0400 Subject: [PATCH 0698/1379] ceph: use attach/detach_page_private for tracking snap context There is some ambiguity around the use of PagePrivate. It's generally expected in core code that if PagePrivate is set then you have a reference to it. It's not clear that ceph always does (and I believe it may not). Change ceph to use attach/detach_page_private so that we keep a reference to the page until the snap context is detached. Link: https://lore.kernel.org/ceph-devel/2503810.1616508988@warthog.procyon.org.uk/T/#mf29e5abbb0ec8035cde0de30778690de7d956f84 Reported-by: David Howells Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index cb88ba91b671..72c0901b6d15 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -128,8 +128,7 @@ static int ceph_set_page_dirty(struct page *page) * PagePrivate so that we get invalidatepage callback. */ BUG_ON(PagePrivate(page)); - page->private = (unsigned long)snapc; - SetPagePrivate(page); + attach_page_private(page, snapc); ret = __set_page_dirty_nobuffers(page); WARN_ON(!PageLocked(page)); @@ -148,7 +147,7 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_snap_context *snapc; wait_on_page_fscache(page); @@ -168,10 +167,9 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, dout("%p invalidatepage %p idx %lu full dirty page\n", inode, page, page->index); + snapc = detach_page_private(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); - page->private = 0; - ClearPagePrivate(page); } static int ceph_releasepage(struct page *page, gfp_t gfp) @@ -589,8 +587,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage cleaned page %p\n", page); err = 0; /* vfs expects us to return 0 */ } - page->private = 0; - ClearPagePrivate(page); + oldest = detach_page_private(page); + WARN_ON_ONCE(oldest != snapc); end_page_writeback(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ @@ -682,11 +680,9 @@ static void writepages_finish(struct ceph_osd_request *req) clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - ceph_put_snap_context(page_snap_context(page)); - page->private = 0; - ClearPagePrivate(page); - dout("unlocking %p\n", page); + ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); + dout("unlocking %p\n", page); if (remove_page) generic_error_remove_page(inode->i_mapping, From 54b026b456d08dfb6f19d37ae07b809004dc4b57 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 23 Mar 2021 12:18:19 -0400 Subject: [PATCH 0699/1379] ceph: fix kerneldoc copypasta over ceph_start_io_direct Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/io.c b/fs/ceph/io.c index 97602ea92ff4..c456509b31c3 100644 --- a/fs/ceph/io.c +++ b/fs/ceph/io.c @@ -118,7 +118,7 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) } /** - * ceph_end_io_direct - declare the file is being used for direct i/o + * ceph_start_io_direct - declare the file is being used for direct i/o * @inode: file inode * * Declare that a direct I/O operation is about to start, and ensure From 8ae99ae2b40766a73026d5793942b4fea6d9ed31 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 22 Mar 2021 20:28:49 +0800 Subject: [PATCH 0700/1379] ceph: rename the metric helpers Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 8 ++++---- fs/ceph/debugfs.c | 12 ++++++------ fs/ceph/file.c | 12 ++++++------ fs/ceph/mds_client.c | 2 +- fs/ceph/metric.c | 24 ++++++++++++------------ fs/ceph/metric.h | 12 ++++++------ 6 files changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 72c0901b6d15..bc9864524fde 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -225,7 +225,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) int num_pages; int err = req->r_result; - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, err); dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result, @@ -559,7 +559,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (!err) err = ceph_osdc_wait_request(osdc, req); - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, err); ceph_osdc_put_request(req); @@ -647,7 +647,7 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_clear_error_write(ci); } - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, rc); /* @@ -1716,7 +1716,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, err); out_put: diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 66989c880adb..425f3356332a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -162,34 +162,34 @@ static int metric_show(struct seq_file *s, void *p) seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); seq_printf(s, "-----------------------------------------------------------------------------------\n"); - spin_lock(&m->read_latency_lock); + spin_lock(&m->read_metric_lock); total = m->total_reads; sum = m->read_latency_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->read_latency_min; max = m->read_latency_max; sq = m->read_latency_sq_sum; - spin_unlock(&m->read_latency_lock); + spin_unlock(&m->read_metric_lock); CEPH_METRIC_SHOW("read", total, avg, min, max, sq); - spin_lock(&m->write_latency_lock); + spin_lock(&m->write_metric_lock); total = m->total_writes; sum = m->write_latency_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->write_latency_min; max = m->write_latency_max; sq = m->write_latency_sq_sum; - spin_unlock(&m->write_latency_lock); + spin_unlock(&m->write_metric_lock); CEPH_METRIC_SHOW("write", total, avg, min, max, sq); - spin_lock(&m->metadata_latency_lock); + spin_lock(&m->metadata_metric_lock); total = m->total_metadatas; sum = m->metadata_latency_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->metadata_latency_min; max = m->metadata_latency_max; sq = m->metadata_latency_sq_sum; - spin_unlock(&m->metadata_latency_lock); + spin_unlock(&m->metadata_metric_lock); CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq); seq_printf(s, "\n"); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a6ef1d143308..a27aabcb0e0b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -895,7 +895,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, if (!ret) ret = ceph_osdc_wait_request(osdc, req); - ceph_update_read_latency(&fsc->mdsc->metric, + ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, ret); @@ -1040,10 +1040,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) /* r_start_latency == 0 means the request was not submitted */ if (req->r_start_latency) { if (aio_req->write) - ceph_update_write_latency(metric, req->r_start_latency, + ceph_update_write_metrics(metric, req->r_start_latency, req->r_end_latency, rc); else - ceph_update_read_latency(metric, req->r_start_latency, + ceph_update_read_metrics(metric, req->r_start_latency, req->r_end_latency, rc); } @@ -1293,10 +1293,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ret = ceph_osdc_wait_request(&fsc->client->osdc, req); if (write) - ceph_update_write_latency(metric, req->r_start_latency, + ceph_update_write_metrics(metric, req->r_start_latency, req->r_end_latency, ret); else - ceph_update_read_latency(metric, req->r_start_latency, + ceph_update_read_metrics(metric, req->r_start_latency, req->r_end_latency, ret); size = i_size_read(inode); @@ -1470,7 +1470,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, ret); out: ceph_osdc_put_request(req); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d87bd852ed96..73ecb7d128c9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3306,7 +3306,7 @@ out_err: /* kick calling process */ complete_request(mdsc, req); - ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency, + ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency, req->r_end_latency, err); out: ceph_mdsc_put_request(req); diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 5ec94bd4c1de..75d309f2fb0c 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -183,21 +183,21 @@ int ceph_metric_init(struct ceph_client_metric *m) if (ret) goto err_i_caps_mis; - spin_lock_init(&m->read_latency_lock); + spin_lock_init(&m->read_metric_lock); m->read_latency_sq_sum = 0; m->read_latency_min = KTIME_MAX; m->read_latency_max = 0; m->total_reads = 0; m->read_latency_sum = 0; - spin_lock_init(&m->write_latency_lock); + spin_lock_init(&m->write_metric_lock); m->write_latency_sq_sum = 0; m->write_latency_min = KTIME_MAX; m->write_latency_max = 0; m->total_writes = 0; m->write_latency_sum = 0; - spin_lock_init(&m->metadata_latency_lock); + spin_lock_init(&m->metadata_metric_lock); m->metadata_latency_sq_sum = 0; m->metadata_latency_min = KTIME_MAX; m->metadata_latency_max = 0; @@ -274,7 +274,7 @@ static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, *sq_sump += sq; } -void ceph_update_read_latency(struct ceph_client_metric *m, +void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc) { @@ -283,14 +283,14 @@ void ceph_update_read_latency(struct ceph_client_metric *m, if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) return; - spin_lock(&m->read_latency_lock); + spin_lock(&m->read_metric_lock); __update_latency(&m->total_reads, &m->read_latency_sum, &m->read_latency_min, &m->read_latency_max, &m->read_latency_sq_sum, lat); - spin_unlock(&m->read_latency_lock); + spin_unlock(&m->read_metric_lock); } -void ceph_update_write_latency(struct ceph_client_metric *m, +void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc) { @@ -299,14 +299,14 @@ void ceph_update_write_latency(struct ceph_client_metric *m, if (unlikely(rc && rc != -ETIMEDOUT)) return; - spin_lock(&m->write_latency_lock); + spin_lock(&m->write_metric_lock); __update_latency(&m->total_writes, &m->write_latency_sum, &m->write_latency_min, &m->write_latency_max, &m->write_latency_sq_sum, lat); - spin_unlock(&m->write_latency_lock); + spin_unlock(&m->write_metric_lock); } -void ceph_update_metadata_latency(struct ceph_client_metric *m, +void ceph_update_metadata_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc) { @@ -315,9 +315,9 @@ void ceph_update_metadata_latency(struct ceph_client_metric *m, if (unlikely(rc && rc != -ENOENT)) return; - spin_lock(&m->metadata_latency_lock); + spin_lock(&m->metadata_metric_lock); __update_latency(&m->total_metadatas, &m->metadata_latency_sum, &m->metadata_latency_min, &m->metadata_latency_max, &m->metadata_latency_sq_sum, lat); - spin_unlock(&m->metadata_latency_lock); + spin_unlock(&m->metadata_metric_lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index af6038ff39d4..57b5f0ec38be 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -108,21 +108,21 @@ struct ceph_client_metric { struct percpu_counter i_caps_hit; struct percpu_counter i_caps_mis; - spinlock_t read_latency_lock; + spinlock_t read_metric_lock; u64 total_reads; ktime_t read_latency_sum; ktime_t read_latency_sq_sum; ktime_t read_latency_min; ktime_t read_latency_max; - spinlock_t write_latency_lock; + spinlock_t write_metric_lock; u64 total_writes; ktime_t write_latency_sum; ktime_t write_latency_sq_sum; ktime_t write_latency_min; ktime_t write_latency_max; - spinlock_t metadata_latency_lock; + spinlock_t metadata_metric_lock; u64 total_metadatas; ktime_t metadata_latency_sum; ktime_t metadata_latency_sq_sum; @@ -162,13 +162,13 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m) percpu_counter_inc(&m->i_caps_mis); } -extern void ceph_update_read_latency(struct ceph_client_metric *m, +extern void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); -extern void ceph_update_write_latency(struct ceph_client_metric *m, +extern void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); -extern void ceph_update_metadata_latency(struct ceph_client_metric *m, +extern void ceph_update_metadata_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); #endif /* _FS_CEPH_MDS_METRIC_H */ From fbd47ddc5e887571ee39f0d6b47c6155f2257f55 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 22 Mar 2021 20:28:51 +0800 Subject: [PATCH 0701/1379] ceph: avoid counting the same request twice or more If the request will retry, skip updating the latency metric. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a27aabcb0e0b..31542eac7e59 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1037,16 +1037,6 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, osd_data->bvec_pos.iter.bi_size); - /* r_start_latency == 0 means the request was not submitted */ - if (req->r_start_latency) { - if (aio_req->write) - ceph_update_write_metrics(metric, req->r_start_latency, - req->r_end_latency, rc); - else - ceph_update_read_metrics(metric, req->r_start_latency, - req->r_end_latency, rc); - } - if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; BUG_ON(!aio_req->write); @@ -1089,6 +1079,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } } + /* r_start_latency == 0 means the request was not submitted */ + if (req->r_start_latency) { + if (aio_req->write) + ceph_update_write_metrics(metric, req->r_start_latency, + req->r_end_latency, rc); + else + ceph_update_read_metrics(metric, req->r_start_latency, + req->r_end_latency, rc); + } + put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, aio_req->should_dirty); ceph_osdc_put_request(req); From 3d8b6987a276f4292b5b71f4df8fe34129ab9e5d Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Wed, 24 Mar 2021 13:08:25 +0800 Subject: [PATCH 0702/1379] ceph: send opened files/pinned caps/opened inodes metrics to MDS daemon For the old ceph version, if it received this metric info, it will just ignore them. URL: https://tracker.ceph.com/issues/46866 Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/metric.c | 38 +++++++++++++++++++++++++++++++++++++- fs/ceph/metric.h | 44 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 75d309f2fb0c..28b6b42ad677 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -17,6 +17,9 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_metric_write_latency *write; struct ceph_metric_metadata_latency *meta; struct ceph_metric_dlease *dlease; + struct ceph_opened_files *files; + struct ceph_pinned_icaps *icaps; + struct ceph_opened_inodes *inodes; struct ceph_client_metric *m = &mdsc->metric; u64 nr_caps = atomic64_read(&m->total_caps); struct ceph_msg *msg; @@ -26,7 +29,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, s32 len; len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) - + sizeof(*meta) + sizeof(*dlease); + + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) + + sizeof(*icaps) + sizeof(*inodes); msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { @@ -95,6 +99,38 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); items++; + sum = percpu_counter_sum(&m->total_inodes); + + /* encode the opened files metric */ + files = (struct ceph_opened_files *)(dlease + 1); + files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); + files->ver = 1; + files->compat = 1; + files->data_len = cpu_to_le32(sizeof(*files) - 10); + files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files)); + files->total = cpu_to_le64(sum); + items++; + + /* encode the pinned icaps metric */ + icaps = (struct ceph_pinned_icaps *)(files + 1); + icaps->type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); + icaps->ver = 1; + icaps->compat = 1; + icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10); + icaps->pinned_icaps = cpu_to_le64(nr_caps); + icaps->total = cpu_to_le64(sum); + items++; + + /* encode the opened inodes metric */ + inodes = (struct ceph_opened_inodes *)(icaps + 1); + inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); + inodes->ver = 1; + inodes->compat = 1; + inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10); + inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes)); + inodes->total = cpu_to_le64(sum); + items++; + put_unaligned_le32(items, &head->num); msg->front.iov_len = len; msg->hdr.version = cpu_to_le16(1); diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 57b5f0ec38be..e984eb2bb14b 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -14,8 +14,11 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_WRITE_LATENCY, CLIENT_METRIC_TYPE_METADATA_LATENCY, CLIENT_METRIC_TYPE_DENTRY_LEASE, + CLIENT_METRIC_TYPE_OPENED_FILES, + CLIENT_METRIC_TYPE_PINNED_ICAPS, + CLIENT_METRIC_TYPE_OPENED_INODES, - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE, + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES, }; /* @@ -28,6 +31,9 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_WRITE_LATENCY, \ CLIENT_METRIC_TYPE_METADATA_LATENCY, \ CLIENT_METRIC_TYPE_DENTRY_LEASE, \ + CLIENT_METRIC_TYPE_OPENED_FILES, \ + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ + CLIENT_METRIC_TYPE_OPENED_INODES, \ \ CLIENT_METRIC_TYPE_MAX, \ } @@ -94,6 +100,42 @@ struct ceph_metric_dlease { __le64 total; } __packed; +/* metric opened files header */ +struct ceph_opened_files { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(opened_files + total) */ + __le64 opened_files; + __le64 total; +} __packed; + +/* metric pinned i_caps header */ +struct ceph_pinned_icaps { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(pinned_icaps + total) */ + __le64 pinned_icaps; + __le64 total; +} __packed; + +/* metric opened inodes header */ +struct ceph_opened_inodes { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(opened_inodes + total) */ + __le64 opened_inodes; + __le64 total; +} __packed; + struct ceph_metric_head { __le32 num; /* the number of metrics that will be sent */ } __packed; From e9b2250156c381b0973ea6ec3890fe8706426ecc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 26 Jan 2021 11:49:54 -0500 Subject: [PATCH 0703/1379] ceph: only check pool permissions for regular files There is no need to do a ceph_pool_perm_check() on anything that isn't a regular file, as the MDS is what handles talking to the OSD in those cases. Just return 0 if it's not a regular file. Reported-by: Luis Henriques Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index bc9864524fde..aa612bc8a559 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1941,6 +1941,10 @@ int ceph_pool_perm_check(struct inode *inode, int need) s64 pool; int ret, flags; + /* Only need to do this for regular files */ + if (!S_ISREG(inode->i_mode)) + return 0; + if (ci->i_vino.snap != CEPH_NOSNAP) { /* * Pool permission check needs to write to the first object. From 1775c7ddacfcea29051c67409087578f8f4d751b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 26 Mar 2021 09:21:53 -0400 Subject: [PATCH 0704/1379] ceph: fix inode leak on getattr error in __fh_to_dentry Fixes: 878dabb64117 ("ceph: don't return -ESTALE if there's still an open file") Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e088843a7734..baa6368bece5 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -178,8 +178,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_CAST(inode); /* We need LINK caps to reliably check i_nlink */ err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false); - if (err) + if (err) { + iput(inode); return ERR_PTR(err); + } /* -ESTALE if inode as been unlinked and no file is open */ if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { iput(inode); From e72968e15b297a51dcefe93a95e875dcefe6c4aa Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 5 Apr 2021 12:19:35 -0400 Subject: [PATCH 0705/1379] ceph: drop pinned_page parameter from ceph_get_caps All of the existing callers that don't set this to NULL just drop the page reference at some arbitrary point later in processing. There's no point in keeping a page reference that we don't use, so just drop the reference immediately after checking the Uptodate flag. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 9 ++------- fs/ceph/caps.c | 11 +++++------ fs/ceph/file.c | 17 +++++------------ fs/ceph/super.h | 2 +- 4 files changed, 13 insertions(+), 26 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index aa612bc8a559..ba0b9193b45c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1357,7 +1357,6 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; - struct page *pinned_page = NULL; loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; int want, got, err; sigset_t oldset; @@ -1373,8 +1372,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; got = 0; - err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, - &got, &pinned_page); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got); if (err < 0) goto out_restore; @@ -1393,8 +1391,6 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) } else err = -EAGAIN; - if (pinned_page) - put_page(pinned_page); ceph_put_cap_refs(ci, got); if (err != -EAGAIN) @@ -1488,8 +1484,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, - &got, NULL); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got); if (err < 0) goto out_free; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e12e4cdefac1..010621d095f5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2855,8 +2855,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct file *filp, int need, int want, - loff_t endoff, int *got, struct page **pinned_page) +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) { struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); @@ -2954,11 +2953,11 @@ int ceph_get_caps(struct file *filp, int need, int want, struct page *page = find_get_page(inode->i_mapping, 0); if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - break; - } + bool uptodate = PageUptodate(page); + put_page(page); + if (uptodate) + break; } /* * drop cap refs first because getattr while diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 31542eac7e59..77fc037d5beb 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1513,7 +1513,6 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) size_t len = iov_iter_count(to); struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct page *pinned_page = NULL; bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; int want, got = 0; @@ -1532,8 +1531,7 @@ again: want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, - &got, &pinned_page); + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { if (iocb->ki_flags & IOCB_DIRECT) ceph_end_io_direct(inode); @@ -1574,10 +1572,6 @@ again: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); - if (pinned_page) { - put_page(pinned_page); - pinned_page = NULL; - } ceph_put_cap_refs(ci, got); if (direct_lock) @@ -1756,8 +1750,7 @@ retry_snap: else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, - &got, NULL); + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) goto out; @@ -2086,7 +2079,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got); if (ret < 0) goto unlock; @@ -2124,7 +2117,7 @@ static int get_rd_wr_caps(struct file *src_filp, int *src_got, retry_caps: ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, - dst_endoff, dst_got, NULL); + dst_endoff, dst_got); if (ret < 0) return ret; @@ -2146,7 +2139,7 @@ retry_caps: return ret; } ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, - CEPH_CAP_FILE_SHARED, -1, src_got, NULL); + CEPH_CAP_FILE_SHARED, -1, src_got); if (ret < 0) return ret; /*... drop src_ci caps too, and retry */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 07a3fb52ae30..68dd6a8520dd 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1156,7 +1156,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, int mds, int drop, int unless); extern int ceph_get_caps(struct file *filp, int need, int want, - loff_t endoff, int *got, struct page **pinned_page); + loff_t endoff, int *got); extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); From e7f72952508ac4354f9bec0607ac8a200d050e65 Mon Sep 17 00:00:00 2001 From: Yanhu Cao Date: Fri, 28 Aug 2020 09:28:44 +0800 Subject: [PATCH 0706/1379] ceph: support getting ceph.dir.rsnaps vxattr Add support for grabbing the rsnaps value out of the inode info in traces, and exposing that via ceph.dir.rsnaps xattr. Signed-off-by: Yanhu Cao Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 1 + fs/ceph/mds_client.c | 9 ++++++++- fs/ceph/mds_client.h | 1 + fs/ceph/super.h | 2 +- fs/ceph/xattr.c | 7 +++++++ 5 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2fd1c48ac5d7..d6ff9b4585fa 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -895,6 +895,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); ci->i_dir_pin = iinfo->dir_pin; + ci->i_rsnaps = iinfo->rsnaps; ceph_decode_timespec64(&ci->i_rctime, &info->rctime); } } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 73ecb7d128c9..5c50f7986404 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -176,6 +176,13 @@ static int parse_reply_info_in(void **p, void *end, memset(&info->snap_btime, 0, sizeof(info->snap_btime)); } + /* snapshot count, remains zero for v<=3 */ + if (struct_v >= 4) { + ceph_decode_64_safe(p, end, info->rsnaps, bad); + } else { + info->rsnaps = 0; + } + *p = end; } else { if (features & CEPH_FEATURE_MDS_INLINE_DATA) { @@ -214,7 +221,7 @@ static int parse_reply_info_in(void **p, void *end, } info->dir_pin = -ENODATA; - /* info->snap_btime remains zero */ + /* info->snap_btime and info->rsnaps remain zero */ } return 0; bad: diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index eaa7c5422116..15c11a0f2caf 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -88,6 +88,7 @@ struct ceph_mds_reply_info_in { s32 dir_pin; struct ceph_timespec btime; struct ceph_timespec snap_btime; + u64 rsnaps; u64 change_attr; }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 68dd6a8520dd..df0851b9240e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -334,7 +334,7 @@ struct ceph_inode_info { /* for dirs */ struct timespec64 i_rctime; - u64 i_rbytes, i_rfiles, i_rsubdirs; + u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps; u64 i_files, i_subdirs; /* quotas */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 02f59bcb4f27..1242db8d3444 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -233,6 +233,12 @@ static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs); } +static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps); +} + static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, size_t size) { @@ -384,6 +390,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rentries), XATTR_RSTAT_FIELD(dir, rfiles), XATTR_RSTAT_FIELD(dir, rsubdirs), + XATTR_RSTAT_FIELD(dir, rsnaps), XATTR_RSTAT_FIELD(dir, rbytes), XATTR_RSTAT_FIELD(dir, rctime), { From 8ff2d290c8ce77c8e30d9b08c13d87cd5688d7e1 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 5 Apr 2021 10:40:56 -0400 Subject: [PATCH 0707/1379] ceph: convert some PAGE_SIZE invocations to thp_size() Start preparing to allow the use of THPs in the pagecache with ceph by making it use thp_size() in lieu of PAGE_SIZE in the appropriate places. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 51 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ba0b9193b45c..c1570fada3d8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -154,7 +154,7 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, inode = page->mapping->host; ci = ceph_inode(inode); - if (offset != 0 || length != PAGE_SIZE) { + if (offset != 0 || length != thp_size(page)) { dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", inode, page, page->index, offset, length); return; @@ -331,7 +331,7 @@ static int ceph_readpage(struct file *file, struct page *page) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_vino vino = ceph_vino(inode); u64 off = page_offset(page); - u64 len = PAGE_SIZE; + u64 len = thp_size(page); if (ci->i_inline_version != CEPH_INLINE_NONE) { /* @@ -342,7 +342,7 @@ static int ceph_readpage(struct file *file, struct page *page) unlock_page(page); return -EINVAL; } - zero_user_segment(page, 0, PAGE_SIZE); + zero_user_segment(page, 0, thp_size(page)); SetPageUptodate(page); unlock_page(page); return 0; @@ -477,8 +477,8 @@ static u64 get_writepages_data_length(struct inode *inode, spin_unlock(&ci->i_ceph_lock); WARN_ON(!found); } - if (end > page_offset(page) + PAGE_SIZE) - end = page_offset(page) + PAGE_SIZE; + if (end > page_offset(page) + thp_size(page)) + end = page_offset(page) + thp_size(page); return end > start ? end - start : 0; } @@ -496,7 +496,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); int err; - loff_t len = PAGE_SIZE; + loff_t len = thp_size(page); struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; @@ -524,7 +524,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) /* is this a partial page at end of file? */ if (page_off >= ceph_wbc.i_size) { dout("%p page eof %llu\n", page, ceph_wbc.i_size); - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); + page->mapping->a_ops->invalidatepage(page, 0, thp_size(page)); return 0; } @@ -550,7 +550,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } /* it may be a short write due to an object boundary */ - WARN_ON_ONCE(len > PAGE_SIZE); + WARN_ON_ONCE(len > thp_size(page)); osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); @@ -839,7 +839,7 @@ get_more_pages: page_offset(page) >= i_size_read(inode)) && clear_page_dirty_for_io(page)) mapping->a_ops->invalidatepage(page, - 0, PAGE_SIZE); + 0, thp_size(page)); unlock_page(page); continue; } @@ -928,7 +928,7 @@ get_more_pages: pages[locked_pages++] = page; pvec.pages[i] = NULL; - len += PAGE_SIZE; + len += thp_size(page); } /* did we get anything? */ @@ -977,7 +977,7 @@ new_request: BUG_ON(IS_ERR(req)); } BUG_ON(len < page_offset(pages[locked_pages - 1]) + - PAGE_SIZE - offset); + thp_size(page) - offset); req->r_callback = writepages_finish; req->r_inode = inode; @@ -1007,7 +1007,7 @@ new_request: } set_page_writeback(pages[i]); - len += PAGE_SIZE; + len += thp_size(page); } if (ceph_wbc.size_stable) { @@ -1016,7 +1016,7 @@ new_request: /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ - u64 min_len = len + 1 - PAGE_SIZE; + u64 min_len = len + 1 - thp_size(page); len = get_writepages_data_length(inode, pages[i - 1], offset); len = max(len, min_len); @@ -1253,7 +1253,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, } goto out; } - zero_user_segment(page, 0, PAGE_SIZE); + zero_user_segment(page, 0, thp_size(page)); SetPageUptodate(page); goto out; } @@ -1364,8 +1364,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ceph_block_sigs(&oldset); - dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", - inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); + dout("filemap_fault %p %llx.%llx %llu trying to get caps\n", + inode, ceph_vinop(inode), off); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else @@ -1376,8 +1376,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) if (err < 0) goto out_restore; - dout("filemap_fault %p %llu~%zd got cap refs on %s\n", - inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); + dout("filemap_fault %p %llu got cap refs on %s\n", + inode, off, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || ci->i_inline_version == CEPH_INLINE_NONE) { @@ -1385,9 +1385,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); ceph_del_rw_context(fi, &rw_ctx); - dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n", - inode, off, (size_t)PAGE_SIZE, - ceph_cap_string(got), ret); + dout("filemap_fault %p %llu drop cap refs %s ret %x\n", + inode, off, ceph_cap_string(got), ret); } else err = -EAGAIN; @@ -1425,8 +1424,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: - dout("filemap_fault %p %llu~%zd read inline data ret %x\n", - inode, off, (size_t)PAGE_SIZE, ret); + dout("filemap_fault %p %llu read inline data ret %x\n", + inode, off, ret); } out_restore: ceph_restore_sigs(&oldset); @@ -1471,10 +1470,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) goto out_free; } - if (off + PAGE_SIZE <= size) - len = PAGE_SIZE; + if (off + thp_size(page) <= size) + len = thp_size(page); else - len = size & ~PAGE_MASK; + len = offset_in_thp(page, size); dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", inode, ceph_vinop(inode), off, len, size); From 2d6795fbb8c34ed5eb44db2a99960614424585f8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 Apr 2021 15:58:35 -0400 Subject: [PATCH 0708/1379] ceph: fix up some bare fetches of i_size We need to use i_size_read(), which properly handles the torn read case on 32-bit arches. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 6 +++--- fs/ceph/inode.c | 22 +++++++++++----------- fs/ceph/mds_client.c | 2 +- fs/ceph/snap.c | 2 +- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 010621d095f5..d2f955a14429 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1390,7 +1390,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; - arg->size = inode->i_size; + arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; if (cap == ci->i_auth_cap) { @@ -1885,7 +1885,7 @@ static int try_nonblocking_invalidate(struct inode *inode) bool __ceph_should_report_size(struct ceph_inode_info *ci) { - loff_t size = ci->vfs_inode.i_size; + loff_t size = i_size_read(&ci->vfs_inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; @@ -3299,7 +3299,7 @@ static void handle_cap_grant(struct inode *inode, dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, - inode->i_size); + i_size_read(inode)); /* diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d6ff9b4585fa..8b96370306cf 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -617,10 +617,11 @@ int ceph_fill_file_size(struct inode *inode, int issued, { struct ceph_inode_info *ci = ceph_inode(inode); int queue_trunc = 0; + loff_t isize = i_size_read(inode); if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || - (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { - dout("size %lld -> %llu\n", inode->i_size, size); + (truncate_seq == ci->i_truncate_seq && size > isize)) { + dout("size %lld -> %llu\n", isize, size); if (size > 0 && S_ISDIR(inode->i_mode)) { pr_err("fill_file_size non-zero size for directory\n"); size = 0; @@ -1789,7 +1790,7 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) bool ret; spin_lock(&ci->i_ceph_lock); - dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); + dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); i_size_write(inode, size); inode->i_blocks = calc_inode_blocks(size); @@ -2096,20 +2097,19 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } if (ia_valid & ATTR_SIZE) { - dout("setattr %p size %lld -> %lld\n", inode, - inode->i_size, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && - attr->ia_size > inode->i_size) { + loff_t isize = i_size_read(inode); + + dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; ia_valid |= ATTR_MTIME; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - attr->ia_size != inode->i_size) { + attr->ia_size != isize) { req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = - cpu_to_le64(inode->i_size); + req->r_args.setattr.old_size = cpu_to_le64(isize); mask |= CEPH_SETATTR_SIZE; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; @@ -2219,7 +2219,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size > max(inode->i_size, fsc->max_file_size)) + attr->ia_size > max(i_size_read(inode), fsc->max_file_size)) return -EFBIG; if ((attr->ia_valid & ATTR_SIZE) && diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5c50f7986404..63b53098360c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3787,7 +3787,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); - rec.v1.size = cpu_to_le64(inode->i_size); + rec.v1.size = cpu_to_le64(i_size_read(inode)); ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0728b01d4d43..4ce18055d931 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -605,7 +605,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); BUG_ON(capsnap->writing); - capsnap->size = inode->i_size; + capsnap->size = i_size_read(inode); capsnap->mtime = inode->i_mtime; capsnap->atime = inode->i_atime; capsnap->ctime = inode->i_ctime; From d4f6b31d721779d91b5e2f8072478af73b196c34 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 1 Apr 2021 13:55:11 -0400 Subject: [PATCH 0709/1379] ceph: don't allow access to MDS-private inodes The MDS reserves a set of inodes for its own usage, and these should never be accessible to clients. Add a new helper to vet a proposed inode number against that range, and complain loudly and refuse to create or look it up if it's in it. Also, ensure that the MDS doesn't try to delegate inodes that are in that range or lower. Print a warning if it does, and don't save the range in the xarray. URL: https://tracker.ceph.com/issues/49922 Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 8 ++++++++ fs/ceph/inode.c | 3 +++ fs/ceph/mds_client.c | 7 +++++++ fs/ceph/super.h | 24 ++++++++++++++++++++++++ 4 files changed, 42 insertions(+) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index baa6368bece5..042bb4a02c0a 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -129,6 +129,10 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino) vino.ino = ino; vino.snap = CEPH_NOSNAP; + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-ESTALE); + inode = ceph_find_inode(sb, vino); if (!inode) { struct ceph_mds_request *req; @@ -214,6 +218,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, vino.ino = sfh->ino; vino.snap = sfh->snapid; } + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-ESTALE); + inode = ceph_find_inode(sb, vino); if (inode) return d_obtain_alias(inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8b96370306cf..8c6811e3682d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -56,6 +56,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) { struct inode *inode; + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-EREMOTEIO); + inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, ceph_set_ino_cb, &vino); if (!inode) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 63b53098360c..e5af591d3bd4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -440,6 +440,13 @@ static int ceph_parse_deleg_inos(void **p, void *end, ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, len, bad); + + /* Don't accept a delegation of system inodes */ + if (start < CEPH_INO_SYSTEM_BASE) { + pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", + start, len); + continue; + } while (len--) { int err = xa_insert(&s->s_delegated_inos, ino = start++, DELEGATED_INO_AVAILABLE, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index df0851b9240e..db80d89556b1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -529,10 +529,34 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) ci->i_vino.snap == pvino->snap; } +/* + * The MDS reserves a set of inodes for its own usage. These should never + * be accessible by clients, and so the MDS has no reason to ever hand these + * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE. + * + * These come from src/mds/mdstypes.h in the ceph sources. + */ +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 +#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) + +static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) +{ + if (vino.ino < CEPH_INO_SYSTEM_BASE && + vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { + WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); + return true; + } + return false; +} static inline struct inode *ceph_find_inode(struct super_block *sb, struct ceph_vino vino) { + if (ceph_vino_is_reserved(vino)) + return NULL; + /* * NB: The hashval will be run through the fs/inode.c hash function * anyway, so there is no need to squash the inode number down to From 7807dafda21a549403d922da98dde0ddfeb70d08 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 14 Apr 2021 10:38:40 +0200 Subject: [PATCH 0710/1379] libceph: bump CephXAuthenticate encoding version A dummy v3 encoding (exactly the same as v2) was introduced so that the monitors can distinguish broken clients that may not include their auth ticket in CEPHX_GET_AUTH_SESSION_KEY request on reconnects, thus failing to prove previous possession of their global_id (one part of CVE-2021-20288). The kernel client has always included its auth ticket, so it is compatible with enforcing mode as is. However we want to bump the encoding version to avoid having to authenticate twice on the initial connect -- all legacy (CephXAuthenticate < v3) are now forced do so in order to expose insecure global_id reclaim. Marking for stable since at least for 5.11 and 5.12 it is trivial (v2 -> v3). Cc: stable@vger.kernel.org # 5.11+ URL: https://tracker.ceph.com/issues/50452 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/auth_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index ca44c327bace..79641c4afee9 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -526,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, if (ret < 0) return ret; - auth->struct_v = 2; /* nautilus+ */ + auth->struct_v = 3; /* nautilus+ */ auth->key = 0; for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++) auth->key ^= *(__le64 *)u; From 61ca49a9105faefa003b37542cebad8722f8ae22 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 26 Apr 2021 19:11:37 +0200 Subject: [PATCH 0711/1379] libceph: don't set global_id until we get an auth ticket With the introduction of enforcing mode, setting global_id as soon as we get it in the first MAuth reply will result in EACCES if the connection is reset before we get the second MAuth reply containing an auth ticket -- because on retry we would attempt to reclaim that global_id with no auth ticket at hand. Neither ceph_auth_client nor ceph_mon_client depend on global_id being set ealy, so just delay the setting until we get and process the second MAuth reply. While at it, complain if the monitor sends a zero global_id or changes our global_id as the session is likely to fail after that. Cc: stable@vger.kernel.org # needs backporting for < 5.11 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/auth.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/net/ceph/auth.c b/net/ceph/auth.c index eb261aa5fe18..de407e8feb97 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -36,6 +36,20 @@ static int init_protocol(struct ceph_auth_client *ac, int proto) } } +static void set_global_id(struct ceph_auth_client *ac, u64 global_id) +{ + dout("%s global_id %llu\n", __func__, global_id); + + if (!global_id) + pr_err("got zero global_id\n"); + + if (ac->global_id && global_id != ac->global_id) + pr_err("global_id changed from %llu to %llu\n", ac->global_id, + global_id); + + ac->global_id = global_id; +} + /* * setup, teardown. */ @@ -222,11 +236,6 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, payload_end = payload + payload_len; - if (global_id && ac->global_id != global_id) { - dout(" set global_id %lld -> %lld\n", ac->global_id, global_id); - ac->global_id = global_id; - } - if (ac->negotiating) { /* server does not support our protocols? */ if (!protocol && result < 0) { @@ -253,11 +262,16 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ret = ac->ops->handle_reply(ac, result, payload, payload_end, NULL, NULL, NULL, NULL); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { ret = build_request(ac, true, reply_buf, reply_len); - else if (ret) + goto out; + } else if (ret) { pr_err("auth protocol '%s' mauth authentication failed: %d\n", ceph_auth_proto_name(ac->protocol), result); + goto out; + } + + set_global_id(ac, global_id); out: mutex_unlock(&ac->mutex); @@ -484,15 +498,11 @@ int ceph_auth_handle_reply_done(struct ceph_auth_client *ac, int ret; mutex_lock(&ac->mutex); - if (global_id && ac->global_id != global_id) { - dout("%s global_id %llu -> %llu\n", __func__, ac->global_id, - global_id); - ac->global_id = global_id; - } - ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, session_key, session_key_len, con_secret, con_secret_len); + if (!ret) + set_global_id(ac, global_id); mutex_unlock(&ac->mutex); return ret; } From e1d3f3268b0e512ceb811dd4765e476626bde71c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Fri, 16 Apr 2021 20:58:37 +0000 Subject: [PATCH 0712/1379] PCI/sysfs: Convert "config" to static attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "config" sysfs attribute allows access to either the legacy (PCI and PCI-X Mode 1) or the extended (PCI-X Mode 2 and PCIe) device configuration space. Previously it was dynamically created either when a device was added (for hot-added devices) or via a late_initcall (for devices present at boot): pci_bus_add_devices pci_bus_add_device pci_create_sysfs_dev_files if (!sysfs_initialized) return sysfs_create_bin_file # for hot-added devices pci_sysfs_init # late_initcall sysfs_initialized = 1 for_each_pci_dev(pdev) pci_create_sysfs_dev_files(pdev) # for devices present at boot And dynamically removed when the PCI device is stopped and removed: pci_stop_bus_device pci_stop_dev pci_remove_sysfs_dev_files sysfs_remove_bin_file This attribute does not need to be created or removed dynamically, so we can use a static attribute so the device model takes care of addition and removal automatically. Convert "config" to a static attribute and use the .is_bin_visible() callback to set the correct object size (either 256 bytes or 4 KiB) at runtime. The pci_sysfs_init() scheme was added in the pre-git era by https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/drivers/pci/pci-sysfs.c?id=f6d553444da2 [bhelgaas: commit log] Suggested-by: Oliver O'Halloran Link: https://lore.kernel.org/r/CAOSf1CHss03DBSDO4PmTtMp0tCEu5kScn704ZEwLKGXQzBfqaA@mail.gmail.com Link: https://lore.kernel.org/r/20210416205856.3234481-2-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 64 ++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index f8afd54ca3e1..dc14daf404f5 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -808,6 +808,29 @@ static ssize_t pci_write_config(struct file *filp, struct kobject *kobj, return count; } +static BIN_ATTR(config, 0644, pci_read_config, pci_write_config, 0); + +static struct bin_attribute *pci_dev_config_attrs[] = { + &bin_attr_config, + NULL, +}; + +static umode_t pci_dev_config_attr_is_visible(struct kobject *kobj, + struct bin_attribute *a, int n) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + + a->size = PCI_CFG_SPACE_SIZE; + if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) + a->size = PCI_CFG_SPACE_EXP_SIZE; + + return a->attr.mode; +} + +static const struct attribute_group pci_dev_config_attr_group = { + .bin_attrs = pci_dev_config_attrs, + .is_bin_visible = pci_dev_config_attr_is_visible, +}; #ifdef HAVE_PCI_LEGACY /** @@ -1284,26 +1307,6 @@ static ssize_t pci_read_rom(struct file *filp, struct kobject *kobj, return count; } -static const struct bin_attribute pci_config_attr = { - .attr = { - .name = "config", - .mode = 0644, - }, - .size = PCI_CFG_SPACE_SIZE, - .read = pci_read_config, - .write = pci_write_config, -}; - -static const struct bin_attribute pcie_config_attr = { - .attr = { - .name = "config", - .mode = 0644, - }, - .size = PCI_CFG_SPACE_EXP_SIZE, - .read = pci_read_config, - .write = pci_write_config, -}; - static ssize_t reset_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -1355,16 +1358,9 @@ int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) if (!sysfs_initialized) return -EACCES; - if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) - retval = sysfs_create_bin_file(&pdev->dev.kobj, &pcie_config_attr); - else - retval = sysfs_create_bin_file(&pdev->dev.kobj, &pci_config_attr); - if (retval) - goto err; - retval = pci_create_resource_files(pdev); if (retval) - goto err_config_file; + goto err; /* If the device has a ROM, try to expose it in sysfs. */ rom_size = pci_resource_len(pdev, PCI_ROM_RESOURCE); @@ -1405,11 +1401,6 @@ err_rom_file: } err_resource_files: pci_remove_resource_files(pdev); -err_config_file: - if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) - sysfs_remove_bin_file(&pdev->dev.kobj, &pcie_config_attr); - else - sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr); err: return retval; } @@ -1435,12 +1426,6 @@ void pci_remove_sysfs_dev_files(struct pci_dev *pdev) return; pci_remove_capabilities_sysfs(pdev); - - if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) - sysfs_remove_bin_file(&pdev->dev.kobj, &pcie_config_attr); - else - sysfs_remove_bin_file(&pdev->dev.kobj, &pci_config_attr); - pci_remove_resource_files(pdev); if (pdev->rom_attr) { @@ -1540,6 +1525,7 @@ static const struct attribute_group pci_dev_group = { const struct attribute_group *pci_dev_groups[] = { &pci_dev_group, + &pci_dev_config_attr_group, NULL, }; From 527139d738d7f2e9f929c752eebf3cbf0f74c754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Fri, 16 Apr 2021 20:58:38 +0000 Subject: [PATCH 0713/1379] PCI/sysfs: Convert "rom" to static attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "rom" sysfs attribute allows access to the PCI Option ROM. Previously it was dynamically created either by pci_bus_add_device() or the pci_sysfs_init() initcall, but since it doesn't need to be created or removed dynamically, we can use a static attribute so the device model takes care of addition and removal automatically. Convert "rom" to a static attribute and use the .is_bin_visible() callback to set the correct object size based on the ROM size. Remove "rom_attr" from the struct pci_dev since it is no longer needed. This attribute was added in the pre-git era by https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/drivers/pci/pci-sysfs.c?id=f6d553444da2 [bhelgaas: commit log] Suggested-by: Oliver O'Halloran Link: https://lore.kernel.org/r/20210416205856.3234481-3-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 67 ++++++++++++++++++----------------------- include/linux/pci.h | 1 - 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index dc14daf404f5..fa8373685140 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1306,6 +1306,33 @@ static ssize_t pci_read_rom(struct file *filp, struct kobject *kobj, return count; } +static BIN_ATTR(rom, 0600, pci_read_rom, pci_write_rom, 0); + +static struct bin_attribute *pci_dev_rom_attrs[] = { + &bin_attr_rom, + NULL, +}; + +static umode_t pci_dev_rom_attr_is_visible(struct kobject *kobj, + struct bin_attribute *a, int n) +{ + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + size_t rom_size; + + /* If the device has a ROM, try to expose it in sysfs. */ + rom_size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + if (!rom_size) + return 0; + + a->size = rom_size; + + return a->attr.mode; +} + +static const struct attribute_group pci_dev_rom_attr_group = { + .bin_attrs = pci_dev_rom_attrs, + .is_bin_visible = pci_dev_rom_attr_is_visible, +}; static ssize_t reset_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -1352,8 +1379,6 @@ error: int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) { int retval; - int rom_size; - struct bin_attribute *attr; if (!sysfs_initialized) return -EACCES; @@ -1362,43 +1387,15 @@ int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) if (retval) goto err; - /* If the device has a ROM, try to expose it in sysfs. */ - rom_size = pci_resource_len(pdev, PCI_ROM_RESOURCE); - if (rom_size) { - attr = kzalloc(sizeof(*attr), GFP_ATOMIC); - if (!attr) { - retval = -ENOMEM; - goto err_resource_files; - } - sysfs_bin_attr_init(attr); - attr->size = rom_size; - attr->attr.name = "rom"; - attr->attr.mode = 0600; - attr->read = pci_read_rom; - attr->write = pci_write_rom; - retval = sysfs_create_bin_file(&pdev->dev.kobj, attr); - if (retval) { - kfree(attr); - goto err_resource_files; - } - pdev->rom_attr = attr; - } - /* add sysfs entries for various capabilities */ retval = pci_create_capabilities_sysfs(pdev); if (retval) - goto err_rom_file; + goto err_resource_files; pci_create_firmware_label_files(pdev); return 0; -err_rom_file: - if (pdev->rom_attr) { - sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr); - kfree(pdev->rom_attr); - pdev->rom_attr = NULL; - } err_resource_files: pci_remove_resource_files(pdev); err: @@ -1427,13 +1424,6 @@ void pci_remove_sysfs_dev_files(struct pci_dev *pdev) pci_remove_capabilities_sysfs(pdev); pci_remove_resource_files(pdev); - - if (pdev->rom_attr) { - sysfs_remove_bin_file(&pdev->dev.kobj, pdev->rom_attr); - kfree(pdev->rom_attr); - pdev->rom_attr = NULL; - } - pci_remove_firmware_label_files(pdev); } @@ -1526,6 +1516,7 @@ static const struct attribute_group pci_dev_group = { const struct attribute_group *pci_dev_groups[] = { &pci_dev_group, &pci_dev_config_attr_group, + &pci_dev_rom_attr_group, NULL, }; diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..45f1fef80b50 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -458,7 +458,6 @@ struct pci_dev { u32 saved_config_space[16]; /* Config space saved at suspend time */ struct hlist_head saved_cap_space; - struct bin_attribute *rom_attr; /* Attribute descriptor for sysfs ROM entry */ int rom_attr_enabled; /* Display of ROM attribute enabled? */ struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */ struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */ From f42c35ea3b137c01b3e073232131674be8efb924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Fri, 16 Apr 2021 20:58:39 +0000 Subject: [PATCH 0714/1379] PCI/sysfs: Convert "reset" to static attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "reset" sysfs attribute allows for resetting a PCI function. Previously it was dynamically created either by pci_bus_add_device() or the pci_sysfs_init() initcall, but since it doesn't need to be created or removed dynamically, we can use a static attribute so the device model takes care of addition and removal automatically. Convert "reset" to a static attribute and use the .is_visible() callback to check whether the device supports reset. Clear reset_fn in pci_stop_dev() instead of pci_remove_capabilities_sysfs() since we no longer explicitly remove the "reset" sysfs file. [bhelgaas: commit log] Suggested-by: Oliver O'Halloran Link: https://lore.kernel.org/r/20210416205856.3234481-4-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 53 +++++++++++++++++++---------------------- drivers/pci/remove.c | 2 ++ 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index fa8373685140..c469d9cad0a8 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1355,25 +1355,33 @@ static ssize_t reset_store(struct device *dev, struct device_attribute *attr, return count; } +static DEVICE_ATTR_WO(reset); -static DEVICE_ATTR(reset, 0200, NULL, reset_store); +static struct attribute *pci_dev_reset_attrs[] = { + &dev_attr_reset.attr, + NULL, +}; -static int pci_create_capabilities_sysfs(struct pci_dev *dev) +static umode_t pci_dev_reset_attr_is_visible(struct kobject *kobj, + struct attribute *a, int n) { - int retval; + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); + if (!pdev->reset_fn) + return 0; + + return a->mode; +} + +static const struct attribute_group pci_dev_reset_attr_group = { + .attrs = pci_dev_reset_attrs, + .is_visible = pci_dev_reset_attr_is_visible, +}; + + +static void pci_create_capabilities_sysfs(struct pci_dev *dev) +{ pcie_vpd_create_sysfs_dev_files(dev); - - if (dev->reset_fn) { - retval = device_create_file(&dev->dev, &dev_attr_reset); - if (retval) - goto error; - } - return 0; - -error: - pcie_vpd_remove_sysfs_dev_files(dev); - return retval; } int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) @@ -1385,30 +1393,18 @@ int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) retval = pci_create_resource_files(pdev); if (retval) - goto err; + return retval; /* add sysfs entries for various capabilities */ - retval = pci_create_capabilities_sysfs(pdev); - if (retval) - goto err_resource_files; - + pci_create_capabilities_sysfs(pdev); pci_create_firmware_label_files(pdev); return 0; - -err_resource_files: - pci_remove_resource_files(pdev); -err: - return retval; } static void pci_remove_capabilities_sysfs(struct pci_dev *dev) { pcie_vpd_remove_sysfs_dev_files(dev); - if (dev->reset_fn) { - device_remove_file(&dev->dev, &dev_attr_reset); - dev->reset_fn = 0; - } } /** @@ -1517,6 +1513,7 @@ const struct attribute_group *pci_dev_groups[] = { &pci_dev_group, &pci_dev_config_attr_group, &pci_dev_rom_attr_group, + &pci_dev_reset_attr_group, NULL, }; diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 95dec03d9f2a..dd12c2fcc7dc 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -19,6 +19,8 @@ static void pci_stop_dev(struct pci_dev *dev) pci_pme_active(dev, false); if (pci_dev_is_added(dev)) { + dev->reset_fn = 0; + device_release_driver(&dev->dev); pci_proc_detach_device(dev); pci_remove_sysfs_dev_files(dev); From fee742b502894c8ed02506fff61d7605934f93cb Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 27 Apr 2021 23:07:19 -0500 Subject: [PATCH 0715/1379] smb3.1.1: enable negotiating stronger encryption by default Now that stronger encryption (gcm256) has been more broadly tested, and confirmed to work with multiple servers (Windows and Azure for example), enable it by default. Although gcm256 is the second choice we offer (after gcm128 which should be faster), this change allows mounts to server which are configured to require the strongest encryption to work (without changing a module load parameter). Signed-off-by: Steve French Suggested-by: Ronnie Sahlberg Reviewed-by: Ronnie Sahlberg --- fs/cifs/cifsfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5f2c139143a7..44fd850f05c1 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -75,7 +75,7 @@ bool enable_oplocks = true; bool linuxExtEnabled = true; bool lookupCacheEnabled = true; bool disable_legacy_dialects; /* false by default */ -bool enable_gcm_256; /* false by default, change when more servers support it */ +bool enable_gcm_256 = true; bool require_gcm_256; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ From 0679d29d3e2351a1c3049c26a63ce1959cad5447 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 11 Apr 2021 09:41:04 -0700 Subject: [PATCH 0716/1379] csky: fix syscache.c fallthrough warning This case of the switch statement falls through to the following case. This appears to be on purpose, so declare it as OK. ../arch/csky/mm/syscache.c: In function '__do_sys_cacheflush': ../arch/csky/mm/syscache.c:17:3: warning: this statement may fall through [-Wimplicit-fallthrough=] 17 | flush_icache_mm_range(current->mm, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 18 | (unsigned long)addr, | ~~~~~~~~~~~~~~~~~~~~ 19 | (unsigned long)addr + bytes); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../arch/csky/mm/syscache.c:20:2: note: here 20 | case DCACHE: | ^~~~ Fixes: 997153b9a75c ("csky: Add flush_icache_mm to defer flush icache all") Signed-off-by: Randy Dunlap Signed-off-by: Guo Ren Cc: linux-csky@vger.kernel.org Cc: Arnd Bergmann --- arch/csky/mm/syscache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/csky/mm/syscache.c b/arch/csky/mm/syscache.c index ffade2f9a4c8..4e51d63850c4 100644 --- a/arch/csky/mm/syscache.c +++ b/arch/csky/mm/syscache.c @@ -17,6 +17,7 @@ SYSCALL_DEFINE3(cacheflush, flush_icache_mm_range(current->mm, (unsigned long)addr, (unsigned long)addr + bytes); + fallthrough; case DCACHE: dcache_wb_range((unsigned long)addr, (unsigned long)addr + bytes); From e58a41c2226847fb1446f3942dc1b55af8acfe02 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 21 Apr 2021 16:03:28 +0800 Subject: [PATCH 0717/1379] csky: uaccess.h: Coding convention with asm generic Using asm-generic/uaccess.h to prevent duplicated code: - Add user_addr_max which mentioned in generic uaccess.h - Remove custom definitions of KERNEL/USER_DS, get/set_fs, uaccess_kerenl - Using generic extable.h instead of custom definitions in uaccess.h Change v2: - Fixup tinyconfig compile error, "__put_user_bad" - Add __get_user_asm_64 Signed-off-by: Guo Ren Link: https://lore.kernel.org/linux-csky/CAK8P3a1DvsXSEDoovLk11hzNHyJi7vqNoToU+n5aFi2viZO_Uw@mail.gmail.com/T/#mbcd58a0e3450e5598974116b607589afa16a3ab7 Cc: Arnd Bergmann --- arch/csky/include/asm/Kbuild | 1 + arch/csky/include/asm/segment.h | 7 - arch/csky/include/asm/uaccess.h | 472 ++++++++++---------------------- arch/csky/lib/usercopy.c | 364 +++++++++++++----------- arch/csky/mm/fault.c | 2 +- 5 files changed, 355 insertions(+), 491 deletions(-) diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index cc24bb8e539f..904a18a818be 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generic-y += asm-offsets.h +generic-y += extable.h generic-y += gpio.h generic-y += kvm_para.h generic-y += qrwlock.h diff --git a/arch/csky/include/asm/segment.h b/arch/csky/include/asm/segment.h index 589e8321dc14..5bc1cc62b87f 100644 --- a/arch/csky/include/asm/segment.h +++ b/arch/csky/include/asm/segment.h @@ -7,11 +7,4 @@ typedef struct { unsigned long seg; } mm_segment_t; -#define KERNEL_DS ((mm_segment_t) { 0xFFFFFFFF }) - -#define USER_DS ((mm_segment_t) { PAGE_OFFSET }) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) -#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) - #endif /* __ASM_CSKY_SEGMENT_H */ diff --git a/arch/csky/include/asm/uaccess.h b/arch/csky/include/asm/uaccess.h index 3dec272e1fa3..ac83823fc437 100644 --- a/arch/csky/include/asm/uaccess.h +++ b/arch/csky/include/asm/uaccess.h @@ -3,122 +3,26 @@ #ifndef __ASM_CSKY_UACCESS_H #define __ASM_CSKY_UACCESS_H -/* - * User space memory access functions - */ -#include -#include -#include -#include -#include -#include -#include +#define user_addr_max() \ + (uaccess_kernel() ? KERNEL_DS.seg : get_fs().seg) -static inline int access_ok(const void *addr, unsigned long size) +static inline int __access_ok(unsigned long addr, unsigned long size) { unsigned long limit = current_thread_info()->addr_limit.seg; - return (((unsigned long)addr < limit) && - ((unsigned long)(addr + size) < limit)); + return ((addr < limit) && ((addr + size) < limit)); } +#define __access_ok __access_ok -#define __addr_ok(addr) (access_ok(addr, 0)) - +/* + * __put_user_fn + */ extern int __put_user_bad(void); -/* - * Tell gcc we read from memory instead of writing: this is because - * we do not write to any memory gcc knows about, so there are no - * aliasing issues. - */ - -/* - * These are the main single-value transfer routines. They automatically - * use the right size if we just have the right pointer type. - * - * This gets kind of ugly. We want to return _two_ values in "get_user()" - * and yet we don't want to do any pointers, because that is too much - * of a performance impact. Thus we have a few rather ugly macros here, - * and hide all the ugliness from the user. - * - * The "__xxx" versions of the user access functions are versions that - * do not verify the address space, that must have been done previously - * with a separate "access_ok()" call (this is used when we do multiple - * accesses to the same area of user memory). - * - * As we use the same address space for kernel and user data on - * Ckcore, we can just do these as direct assignments. (Of course, the - * exception handling means that it's no longer "just"...) - */ - -#define put_user(x, ptr) \ - __put_user_check((x), (ptr), sizeof(*(ptr))) - -#define __put_user(x, ptr) \ - __put_user_nocheck((x), (ptr), sizeof(*(ptr))) - -#define __ptr(x) ((unsigned long *)(x)) - -#define get_user(x, ptr) \ - __get_user_check((x), (ptr), sizeof(*(ptr))) - -#define __get_user(x, ptr) \ - __get_user_nocheck((x), (ptr), sizeof(*(ptr))) - -#define __put_user_nocheck(x, ptr, size) \ -({ \ - long __pu_err = 0; \ - typeof(*(ptr)) *__pu_addr = (ptr); \ - typeof(*(ptr)) __pu_val = (typeof(*(ptr)))(x); \ - if (__pu_addr) \ - __put_user_size(__pu_val, (__pu_addr), (size), \ - __pu_err); \ - __pu_err; \ -}) - -#define __put_user_check(x, ptr, size) \ -({ \ - long __pu_err = -EFAULT; \ - typeof(*(ptr)) *__pu_addr = (ptr); \ - typeof(*(ptr)) __pu_val = (typeof(*(ptr)))(x); \ - if (access_ok(__pu_addr, size) && __pu_addr) \ - __put_user_size(__pu_val, __pu_addr, (size), __pu_err); \ - __pu_err; \ -}) - -#define __put_user_size(x, ptr, size, retval) \ -do { \ - retval = 0; \ - switch (size) { \ - case 1: \ - __put_user_asm_b(x, ptr, retval); \ - break; \ - case 2: \ - __put_user_asm_h(x, ptr, retval); \ - break; \ - case 4: \ - __put_user_asm_w(x, ptr, retval); \ - break; \ - case 8: \ - __put_user_asm_64(x, ptr, retval); \ - break; \ - default: \ - __put_user_bad(); \ - } \ -} while (0) - -/* - * We don't tell gcc that we are accessing memory, but this is OK - * because we do not write to any memory gcc knows about, so there - * are no aliasing issues. - * - * Note that PC at a fault is the address *after* the faulting - * instruction. - */ #define __put_user_asm_b(x, ptr, err) \ do { \ int errcode; \ - asm volatile( \ + __asm__ __volatile__( \ "1: stb %1, (%2,0) \n" \ " br 3f \n" \ "2: mov %0, %3 \n" \ @@ -136,7 +40,7 @@ do { \ #define __put_user_asm_h(x, ptr, err) \ do { \ int errcode; \ - asm volatile( \ + __asm__ __volatile__( \ "1: sth %1, (%2,0) \n" \ " br 3f \n" \ "2: mov %0, %3 \n" \ @@ -154,7 +58,7 @@ do { \ #define __put_user_asm_w(x, ptr, err) \ do { \ int errcode; \ - asm volatile( \ + __asm__ __volatile__( \ "1: stw %1, (%2,0) \n" \ " br 3f \n" \ "2: mov %0, %3 \n" \ @@ -169,241 +73,149 @@ do { \ : "memory"); \ } while (0) -#define __put_user_asm_64(x, ptr, err) \ -do { \ - int tmp; \ - int errcode; \ - typeof(*(ptr))src = (typeof(*(ptr)))x; \ - typeof(*(ptr))*psrc = &src; \ - \ - asm volatile( \ - " ldw %3, (%1, 0) \n" \ - "1: stw %3, (%2, 0) \n" \ - " ldw %3, (%1, 4) \n" \ - "2: stw %3, (%2, 4) \n" \ - " br 4f \n" \ - "3: mov %0, %4 \n" \ - " br 4f \n" \ - ".section __ex_table, \"a\" \n" \ - ".align 2 \n" \ - ".long 1b, 3b \n" \ - ".long 2b, 3b \n" \ - ".previous \n" \ - "4: \n" \ - : "=r"(err), "=r"(psrc), "=r"(ptr), \ - "=r"(tmp), "=r"(errcode) \ - : "0"(err), "1"(psrc), "2"(ptr), "3"(0), "4"(-EFAULT) \ - : "memory"); \ -} while (0) - -#define __get_user_nocheck(x, ptr, size) \ -({ \ - long __gu_err; \ - __get_user_size(x, (ptr), (size), __gu_err); \ - __gu_err; \ -}) - -#define __get_user_check(x, ptr, size) \ -({ \ - int __gu_err = -EFAULT; \ - const __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ - if (access_ok(__gu_ptr, size) && __gu_ptr) \ - __get_user_size(x, __gu_ptr, size, __gu_err); \ - __gu_err; \ -}) - -#define __get_user_size(x, ptr, size, retval) \ -do { \ - switch (size) { \ - case 1: \ - __get_user_asm_common((x), ptr, "ldb", retval); \ - break; \ - case 2: \ - __get_user_asm_common((x), ptr, "ldh", retval); \ - break; \ - case 4: \ - __get_user_asm_common((x), ptr, "ldw", retval); \ - break; \ - default: \ - x = 0; \ - (retval) = __get_user_bad(); \ - } \ -} while (0) - -#define __get_user_asm_common(x, ptr, ins, err) \ -do { \ - int errcode; \ - asm volatile( \ - "1: " ins " %1, (%4,0) \n" \ - " br 3f \n" \ - /* Fix up codes */ \ - "2: mov %0, %2 \n" \ - " movi %1, 0 \n" \ - " br 3f \n" \ - ".section __ex_table,\"a\" \n" \ - ".align 2 \n" \ - ".long 1b, 2b \n" \ - ".previous \n" \ - "3: \n" \ - : "=r"(err), "=r"(x), "=r"(errcode) \ - : "0"(0), "r"(ptr), "2"(-EFAULT) \ - : "memory"); \ -} while (0) - -extern int __get_user_bad(void); - -#define ___copy_to_user(to, from, n) \ -do { \ - int w0, w1, w2, w3; \ - asm volatile( \ - "0: cmpnei %1, 0 \n" \ - " bf 8f \n" \ - " mov %3, %1 \n" \ - " or %3, %2 \n" \ - " andi %3, 3 \n" \ - " cmpnei %3, 0 \n" \ - " bf 1f \n" \ - " br 5f \n" \ - "1: cmplti %0, 16 \n" /* 4W */ \ - " bt 3f \n" \ - " ldw %3, (%2, 0) \n" \ - " ldw %4, (%2, 4) \n" \ - " ldw %5, (%2, 8) \n" \ - " ldw %6, (%2, 12) \n" \ - "2: stw %3, (%1, 0) \n" \ - "9: stw %4, (%1, 4) \n" \ - "10: stw %5, (%1, 8) \n" \ - "11: stw %6, (%1, 12) \n" \ - " addi %2, 16 \n" \ - " addi %1, 16 \n" \ - " subi %0, 16 \n" \ - " br 1b \n" \ - "3: cmplti %0, 4 \n" /* 1W */ \ - " bt 5f \n" \ - " ldw %3, (%2, 0) \n" \ - "4: stw %3, (%1, 0) \n" \ - " addi %2, 4 \n" \ - " addi %1, 4 \n" \ - " subi %0, 4 \n" \ - " br 3b \n" \ - "5: cmpnei %0, 0 \n" /* 1B */ \ - " bf 13f \n" \ - " ldb %3, (%2, 0) \n" \ - "6: stb %3, (%1, 0) \n" \ - " addi %2, 1 \n" \ - " addi %1, 1 \n" \ - " subi %0, 1 \n" \ - " br 5b \n" \ - "7: subi %0, 4 \n" \ - "8: subi %0, 4 \n" \ - "12: subi %0, 4 \n" \ - " br 13f \n" \ - ".section __ex_table, \"a\" \n" \ - ".align 2 \n" \ - ".long 2b, 13f \n" \ - ".long 4b, 13f \n" \ - ".long 6b, 13f \n" \ - ".long 9b, 12b \n" \ - ".long 10b, 8b \n" \ - ".long 11b, 7b \n" \ - ".previous \n" \ - "13: \n" \ - : "=r"(n), "=r"(to), "=r"(from), "=r"(w0), \ - "=r"(w1), "=r"(w2), "=r"(w3) \ - : "0"(n), "1"(to), "2"(from) \ - : "memory"); \ -} while (0) - -#define ___copy_from_user(to, from, n) \ +#define __put_user_asm_64(x, ptr, err) \ do { \ int tmp; \ - int nsave; \ - asm volatile( \ - "0: cmpnei %1, 0 \n" \ - " bf 7f \n" \ - " mov %3, %1 \n" \ - " or %3, %2 \n" \ - " andi %3, 3 \n" \ - " cmpnei %3, 0 \n" \ - " bf 1f \n" \ - " br 5f \n" \ - "1: cmplti %0, 16 \n" \ - " bt 3f \n" \ - "2: ldw %3, (%2, 0) \n" \ - "10: ldw %4, (%2, 4) \n" \ - " stw %3, (%1, 0) \n" \ - " stw %4, (%1, 4) \n" \ - "11: ldw %3, (%2, 8) \n" \ - "12: ldw %4, (%2, 12) \n" \ - " stw %3, (%1, 8) \n" \ - " stw %4, (%1, 12) \n" \ - " addi %2, 16 \n" \ - " addi %1, 16 \n" \ - " subi %0, 16 \n" \ - " br 1b \n" \ - "3: cmplti %0, 4 \n" \ - " bt 5f \n" \ - "4: ldw %3, (%2, 0) \n" \ - " stw %3, (%1, 0) \n" \ - " addi %2, 4 \n" \ - " addi %1, 4 \n" \ - " subi %0, 4 \n" \ - " br 3b \n" \ - "5: cmpnei %0, 0 \n" \ - " bf 7f \n" \ - "6: ldb %3, (%2, 0) \n" \ - " stb %3, (%1, 0) \n" \ - " addi %2, 1 \n" \ - " addi %1, 1 \n" \ - " subi %0, 1 \n" \ - " br 5b \n" \ - "8: stw %3, (%1, 0) \n" \ - " subi %0, 4 \n" \ - " bf 7f \n" \ - "9: subi %0, 8 \n" \ - " bf 7f \n" \ - "13: stw %3, (%1, 8) \n" \ - " subi %0, 12 \n" \ - " bf 7f \n" \ - ".section __ex_table, \"a\" \n" \ - ".align 2 \n" \ - ".long 2b, 7f \n" \ - ".long 4b, 7f \n" \ - ".long 6b, 7f \n" \ - ".long 10b, 8b \n" \ - ".long 11b, 9b \n" \ - ".long 12b,13b \n" \ - ".previous \n" \ - "7: \n" \ - : "=r"(n), "=r"(to), "=r"(from), "=r"(nsave), \ - "=r"(tmp) \ - : "0"(n), "1"(to), "2"(from) \ + int errcode; \ + \ + __asm__ __volatile__( \ + " ldw %3, (%1, 0) \n" \ + "1: stw %3, (%2, 0) \n" \ + " ldw %3, (%1, 4) \n" \ + "2: stw %3, (%2, 4) \n" \ + " br 4f \n" \ + "3: mov %0, %4 \n" \ + " br 4f \n" \ + ".section __ex_table, \"a\" \n" \ + ".align 2 \n" \ + ".long 1b, 3b \n" \ + ".long 2b, 3b \n" \ + ".previous \n" \ + "4: \n" \ + : "=r"(err), "=r"(x), "=r"(ptr), \ + "=r"(tmp), "=r"(errcode) \ + : "0"(err), "1"(x), "2"(ptr), "3"(0), \ + "4"(-EFAULT) \ : "memory"); \ } while (0) +static inline int __put_user_fn(size_t size, void __user *ptr, void *x) +{ + int retval = 0; + u32 tmp; + + switch (size) { + case 1: + tmp = *(u8 *)x; + __put_user_asm_b(tmp, ptr, retval); + break; + case 2: + tmp = *(u16 *)x; + __put_user_asm_h(tmp, ptr, retval); + break; + case 4: + tmp = *(u32 *)x; + __put_user_asm_w(tmp, ptr, retval); + break; + case 8: + __put_user_asm_64(x, (u64 *)ptr, retval); + break; + } + + return retval; +} +#define __put_user_fn __put_user_fn + +/* + * __get_user_fn + */ +extern int __get_user_bad(void); + +#define __get_user_asm_common(x, ptr, ins, err) \ +do { \ + int errcode; \ + __asm__ __volatile__( \ + "1: " ins " %1, (%4, 0) \n" \ + " br 3f \n" \ + "2: mov %0, %2 \n" \ + " movi %1, 0 \n" \ + " br 3f \n" \ + ".section __ex_table,\"a\" \n" \ + ".align 2 \n" \ + ".long 1b, 2b \n" \ + ".previous \n" \ + "3: \n" \ + : "=r"(err), "=r"(x), "=r"(errcode) \ + : "0"(0), "r"(ptr), "2"(-EFAULT) \ + : "memory"); \ +} while (0) + +#define __get_user_asm_64(x, ptr, err) \ +do { \ + int tmp; \ + int errcode; \ + \ + __asm__ __volatile__( \ + "1: ldw %3, (%2, 0) \n" \ + " stw %3, (%1, 0) \n" \ + "2: ldw %3, (%2, 4) \n" \ + " stw %3, (%1, 4) \n" \ + " br 4f \n" \ + "3: mov %0, %4 \n" \ + " br 4f \n" \ + ".section __ex_table, \"a\" \n" \ + ".align 2 \n" \ + ".long 1b, 3b \n" \ + ".long 2b, 3b \n" \ + ".previous \n" \ + "4: \n" \ + : "=r"(err), "=r"(x), "=r"(ptr), \ + "=r"(tmp), "=r"(errcode) \ + : "0"(err), "1"(x), "2"(ptr), "3"(0), \ + "4"(-EFAULT) \ + : "memory"); \ +} while (0) + +static inline int __get_user_fn(size_t size, const void __user *ptr, void *x) +{ + int retval; + u32 tmp; + + switch (size) { + case 1: + __get_user_asm_common(tmp, ptr, "ldb", retval); + *(u8 *)x = (u8)tmp; + break; + case 2: + __get_user_asm_common(tmp, ptr, "ldh", retval); + *(u16 *)x = (u16)tmp; + break; + case 4: + __get_user_asm_common(tmp, ptr, "ldw", retval); + *(u32 *)x = (u32)tmp; + break; + case 8: + __get_user_asm_64(x, ptr, retval); + break; + } + + return retval; +} +#define __get_user_fn __get_user_fn + unsigned long raw_copy_from_user(void *to, const void *from, unsigned long n); unsigned long raw_copy_to_user(void *to, const void *from, unsigned long n); -unsigned long clear_user(void *to, unsigned long n); unsigned long __clear_user(void __user *to, unsigned long n); +#define __clear_user __clear_user -long strncpy_from_user(char *dst, const char *src, long count); long __strncpy_from_user(char *dst, const char *src, long count); +#define __strncpy_from_user __strncpy_from_user -/* - * Return the size of a string (including the ending 0) - * - * Return 0 on exception, a value greater than N if too long - */ -long strnlen_user(const char *src, long n); +long __strnlen_user(const char *s, long n); +#define __strnlen_user __strnlen_user -#define strlen_user(str) strnlen_user(str, 32767) - -struct exception_table_entry { - unsigned long insn; - unsigned long nextinsn; -}; - -extern int fixup_exception(struct pt_regs *regs); +#include +#include #endif /* __ASM_CSKY_UACCESS_H */ diff --git a/arch/csky/lib/usercopy.c b/arch/csky/lib/usercopy.c index 3c9bd645e643..a71836056fc8 100644 --- a/arch/csky/lib/usercopy.c +++ b/arch/csky/lib/usercopy.c @@ -7,7 +7,70 @@ unsigned long raw_copy_from_user(void *to, const void *from, unsigned long n) { - ___copy_from_user(to, from, n); + int tmp, nsave; + + __asm__ __volatile__( + "0: cmpnei %1, 0 \n" + " bf 7f \n" + " mov %3, %1 \n" + " or %3, %2 \n" + " andi %3, 3 \n" + " cmpnei %3, 0 \n" + " bf 1f \n" + " br 5f \n" + "1: cmplti %0, 16 \n" + " bt 3f \n" + "2: ldw %3, (%2, 0) \n" + "10: ldw %4, (%2, 4) \n" + " stw %3, (%1, 0) \n" + " stw %4, (%1, 4) \n" + "11: ldw %3, (%2, 8) \n" + "12: ldw %4, (%2, 12) \n" + " stw %3, (%1, 8) \n" + " stw %4, (%1, 12) \n" + " addi %2, 16 \n" + " addi %1, 16 \n" + " subi %0, 16 \n" + " br 1b \n" + "3: cmplti %0, 4 \n" + " bt 5f \n" + "4: ldw %3, (%2, 0) \n" + " stw %3, (%1, 0) \n" + " addi %2, 4 \n" + " addi %1, 4 \n" + " subi %0, 4 \n" + " br 3b \n" + "5: cmpnei %0, 0 \n" + " bf 7f \n" + "6: ldb %3, (%2, 0) \n" + " stb %3, (%1, 0) \n" + " addi %2, 1 \n" + " addi %1, 1 \n" + " subi %0, 1 \n" + " br 5b \n" + "8: stw %3, (%1, 0) \n" + " subi %0, 4 \n" + " bf 7f \n" + "9: subi %0, 8 \n" + " bf 7f \n" + "13: stw %3, (%1, 8) \n" + " subi %0, 12 \n" + " bf 7f \n" + ".section __ex_table, \"a\" \n" + ".align 2 \n" + ".long 2b, 7f \n" + ".long 4b, 7f \n" + ".long 6b, 7f \n" + ".long 10b, 8b \n" + ".long 11b, 9b \n" + ".long 12b,13b \n" + ".previous \n" + "7: \n" + : "=r"(n), "=r"(to), "=r"(from), "=r"(nsave), + "=r"(tmp) + : "0"(n), "1"(to), "2"(from) + : "memory"); + return n; } EXPORT_SYMBOL(raw_copy_from_user); @@ -15,48 +78,70 @@ EXPORT_SYMBOL(raw_copy_from_user); unsigned long raw_copy_to_user(void *to, const void *from, unsigned long n) { - ___copy_to_user(to, from, n); + int w0, w1, w2, w3; + + __asm__ __volatile__( + "0: cmpnei %1, 0 \n" + " bf 8f \n" + " mov %3, %1 \n" + " or %3, %2 \n" + " andi %3, 3 \n" + " cmpnei %3, 0 \n" + " bf 1f \n" + " br 5f \n" + "1: cmplti %0, 16 \n" /* 4W */ + " bt 3f \n" + " ldw %3, (%2, 0) \n" + " ldw %4, (%2, 4) \n" + " ldw %5, (%2, 8) \n" + " ldw %6, (%2, 12) \n" + "2: stw %3, (%1, 0) \n" + "9: stw %4, (%1, 4) \n" + "10: stw %5, (%1, 8) \n" + "11: stw %6, (%1, 12) \n" + " addi %2, 16 \n" + " addi %1, 16 \n" + " subi %0, 16 \n" + " br 1b \n" + "3: cmplti %0, 4 \n" /* 1W */ + " bt 5f \n" + " ldw %3, (%2, 0) \n" + "4: stw %3, (%1, 0) \n" + " addi %2, 4 \n" + " addi %1, 4 \n" + " subi %0, 4 \n" + " br 3b \n" + "5: cmpnei %0, 0 \n" /* 1B */ + " bf 13f \n" + " ldb %3, (%2, 0) \n" + "6: stb %3, (%1, 0) \n" + " addi %2, 1 \n" + " addi %1, 1 \n" + " subi %0, 1 \n" + " br 5b \n" + "7: subi %0, 4 \n" + "8: subi %0, 4 \n" + "12: subi %0, 4 \n" + " br 13f \n" + ".section __ex_table, \"a\" \n" + ".align 2 \n" + ".long 2b, 13f \n" + ".long 4b, 13f \n" + ".long 6b, 13f \n" + ".long 9b, 12b \n" + ".long 10b, 8b \n" + ".long 11b, 7b \n" + ".previous \n" + "13: \n" + : "=r"(n), "=r"(to), "=r"(from), "=r"(w0), + "=r"(w1), "=r"(w2), "=r"(w3) + : "0"(n), "1"(to), "2"(from) + : "memory"); + return n; } EXPORT_SYMBOL(raw_copy_to_user); - -/* - * copy a null terminated string from userspace. - */ -#define __do_strncpy_from_user(dst, src, count, res) \ -do { \ - int tmp; \ - long faultres; \ - asm volatile( \ - " cmpnei %3, 0 \n" \ - " bf 4f \n" \ - "1: cmpnei %1, 0 \n" \ - " bf 5f \n" \ - "2: ldb %4, (%3, 0) \n" \ - " stb %4, (%2, 0) \n" \ - " cmpnei %4, 0 \n" \ - " bf 3f \n" \ - " addi %3, 1 \n" \ - " addi %2, 1 \n" \ - " subi %1, 1 \n" \ - " br 1b \n" \ - "3: subu %0, %1 \n" \ - " br 5f \n" \ - "4: mov %0, %5 \n" \ - " br 5f \n" \ - ".section __ex_table, \"a\" \n" \ - ".align 2 \n" \ - ".long 2b, 4b \n" \ - ".previous \n" \ - "5: \n" \ - : "=r"(res), "=r"(count), "=r"(dst), \ - "=r"(src), "=r"(tmp), "=r"(faultres) \ - : "5"(-EFAULT), "0"(count), "1"(count), \ - "2"(dst), "3"(src) \ - : "memory", "cc"); \ -} while (0) - /* * __strncpy_from_user: - Copy a NUL terminated string from userspace, * with less checking. @@ -80,41 +165,41 @@ do { \ */ long __strncpy_from_user(char *dst, const char *src, long count) { - long res; + long res, faultres; + int tmp; + + __asm__ __volatile__( + " cmpnei %3, 0 \n" + " bf 4f \n" + "1: cmpnei %1, 0 \n" + " bf 5f \n" + "2: ldb %4, (%3, 0) \n" + " stb %4, (%2, 0) \n" + " cmpnei %4, 0 \n" + " bf 3f \n" + " addi %3, 1 \n" + " addi %2, 1 \n" + " subi %1, 1 \n" + " br 1b \n" + "3: subu %0, %1 \n" + " br 5f \n" + "4: mov %0, %5 \n" + " br 5f \n" + ".section __ex_table, \"a\" \n" + ".align 2 \n" + ".long 2b, 4b \n" + ".previous \n" + "5: \n" + : "=r"(res), "=r"(count), "=r"(dst), + "=r"(src), "=r"(tmp), "=r"(faultres) + : "5"(-EFAULT), "0"(count), "1"(count), + "2"(dst), "3"(src) + : "memory"); - __do_strncpy_from_user(dst, src, count, res); return res; } EXPORT_SYMBOL(__strncpy_from_user); -/* - * strncpy_from_user: - Copy a NUL terminated string from userspace. - * @dst: Destination address, in kernel space. This buffer must be at - * least @count bytes long. - * @src: Source address, in user space. - * @count: Maximum number of bytes to copy, including the trailing NUL. - * - * Copies a NUL-terminated string from userspace to kernel space. - * - * On success, returns the length of the string (not including the trailing - * NUL). - * - * If access to userspace fails, returns -EFAULT (some data may have been - * copied). - * - * If @count is smaller than the length of the string, copies @count bytes - * and returns @count. - */ -long strncpy_from_user(char *dst, const char *src, long count) -{ - long res = -EFAULT; - - if (access_ok(src, 1)) - __do_strncpy_from_user(dst, src, count, res); - return res; -} -EXPORT_SYMBOL(strncpy_from_user); - /* * strlen_user: - Get the size of a string in user space. * @str: The string to measure. @@ -126,14 +211,11 @@ EXPORT_SYMBOL(strncpy_from_user); * On exception, returns 0. * If the string is too long, returns a value greater than @n. */ -long strnlen_user(const char *s, long n) +long __strnlen_user(const char *s, long n) { unsigned long res, tmp; - if (s == NULL) - return 0; - - asm volatile( + __asm__ __volatile__( " cmpnei %1, 0 \n" " bf 3f \n" "1: cmpnei %0, 0 \n" @@ -156,87 +238,11 @@ long strnlen_user(const char *s, long n) "5: \n" : "=r"(n), "=r"(s), "=r"(res), "=r"(tmp) : "0"(n), "1"(s), "2"(n) - : "memory", "cc"); + : "memory"); return res; } -EXPORT_SYMBOL(strnlen_user); - -#define __do_clear_user(addr, size) \ -do { \ - int __d0, zvalue, tmp; \ - \ - asm volatile( \ - "0: cmpnei %1, 0 \n" \ - " bf 7f \n" \ - " mov %3, %1 \n" \ - " andi %3, 3 \n" \ - " cmpnei %3, 0 \n" \ - " bf 1f \n" \ - " br 5f \n" \ - "1: cmplti %0, 32 \n" /* 4W */ \ - " bt 3f \n" \ - "8: stw %2, (%1, 0) \n" \ - "10: stw %2, (%1, 4) \n" \ - "11: stw %2, (%1, 8) \n" \ - "12: stw %2, (%1, 12) \n" \ - "13: stw %2, (%1, 16) \n" \ - "14: stw %2, (%1, 20) \n" \ - "15: stw %2, (%1, 24) \n" \ - "16: stw %2, (%1, 28) \n" \ - " addi %1, 32 \n" \ - " subi %0, 32 \n" \ - " br 1b \n" \ - "3: cmplti %0, 4 \n" /* 1W */ \ - " bt 5f \n" \ - "4: stw %2, (%1, 0) \n" \ - " addi %1, 4 \n" \ - " subi %0, 4 \n" \ - " br 3b \n" \ - "5: cmpnei %0, 0 \n" /* 1B */ \ - "9: bf 7f \n" \ - "6: stb %2, (%1, 0) \n" \ - " addi %1, 1 \n" \ - " subi %0, 1 \n" \ - " br 5b \n" \ - ".section __ex_table,\"a\" \n" \ - ".align 2 \n" \ - ".long 8b, 9b \n" \ - ".long 10b, 9b \n" \ - ".long 11b, 9b \n" \ - ".long 12b, 9b \n" \ - ".long 13b, 9b \n" \ - ".long 14b, 9b \n" \ - ".long 15b, 9b \n" \ - ".long 16b, 9b \n" \ - ".long 4b, 9b \n" \ - ".long 6b, 9b \n" \ - ".previous \n" \ - "7: \n" \ - : "=r"(size), "=r" (__d0), \ - "=r"(zvalue), "=r"(tmp) \ - : "0"(size), "1"(addr), "2"(0) \ - : "memory", "cc"); \ -} while (0) - -/* - * clear_user: - Zero a block of memory in user space. - * @to: Destination address, in user space. - * @n: Number of bytes to zero. - * - * Zero a block of memory in user space. - * - * Returns number of bytes that could not be cleared. - * On success, this will be zero. - */ -unsigned long -clear_user(void __user *to, unsigned long n) -{ - if (access_ok(to, n)) - __do_clear_user(to, n); - return n; -} -EXPORT_SYMBOL(clear_user); +EXPORT_SYMBOL(__strnlen_user); /* * __clear_user: - Zero a block of memory in user space, with less checking. @@ -252,7 +258,59 @@ EXPORT_SYMBOL(clear_user); unsigned long __clear_user(void __user *to, unsigned long n) { - __do_clear_user(to, n); + int data, value, tmp; + + __asm__ __volatile__( + "0: cmpnei %1, 0 \n" + " bf 7f \n" + " mov %3, %1 \n" + " andi %3, 3 \n" + " cmpnei %3, 0 \n" + " bf 1f \n" + " br 5f \n" + "1: cmplti %0, 32 \n" /* 4W */ + " bt 3f \n" + "8: stw %2, (%1, 0) \n" + "10: stw %2, (%1, 4) \n" + "11: stw %2, (%1, 8) \n" + "12: stw %2, (%1, 12) \n" + "13: stw %2, (%1, 16) \n" + "14: stw %2, (%1, 20) \n" + "15: stw %2, (%1, 24) \n" + "16: stw %2, (%1, 28) \n" + " addi %1, 32 \n" + " subi %0, 32 \n" + " br 1b \n" + "3: cmplti %0, 4 \n" /* 1W */ + " bt 5f \n" + "4: stw %2, (%1, 0) \n" + " addi %1, 4 \n" + " subi %0, 4 \n" + " br 3b \n" + "5: cmpnei %0, 0 \n" /* 1B */ + "9: bf 7f \n" + "6: stb %2, (%1, 0) \n" + " addi %1, 1 \n" + " subi %0, 1 \n" + " br 5b \n" + ".section __ex_table,\"a\" \n" + ".align 2 \n" + ".long 8b, 9b \n" + ".long 10b, 9b \n" + ".long 11b, 9b \n" + ".long 12b, 9b \n" + ".long 13b, 9b \n" + ".long 14b, 9b \n" + ".long 15b, 9b \n" + ".long 16b, 9b \n" + ".long 4b, 9b \n" + ".long 6b, 9b \n" + ".previous \n" + "7: \n" + : "=r"(n), "=r" (data), "=r"(value), "=r"(tmp) + : "0"(n), "1"(to), "2"(0) + : "memory"); + return n; } EXPORT_SYMBOL(__clear_user); diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index 1482de56f4f7..466ad949818a 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -12,7 +12,7 @@ int fixup_exception(struct pt_regs *regs) fixup = search_exception_tables(instruction_pointer(regs)); if (fixup) { - regs->pc = fixup->nextinsn; + regs->pc = fixup->fixup; return 1; } From ec3576eac11d66a388b6cba6a7cfb3b45039a712 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Sat, 24 Apr 2021 17:39:35 +0300 Subject: [PATCH 0718/1379] Documentation: firmware-guide: gpio-properties: Add note to SPI CS case Historically ACPI has no means of the GPIO polarity and thus the SPISerialBus() resource defines it on the per-chip basis. In order to avoid an ambiguity, the GPIO polarity is considered being always Active High. Add note about this to the respective documentation file. Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- Documentation/firmware-guide/acpi/gpio-properties.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/firmware-guide/acpi/gpio-properties.rst b/Documentation/firmware-guide/acpi/gpio-properties.rst index 4e264c16ddff..df4b711053ee 100644 --- a/Documentation/firmware-guide/acpi/gpio-properties.rst +++ b/Documentation/firmware-guide/acpi/gpio-properties.rst @@ -99,6 +99,12 @@ native:: } } +Note, that historically ACPI has no means of the GPIO polarity and thus +the SPISerialBus() resource defines it on the per-chip basis. In order +to avoid a chain of negations, the GPIO polarity is considered being +Active High. Even for the cases when _DSD() is involved (see the example +above) the GPIO CS polarity must be defined Active High to avoid ambiguity. + Other supported properties ========================== From e483bb9a991bdae29a0caa4b3a6d002c968f94aa Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Fri, 23 Apr 2021 10:28:17 -0500 Subject: [PATCH 0719/1379] ACPI: custom_method: fix potential use-after-free issue In cm_write(), buf is always freed when reaching the end of the function. If the requested count is less than table.length, the allocated buffer will be freed but subsequent calls to cm_write() will still try to access it. Remove the unconditional kfree(buf) at the end of the function and set the buf to NULL in the -EINVAL error path to match the rest of function. Fixes: 03d1571d9513 ("ACPI: custom_method: fix memory leaks") Signed-off-by: Mark Langsdorf Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/custom_method.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/custom_method.c b/drivers/acpi/custom_method.c index 443fdf62dd22..8844f895f9be 100644 --- a/drivers/acpi/custom_method.c +++ b/drivers/acpi/custom_method.c @@ -55,6 +55,7 @@ static ssize_t cm_write(struct file *file, const char __user *user_buf, (*ppos + count < count) || (count > uncopied_bytes)) { kfree(buf); + buf = NULL; return -EINVAL; } @@ -76,7 +77,6 @@ static ssize_t cm_write(struct file *file, const char __user *user_buf, add_taint(TAINT_OVERRIDDEN_ACPI_TABLE, LOCKDEP_NOW_UNRELIABLE); } - kfree(buf); return count; } From 1cfd8956437f842836e8a066b40d1ec2fc01f13e Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Tue, 27 Apr 2021 13:54:33 -0500 Subject: [PATCH 0720/1379] ACPI: custom_method: fix a possible memory leak In cm_write(), if the 'buf' is allocated memory but not fully consumed, it is possible to reallocate the buffer without freeing it by passing '*ppos' as 0 on a subsequent call. Add an explicit kfree() before kzalloc() to prevent the possible memory leak. Fixes: 526b4af47f44 ("ACPI: Split out custom_method functionality into an own driver") Signed-off-by: Mark Langsdorf Cc: 5.4+ # 5.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/custom_method.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/acpi/custom_method.c b/drivers/acpi/custom_method.c index 8844f895f9be..d39a9b474727 100644 --- a/drivers/acpi/custom_method.c +++ b/drivers/acpi/custom_method.c @@ -42,6 +42,8 @@ static ssize_t cm_write(struct file *file, const char __user *user_buf, sizeof(struct acpi_table_header))) return -EFAULT; uncopied_bytes = max_size = table.length; + /* make sure the buf is not allocated */ + kfree(buf); buf = kzalloc(max_size, GFP_KERNEL); if (!buf) return -ENOMEM; From 785e3c0a3a870e72dc530856136ab4c8dd207128 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Apr 2021 11:32:07 -0400 Subject: [PATCH 0721/1379] tracing: Map all PIDs to command lines The default max PID is set by PID_MAX_DEFAULT, and the tracing infrastructure uses this number to map PIDs to the comm names of the tasks, such output of the trace can show names from the recorded PIDs in the ring buffer. This mapping is also exported to user space via the "saved_cmdlines" file in the tracefs directory. But currently the mapping expects the PIDs to be less than PID_MAX_DEFAULT, which is the default maximum and not the real maximum. Recently, systemd will increases the maximum value of a PID on the system, and when tasks are traced that have a PID higher than PID_MAX_DEFAULT, its comm is not recorded. This leads to the entire trace to have "<...>" as the comm name, which is pretty useless. Instead, keep the array mapping the size of PID_MAX_DEFAULT, but instead of just mapping the index to the comm, map a mask of the PID (PID_MAX_DEFAULT - 1) to the comm, and find the full PID from the map_cmdline_to_pid array (that already exists). This bug goes back to the beginning of ftrace, but hasn't been an issue until user space started increasing the maximum value of PIDs. Link: https://lkml.kernel.org/r/20210427113207.3c601884@gandalf.local.home Cc: stable@vger.kernel.org Fixes: bc0c38d139ec7 ("ftrace: latency tracer infrastructure") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 66a4ad93b5e9..e28d08905124 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2390,14 +2390,13 @@ static void tracing_stop_tr(struct trace_array *tr) static int trace_save_cmdline(struct task_struct *tsk) { - unsigned pid, idx; + unsigned tpid, idx; /* treat recording of idle task as a success */ if (!tsk->pid) return 1; - if (unlikely(tsk->pid > PID_MAX_DEFAULT)) - return 0; + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); /* * It's not the end of the world if we don't get @@ -2408,26 +2407,15 @@ static int trace_save_cmdline(struct task_struct *tsk) if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; - idx = savedcmd->map_pid_to_cmdline[tsk->pid]; + idx = savedcmd->map_pid_to_cmdline[tpid]; if (idx == NO_CMDLINE_MAP) { idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; - /* - * Check whether the cmdline buffer at idx has a pid - * mapped. We are going to overwrite that entry so we - * need to clear the map_pid_to_cmdline. Otherwise we - * would read the new comm for the old pid. - */ - pid = savedcmd->map_cmdline_to_pid[idx]; - if (pid != NO_CMDLINE_MAP) - savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; - - savedcmd->map_cmdline_to_pid[idx] = tsk->pid; - savedcmd->map_pid_to_cmdline[tsk->pid] = idx; - + savedcmd->map_pid_to_cmdline[tpid] = idx; savedcmd->cmdline_idx = idx; } + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; set_cmdline(idx, tsk->comm); arch_spin_unlock(&trace_cmdline_lock); @@ -2438,6 +2426,7 @@ static int trace_save_cmdline(struct task_struct *tsk) static void __trace_find_cmdline(int pid, char comm[]) { unsigned map; + int tpid; if (!pid) { strcpy(comm, ""); @@ -2449,16 +2438,16 @@ static void __trace_find_cmdline(int pid, char comm[]) return; } - if (pid > PID_MAX_DEFAULT) { - strcpy(comm, "<...>"); - return; + tpid = pid & (PID_MAX_DEFAULT - 1); + map = savedcmd->map_pid_to_cmdline[tpid]; + if (map != NO_CMDLINE_MAP) { + tpid = savedcmd->map_cmdline_to_pid[map]; + if (tpid == pid) { + strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + return; + } } - - map = savedcmd->map_pid_to_cmdline[pid]; - if (map != NO_CMDLINE_MAP) - strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); - else - strcpy(comm, "<...>"); + strcpy(comm, "<...>"); } void trace_find_cmdline(int pid, char comm[]) From 2f1137140fbcffad582d9e5eacc7f189ae0cc110 Mon Sep 17 00:00:00 2001 From: Keoseong Park Date: Tue, 27 Apr 2021 16:38:42 +0900 Subject: [PATCH 0722/1379] scsi: ufs: core: Fix a typo in ufs-sysfs.c Fix the following typo: ufschd_uic_link_state_to_string() -> ufshcd_uic_link_state_to_string() ufschd_ufs_dev_pwr_mode_to_string() -> ufshcd_ufs_dev_pwr_mode_to_string() Link: https://lore.kernel.org/r/1381713434.61619509208911.JavaMail.epsvc@epcpadp3 Signed-off-by: Keoseong Park Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufs-sysfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/ufs/ufs-sysfs.c b/drivers/scsi/ufs/ufs-sysfs.c index d7c3cff9662f..5d0e98a05ada 100644 --- a/drivers/scsi/ufs/ufs-sysfs.c +++ b/drivers/scsi/ufs/ufs-sysfs.c @@ -9,7 +9,7 @@ #include "ufs.h" #include "ufs-sysfs.h" -static const char *ufschd_uic_link_state_to_string( +static const char *ufshcd_uic_link_state_to_string( enum uic_link_state state) { switch (state) { @@ -21,7 +21,7 @@ static const char *ufschd_uic_link_state_to_string( } } -static const char *ufschd_ufs_dev_pwr_mode_to_string( +static const char *ufshcd_ufs_dev_pwr_mode_to_string( enum ufs_dev_pwr_mode state) { switch (state) { @@ -81,7 +81,7 @@ static ssize_t rpm_target_dev_state_show(struct device *dev, { struct ufs_hba *hba = dev_get_drvdata(dev); - return sysfs_emit(buf, "%s\n", ufschd_ufs_dev_pwr_mode_to_string( + return sysfs_emit(buf, "%s\n", ufshcd_ufs_dev_pwr_mode_to_string( ufs_pm_lvl_states[hba->rpm_lvl].dev_state)); } @@ -90,7 +90,7 @@ static ssize_t rpm_target_link_state_show(struct device *dev, { struct ufs_hba *hba = dev_get_drvdata(dev); - return sysfs_emit(buf, "%s\n", ufschd_uic_link_state_to_string( + return sysfs_emit(buf, "%s\n", ufshcd_uic_link_state_to_string( ufs_pm_lvl_states[hba->rpm_lvl].link_state)); } @@ -113,7 +113,7 @@ static ssize_t spm_target_dev_state_show(struct device *dev, { struct ufs_hba *hba = dev_get_drvdata(dev); - return sysfs_emit(buf, "%s\n", ufschd_ufs_dev_pwr_mode_to_string( + return sysfs_emit(buf, "%s\n", ufshcd_ufs_dev_pwr_mode_to_string( ufs_pm_lvl_states[hba->spm_lvl].dev_state)); } @@ -122,7 +122,7 @@ static ssize_t spm_target_link_state_show(struct device *dev, { struct ufs_hba *hba = dev_get_drvdata(dev); - return sysfs_emit(buf, "%s\n", ufschd_uic_link_state_to_string( + return sysfs_emit(buf, "%s\n", ufshcd_uic_link_state_to_string( ufs_pm_lvl_states[hba->spm_lvl].link_state)); } From 9814b55cde0588b6d9bc496cee43f87316cbc6f1 Mon Sep 17 00:00:00 2001 From: Bodo Stroesser Date: Fri, 23 Apr 2021 17:01:23 +0200 Subject: [PATCH 0723/1379] scsi: target: tcmu: Return from tcmu_handle_completions() if cmd_id not found If tcmu_handle_completions() finds an invalid cmd_id while looping over cmd responses from userspace it sets TCMU_DEV_BIT_BROKEN and breaks the loop. This means that it does further handling for the tcmu device. Skip that handling by replacing 'break' with 'return'. Additionally change tcmu_handle_completions() from unsigned int to bool, since the value used in return already is bool. Link: https://lore.kernel.org/r/20210423150123.24468-1-bostroesser@gmail.com Signed-off-by: Bodo Stroesser Signed-off-by: Martin K. Petersen --- drivers/target/target_core_user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index eec2fd573e2b..198d25ae482a 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -1413,7 +1413,7 @@ static int tcmu_run_tmr_queue(struct tcmu_dev *udev) return 1; } -static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) +static bool tcmu_handle_completions(struct tcmu_dev *udev) { struct tcmu_mailbox *mb; struct tcmu_cmd *cmd; @@ -1456,7 +1456,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) pr_err("cmd_id %u not found, ring is broken\n", entry->hdr.cmd_id); set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); - break; + return false; } tcmu_handle_completion(cmd, entry); From 000e68faefe6240ea2e4c98b606c594b20974fb7 Mon Sep 17 00:00:00 2001 From: Bikash Hazarika Date: Mon, 26 Apr 2021 22:09:14 -0700 Subject: [PATCH 0724/1379] scsi: qla2xxx: Add marginal path handling support Add support for eh_should_retry_cmd callback in qla2xxx host template. Link: https://lore.kernel.org/r/20210427050914.7270-1-njavali@marvell.com Reviewed-by: Himanshu Madhani Signed-off-by: Bikash Hazarika Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_os.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index d74c32f84ef5..4eab564ea6a0 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -7707,6 +7707,7 @@ struct scsi_host_template qla2xxx_driver_template = { .eh_timed_out = fc_eh_timed_out, .eh_abort_handler = qla2xxx_eh_abort, + .eh_should_retry_cmd = fc_eh_should_retry_cmd, .eh_device_reset_handler = qla2xxx_eh_device_reset, .eh_target_reset_handler = qla2xxx_eh_target_reset, .eh_bus_reset_handler = qla2xxx_eh_bus_reset, From fcb16d9a8ecf1e9bfced0fc654ea4e2caa7517f4 Mon Sep 17 00:00:00 2001 From: Anastasia Kovaleva Date: Thu, 22 Apr 2021 18:34:14 +0300 Subject: [PATCH 0725/1379] scsi: qla2xxx: Prevent PRLI in target mode In a case when the initiator in P2P mode by some circumstances does not send PRLI, the target, in a case when the target port's WWPN is less than initiator's, changes the discovery state in DSC_GNL. When gnl completes it sends PRLI to the initiator. Usually the initiator in P2P mode always sends PRLI. We caught this issue on Linux stable v5.4.6 https://www.spinics.net/lists/stable/msg458515.html. Fix this particular corner case in the behaviour of the P2P mod target login state machine. Link: https://lore.kernel.org/r/20210422153414.4022-1-a.kovaleva@yadro.com Fixes: a9ed06d4e640 ("scsi: qla2xxx: Allow PLOGI in target mode") Reviewed-by: Roman Bolshakov Reviewed-by: Himanshu Madhani Signed-off-by: Anastasia Kovaleva Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_init.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index 9c5782e946e0..0de250570e39 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -1195,6 +1195,9 @@ static int qla24xx_post_prli_work(struct scsi_qla_host *vha, fc_port_t *fcport) { struct qla_work_evt *e; + if (vha->host->active_mode == MODE_TARGET) + return QLA_FUNCTION_FAILED; + e = qla2x00_alloc_work(vha, QLA_EVT_PRLI); if (!e) return QLA_FUNCTION_FAILED; From 23043dd87b153d02eaf676e752d32429be5e5126 Mon Sep 17 00:00:00 2001 From: Can Guo Date: Sun, 25 Apr 2021 20:48:38 -0700 Subject: [PATCH 0726/1379] scsi: ufs: core: Do not put UFS power into LPM if link is broken During resume, if link is broken due to AH8 failure, make sure ufshcd_resume() does not put UFS power back into LPM. Link: https://lore.kernel.org/r/1619408921-30426-2-git-send-email-cang@codeaurora.org Fixes: 4db7a2360597 ("scsi: ufs: Fix concurrency of error handler and other error recovery paths") Reviewed-by: Daejun Park Signed-off-by: Can Guo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 0625da7a42ee..19d3049a867c 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8593,7 +8593,7 @@ static void ufshcd_vreg_set_lpm(struct ufs_hba *hba) } else if (!ufshcd_is_ufs_dev_active(hba)) { ufshcd_toggle_vreg(hba->dev, hba->vreg_info.vcc, false); vcc_off = true; - if (!ufshcd_is_link_active(hba)) { + if (ufshcd_is_link_hibern8(hba) || ufshcd_is_link_off(hba)) { ufshcd_config_vreg_lpm(hba, hba->vreg_info.vccq); ufshcd_config_vreg_lpm(hba, hba->vreg_info.vccq2); } @@ -8615,7 +8615,7 @@ static int ufshcd_vreg_set_hpm(struct ufs_hba *hba) !hba->dev_info.is_lu_power_on_wp) { ret = ufshcd_setup_vreg(hba, true); } else if (!ufshcd_is_ufs_dev_active(hba)) { - if (!ret && !ufshcd_is_link_active(hba)) { + if (!ufshcd_is_link_active(hba)) { ret = ufshcd_config_vreg_hpm(hba, hba->vreg_info.vccq); if (ret) goto vcc_disable; From 637822e63b79ee8a729f7ba2645a26cf5a524ee4 Mon Sep 17 00:00:00 2001 From: Can Guo Date: Sun, 25 Apr 2021 20:48:39 -0700 Subject: [PATCH 0727/1379] scsi: ufs: core: Cancel rpm_dev_flush_recheck_work during system suspend During ufs system suspend, leaving rpm_dev_flush_recheck_work running or pending is risky because concurrency may happen between system suspend/resume and runtime resume routine. Fix this by cancelling rpm_dev_flush_recheck_work synchronously during system suspend. Link: https://lore.kernel.org/r/1619408921-30426-3-git-send-email-cang@codeaurora.org Fixes: 51dd905bd2f6 ("scsi: ufs: Fix WriteBooster flush during runtime suspend") Reviewed-by: Daejun Park Signed-off-by: Can Guo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 19d3049a867c..f65f2dc92029 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8975,6 +8975,8 @@ int ufshcd_system_suspend(struct ufs_hba *hba) if (!hba->is_powered) return 0; + cancel_delayed_work_sync(&hba->rpm_dev_flush_recheck_work); + if ((ufs_get_pm_lvl_to_dev_pwr_mode(hba->spm_lvl) == hba->curr_dev_pwr_mode) && (ufs_get_pm_lvl_to_link_pwr_state(hba->spm_lvl) == From ce4f62f9dd8cf43ac044045ed598a0b80ef33890 Mon Sep 17 00:00:00 2001 From: Can Guo Date: Sun, 25 Apr 2021 20:48:40 -0700 Subject: [PATCH 0728/1379] scsi: ufs: core: Narrow down fast path in system suspend path If spm_lvl is set to 0 or 1, when system suspend kicks start and HBA is runtime active, system suspend may just bail without doing anything (the fast path), leaving other contexts still running, e.g., clock gating and clock scaling. When system resume kicks start, concurrency can happen between ufshcd_resume() and these contexts, leading to various stability issues. Add a check against HBA's runtime state and allowing fast path only if HBA is runtime suspended, otherwise let system suspend go ahead call ufshcd_suspend(). This will guarantee that these contexts are stopped by either runtime suspend or system suspend. Link: https://lore.kernel.org/r/1619408921-30426-4-git-send-email-cang@codeaurora.org Fixes: 0b257734344a ("scsi: ufs: optimize system suspend handling") Reviewed-by: Daejun Park Signed-off-by: Can Guo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index f65f2dc92029..3eb54937f1d8 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8981,6 +8981,7 @@ int ufshcd_system_suspend(struct ufs_hba *hba) hba->curr_dev_pwr_mode) && (ufs_get_pm_lvl_to_link_pwr_state(hba->spm_lvl) == hba->uic_link_state) && + pm_runtime_suspended(hba->dev) && !hba->dev_info.b_rpm_dev_flush_capable) goto out; From d89f6048bdcb6a56abb396c584747d5eeae650db Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Thu, 22 Apr 2021 19:10:52 -0400 Subject: [PATCH 0729/1379] drm/amd/display: Reject non-zero src_y and src_x for video planes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [Why] This hasn't been well tested and leads to complete system hangs on DCN1 based systems, possibly others. The system hang can be reproduced by gesturing the video on the YouTube Android app on ChromeOS into full screen. [How] Reject atomic commits with non-zero drm_plane_state.src_x or src_y values. v2: - Add code comment describing the reason we're rejecting non-zero src_x and src_y - Drop gerrit Change-Id - Add stable CC - Based on amd-staging-drm-next v3: removed trailing whitespace Signed-off-by: Harry Wentland Cc: stable@vger.kernel.org Cc: nicholas.kazlauskas@amd.com Cc: amd-gfx@lists.freedesktop.org Cc: alexander.deucher@amd.com Cc: Roman.Li@amd.com Cc: hersenxs.wu@amd.com Cc: danny.wang@amd.com Reviewed-by: Nicholas Kazlauskas Acked-by: Christian König Reviewed-by: Hersen Wu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index a0c8c41e4e57..deec7e0466da 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4008,6 +4008,23 @@ static int fill_dc_scaling_info(const struct drm_plane_state *state, scaling_info->src_rect.x = state->src_x >> 16; scaling_info->src_rect.y = state->src_y >> 16; + /* + * For reasons we don't (yet) fully understand a non-zero + * src_y coordinate into an NV12 buffer can cause a + * system hang. To avoid hangs (and maybe be overly cautious) + * let's reject both non-zero src_x and src_y. + * + * We currently know of only one use-case to reproduce a + * scenario with non-zero src_x and src_y for NV12, which + * is to gesture the YouTube Android app into full screen + * on ChromeOS. + */ + if (state->fb && + state->fb->format->format == DRM_FORMAT_NV12 && + (scaling_info->src_rect.x != 0 || + scaling_info->src_rect.y != 0)) + return -EINVAL; + scaling_info->src_rect.width = state->src_w >> 16; if (scaling_info->src_rect.width == 0) return -EINVAL; From 20a5f5a98e1bb3d40acd97e89299e8c2d22784be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 22 Apr 2021 13:11:39 +0200 Subject: [PATCH 0730/1379] drm/amdgpu: fix concurrent VM flushes on Vega/Navi v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting with Vega the hardware supports concurrent flushes of VMID which can be used to implement per process VMID allocation. But concurrent flushes are mutual exclusive with back to back VMID allocations, fix this to avoid a VMID used in two ways at the same time. v2: don't set ring to NULL Signed-off-by: Christian König Reviewed-by: James Zhu Tested-by: James Zhu Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 19 +++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 + 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 94b069630db3..b4971e90b98c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -215,7 +215,11 @@ static int amdgpu_vmid_grab_idle(struct amdgpu_vm *vm, /* Check if we have an idle VMID */ i = 0; list_for_each_entry((*idle), &id_mgr->ids_lru, list) { - fences[i] = amdgpu_sync_peek_fence(&(*idle)->active, ring); + /* Don't use per engine and per process VMID at the same time */ + struct amdgpu_ring *r = adev->vm_manager.concurrent_flush ? + NULL : ring; + + fences[i] = amdgpu_sync_peek_fence(&(*idle)->active, r); if (!fences[i]) break; ++i; @@ -281,7 +285,7 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, if (updates && (*id)->flushed_updates && updates->context == (*id)->flushed_updates->context && !dma_fence_is_later(updates, (*id)->flushed_updates)) - updates = NULL; + updates = NULL; if ((*id)->owner != vm->immediate.fence_context || job->vm_pd_addr != (*id)->pd_gpu_addr || @@ -290,6 +294,10 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, !dma_fence_is_signaled((*id)->last_flush))) { struct dma_fence *tmp; + /* Don't use per engine and per process VMID at the same time */ + if (adev->vm_manager.concurrent_flush) + ring = NULL; + /* to prevent one context starved by another context */ (*id)->pd_gpu_addr = 0; tmp = amdgpu_sync_peek_fence(&(*id)->active, ring); @@ -365,12 +373,7 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm, if (updates && (!flushed || dma_fence_is_later(updates, flushed))) needs_flush = true; - /* Concurrent flushes are only possible starting with Vega10 and - * are broken on Navi10 and Navi14. - */ - if (needs_flush && (adev->asic_type < CHIP_VEGA10 || - adev->asic_type == CHIP_NAVI10 || - adev->asic_type == CHIP_NAVI14)) + if (needs_flush && !adev->vm_manager.concurrent_flush) continue; /* Good, we can use this VMID. Remember this submission as diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 0ffdf847cad0..9acee4a5b2ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -3148,6 +3148,12 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) { unsigned i; + /* Concurrent flushes are only possible starting with Vega10 and + * are broken on Navi10 and Navi14. + */ + adev->vm_manager.concurrent_flush = !(adev->asic_type < CHIP_VEGA10 || + adev->asic_type == CHIP_NAVI10 || + adev->asic_type == CHIP_NAVI14); amdgpu_vmid_mgr_init(adev); adev->vm_manager.fence_context = diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 976a12e5a8b9..4e140288159c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -331,6 +331,7 @@ struct amdgpu_vm_manager { /* Handling of VMIDs */ struct amdgpu_vmid_mgr id_mgr[AMDGPU_MAX_VMHUBS]; unsigned int first_kfd_vmid; + bool concurrent_flush; /* Handling of VM fences */ u64 fence_context; From b117b3964f38a988cb79825950dbd607c02237f3 Mon Sep 17 00:00:00 2001 From: Darren Powell Date: Wed, 7 Apr 2021 00:34:35 -0400 Subject: [PATCH 0731/1379] amdgpu/pm: Prevent force of DCEFCLK on NAVI10 and SIENNA_CICHLID Writing to dcefclk causes the gpu to become unresponsive, and requires a reboot. Patch ignores a .force_clk_levels(SMU_DCEFCLK) call and issues an info message. Signed-off-by: Darren Powell Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 5 ++++- drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c index f827096dc849..ac13042672ea 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c @@ -1443,7 +1443,6 @@ static int navi10_force_clk_levels(struct smu_context *smu, case SMU_SOCCLK: case SMU_MCLK: case SMU_UCLK: - case SMU_DCEFCLK: case SMU_FCLK: /* There is only 2 levels for fine grained DPM */ if (navi10_is_support_fine_grained_dpm(smu, clk_type)) { @@ -1463,6 +1462,10 @@ static int navi10_force_clk_levels(struct smu_context *smu, if (ret) return size; break; + case SMU_DCEFCLK: + dev_info(smu->adev->dev,"Setting DCEFCLK min/max dpm level is not supported!\n"); + break; + default: break; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index 72d9c1be1835..d2fd44b903ca 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -1127,7 +1127,6 @@ static int sienna_cichlid_force_clk_levels(struct smu_context *smu, case SMU_SOCCLK: case SMU_MCLK: case SMU_UCLK: - case SMU_DCEFCLK: case SMU_FCLK: /* There is only 2 levels for fine grained DPM */ if (sienna_cichlid_is_support_fine_grained_dpm(smu, clk_type)) { @@ -1147,6 +1146,9 @@ static int sienna_cichlid_force_clk_levels(struct smu_context *smu, if (ret) goto forec_level_out; break; + case SMU_DCEFCLK: + dev_info(smu->adev->dev,"Setting DCEFCLK min/max dpm level is not supported!\n"); + break; default: break; } From d7b4a6077ec38763a1f6fed2b2f6a0113028eea7 Mon Sep 17 00:00:00 2001 From: Darren Powell Date: Wed, 7 Apr 2021 18:40:34 -0400 Subject: [PATCH 0732/1379] amdgpu/pm: set pp_dpm_dcefclk to readonly on NAVI10 and newer gpus v2 : change condition to apply to all chips after NAVI10 Writing to dcefclk causes the gpu to become unresponsive, and requires a reboot. Patch prevents user from successfully writing to file pp_dpm_dcefclk on parts NAVI10 and newer, and gives better user feedback that this operation is not allowed. Signed-off-by: Darren Powell Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index 204e34549013..a8d6cc2525b2 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -1891,6 +1891,14 @@ static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_ } } + if (DEVICE_ATTR_IS(pp_dpm_dcefclk)) { + /* SMU MP1 does not support dcefclk level setting */ + if (asic_type >= CHIP_NAVI10) { + dev_attr->attr.mode &= ~S_IWUGO; + dev_attr->store = NULL; + } + } + #undef DEVICE_ATTR_IS return 0; From 3cbae5abfa8ebc8bc2b445dbe392b6987cd15483 Mon Sep 17 00:00:00 2001 From: Mikita Lipski Date: Wed, 14 Apr 2021 14:15:52 -0400 Subject: [PATCH 0733/1379] drm/amd/display: fix wrong statement in mst hpd debugfs [why] Previous statement would always evaluate to true making it meaningless [how] Just check if a connector is MST by checking if its port exists. Fixes: 41efcd3879b1df ("drm/amd/display: Add MST capability to trigger_hotplug interface") Reported-by: kernel test robot Signed-off-by: Mikita Lipski Reviewed-by: Nicholas Kazlauskas Acked-by: Wayne Lin Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c index 9a13f47022df..2cfee7d6bb4d 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c @@ -3012,7 +3012,7 @@ static int trigger_hpd_mst_set(void *data, u64 val) if (!aconnector->dc_link) continue; - if (!(aconnector->port && &aconnector->mst_port->mst_mgr)) + if (!aconnector->mst_port) continue; link = aconnector->dc_link; From 4b12ee6f426e5e36396501a58f3a1af5b92a7e06 Mon Sep 17 00:00:00 2001 From: Victor Zhao Date: Tue, 27 Apr 2021 17:52:56 +0800 Subject: [PATCH 0734/1379] drm/amdgpu: fix r initial values Sriov gets suspend of IP block failed as return value was not initialized. v2: return 0 directly to align original code semantic before this was broken out into a separate helper function instead of setting initial values Signed-off-by: Victor Zhao Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 9a2f811450ed..4fbae20839b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -1399,7 +1399,7 @@ int amdgpu_display_suspend_helper(struct amdgpu_device *adev) } } } - return r; + return 0; } int amdgpu_display_resume_helper(struct amdgpu_device *adev) From b45aeb2dea9142d4d32fa3a117ba381d84f27065 Mon Sep 17 00:00:00 2001 From: Pavan Kumar Ramayanam Date: Tue, 27 Apr 2021 10:21:18 +0530 Subject: [PATCH 0735/1379] drm/amdgpu: Handling of amdgpu_device_resume return value for graceful teardown The runtime resume PM op disregards the return value from amdgpu_device_resume(), masking errors for failed resumes at the PM layer. Reviewed-by: Alex Deucher Signed-off-by: Pavan Kumar Ramayanam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index d8f131ed10cb..b0378cfe2218 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1573,6 +1573,9 @@ static int amdgpu_pmops_runtime_resume(struct device *dev) amdgpu_device_baco_exit(drm_dev); } ret = amdgpu_device_resume(drm_dev, false); + if (ret) + return ret; + if (amdgpu_device_supports_px(drm_dev)) drm_dev->switch_power_state = DRM_SWITCH_POWER_ON; adev->in_runpm = false; From 8c3dd61cfa05a65a7e1a8a028000fc95856156c4 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 26 Apr 2021 18:50:00 +0800 Subject: [PATCH 0736/1379] drm/amdgpu: Register VGA clients after init can no longer fail When an amdgpu device fails to init, it makes another VGA device cause kernel splat: kernel: amdgpu 0000:08:00.0: amdgpu: amdgpu_device_ip_init failed kernel: amdgpu 0000:08:00.0: amdgpu: Fatal error during GPU init kernel: amdgpu: probe of 0000:08:00.0 failed with error -110 ... kernel: amdgpu 0000:01:00.0: vgaarb: changed VGA decodes: olddecodes=io+mem,decodes=none:owns=none kernel: BUG: kernel NULL pointer dereference, address: 0000000000000018 kernel: #PF: supervisor read access in kernel mode kernel: #PF: error_code(0x0000) - not-present page kernel: PGD 0 P4D 0 kernel: Oops: 0000 [#1] SMP NOPTI kernel: CPU: 6 PID: 1080 Comm: Xorg Tainted: G W 5.12.0-rc8+ #12 kernel: Hardware name: HP HP EliteDesk 805 G6/872B, BIOS S09 Ver. 02.02.00 12/30/2020 kernel: RIP: 0010:amdgpu_device_vga_set_decode+0x13/0x30 [amdgpu] kernel: Code: 06 31 c0 c3 b8 ea ff ff ff 5d c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 0f 1f 44 00 00 55 48 8b 87 90 06 00 00 48 89 e5 53 89 f3 <48> 8b 40 18 40 0f b6 f6 e8 40 58 39 fd 80 fb 01 5b 5d 19 c0 83 e0 kernel: RSP: 0018:ffffae3c0246bd68 EFLAGS: 00010002 kernel: RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 kernel: RDX: ffff8dd1af5a8560 RSI: 0000000000000000 RDI: ffff8dce8c160000 kernel: RBP: ffffae3c0246bd70 R08: ffff8dd1af5985c0 R09: ffffae3c0246ba38 kernel: R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000246 kernel: R13: 0000000000000000 R14: 0000000000000003 R15: ffff8dce81490000 kernel: FS: 00007f9303d8fa40(0000) GS:ffff8dd1af580000(0000) knlGS:0000000000000000 kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 kernel: CR2: 0000000000000018 CR3: 0000000103cfa000 CR4: 0000000000350ee0 kernel: Call Trace: kernel: vga_arbiter_notify_clients.part.0+0x4a/0x80 kernel: vga_get+0x17f/0x1c0 kernel: vga_arb_write+0x121/0x6a0 kernel: ? apparmor_file_permission+0x1c/0x20 kernel: ? security_file_permission+0x30/0x180 kernel: vfs_write+0xca/0x280 kernel: ksys_write+0x67/0xe0 kernel: __x64_sys_write+0x1a/0x20 kernel: do_syscall_64+0x38/0x90 kernel: entry_SYSCALL_64_after_hwframe+0x44/0xae kernel: RIP: 0033:0x7f93041e02f7 kernel: Code: 75 05 48 83 c4 58 c3 e8 f7 33 ff ff 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 kernel: RSP: 002b:00007fff60e49b28 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 kernel: RAX: ffffffffffffffda RBX: 000000000000000b RCX: 00007f93041e02f7 kernel: RDX: 000000000000000b RSI: 00007fff60e49b40 RDI: 000000000000000f kernel: RBP: 00007fff60e49b40 R08: 00000000ffffffff R09: 00007fff60e499d0 kernel: R10: 00007f93049350b5 R11: 0000000000000246 R12: 000056111d45e808 kernel: R13: 0000000000000000 R14: 000056111d45e7f8 R15: 000056111d46c980 kernel: Modules linked in: nls_iso8859_1 snd_hda_codec_realtek snd_hda_codec_generic ledtrig_audio snd_hda_codec_hdmi snd_hda_intel snd_intel_dspcfg snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_seq input_leds snd_seq_device snd_timer snd soundcore joydev kvm_amd serio_raw k10temp mac_hid hp_wmi ccp kvm sparse_keymap wmi_bmof ucsi_acpi efi_pstore typec_ucsi rapl typec video wmi sch_fq_codel parport_pc ppdev lp parport ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx libcrc32c xor raid6_pq raid1 raid0 multipath linear dm_mirror dm_region_hash dm_log hid_generic usbhid hid amdgpu drm_ttm_helper ttm iommu_v2 gpu_sched i2c_algo_bit drm_kms_helper syscopyarea sysfillrect crct10dif_pclmul sysimgblt crc32_pclmul fb_sys_fops ghash_clmulni_intel cec rc_core aesni_intel crypto_simd psmouse cryptd r8169 i2c_piix4 drm ahci xhci_pci realtek libahci xhci_pci_renesas gpio_amdpt gpio_generic kernel: CR2: 0000000000000018 kernel: ---[ end trace 76d04313d4214c51 ]--- Commit 4192f7b57689 ("drm/amdgpu: unmap register bar on device init failure") makes amdgpu_driver_unload_kms() skips amdgpu_device_fini(), so the VGA clients remain registered. So when vga_arbiter_notify_clients() iterates over registered clients, it causes NULL pointer dereference. Since there's no reason to register VGA clients that early, so solve the issue by putting them after all the goto cleanups. v2: - Remove redundant vga_switcheroo cleanup in failed: label. Fixes: 4192f7b57689 ("drm/amdgpu: unmap register bar on device init failure") Signed-off-by: Kai-Heng Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 28 ++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b4ad1c055c70..7d3b54615147 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3410,19 +3410,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); - /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ - /* this will fail for cards that aren't VGA class devices, just - * ignore it */ - if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) - vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); - - if (amdgpu_device_supports_px(ddev)) { - px = true; - vga_switcheroo_register_client(adev->pdev, - &amdgpu_switcheroo_ops, px); - vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); - } - if (amdgpu_emu_mode == 1) { /* post the asic on emulation mode */ emu_soc_asic_init(adev); @@ -3619,6 +3606,19 @@ fence_driver_init: if (amdgpu_device_cache_pci_state(adev->pdev)) pci_restore_state(pdev); + /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ + /* this will fail for cards that aren't VGA class devices, just + * ignore it */ + if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) + vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode); + + if (amdgpu_device_supports_px(ddev)) { + px = true; + vga_switcheroo_register_client(adev->pdev, + &amdgpu_switcheroo_ops, px); + vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain); + } + if (adev->gmc.xgmi.pending_reset) queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); @@ -3630,8 +3630,6 @@ release_ras_con: failed: amdgpu_vf_error_trans_all(adev); - if (px) - vga_switcheroo_fini_domain_pm_ops(adev->dev); failed_unmap: iounmap(adev->rmmio); From e0c16eb4b3610298a74ae5504c7f6939b12be991 Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Wed, 21 Apr 2021 11:16:35 +0200 Subject: [PATCH 0737/1379] amdgpu: fix GEM obj leak in amdgpu_display_user_framebuffer_create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This error code-path is missing a drm_gem_object_put call. Other error code-paths are fine. Signed-off-by: Simon Ser Fixes: 1769152ac64b ("drm/amdgpu: Fail fb creation from imported dma-bufs. (v2)") Cc: Alex Deucher Cc: Harry Wentland Cc: Nicholas Kazlauskas Cc: Bas Nieuwenhuizen Reviewed-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 4fbae20839b7..c4cfefb33e03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -986,6 +986,7 @@ amdgpu_display_user_framebuffer_create(struct drm_device *dev, domains = amdgpu_display_supported_domains(drm_to_adev(dev), bo->flags); if (obj->import_attach && !(domains & AMDGPU_GEM_DOMAIN_GTT)) { drm_dbg_kms(dev, "Cannot create framebuffer from imported dma_buf\n"); + drm_gem_object_put(obj); return ERR_PTR(-EINVAL); } From d385c16173f28a18866abf54c764200c276dace0 Mon Sep 17 00:00:00 2001 From: Colin Xu Date: Fri, 16 Apr 2021 16:33:55 +0800 Subject: [PATCH 0738/1379] drm/i915/gvt: Prevent divided by zero when calculating refresh rate To get refresh rate as vblank timer period and keep the precision, the calculation of rate is multiplied by 1000. However old logic was using: rate = pixel clock / (h * v / 1000). When the h/v total is invalid, like all 0, h * v / 1000 will be rounded to 0, which leads to a divided by 0 fault. 0 H/V are already checked above. Instead of divide after divide, refine the calculation to divide after multiply: "pixel clock * 1000 / (h * v)" Guest driver should guarantee the correctness of the timing regs' value. Fixes: 6a4500c7b83f ("drm/i915/gvt: Get accurate vGPU virtual display refresh rate from vreg") Reported-by: Zhenyu Wang Signed-off-by: Colin Xu Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20210416083355.159305-1-colin.xu@intel.com Reviewed-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/handlers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 477badfcb258..dda320749c65 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -669,8 +669,8 @@ static void vgpu_update_refresh_rate(struct intel_vgpu *vgpu) link_n = vgpu_vreg_t(vgpu, PIPE_LINK_N1(TRANSCODER_A)); /* Get H/V total from transcoder timing */ - htotal = (vgpu_vreg_t(vgpu, HTOTAL(TRANSCODER_A)) >> TRANS_HTOTAL_SHIFT) + 1; - vtotal = (vgpu_vreg_t(vgpu, VTOTAL(TRANSCODER_A)) >> TRANS_VTOTAL_SHIFT) + 1; + htotal = (vgpu_vreg_t(vgpu, HTOTAL(TRANSCODER_A)) >> TRANS_HTOTAL_SHIFT); + vtotal = (vgpu_vreg_t(vgpu, VTOTAL(TRANSCODER_A)) >> TRANS_VTOTAL_SHIFT); if (dp_br && link_n && htotal && vtotal) { u64 pixel_clk = 0; @@ -682,7 +682,7 @@ static void vgpu_update_refresh_rate(struct intel_vgpu *vgpu) pixel_clk *= MSEC_PER_SEC; /* Calcuate refresh rate by (pixel_clk / (h_total * v_total)) */ - new_rate = DIV64_U64_ROUND_CLOSEST(pixel_clk, div64_u64(mul_u32_u32(htotal, vtotal), MSEC_PER_SEC)); + new_rate = DIV64_U64_ROUND_CLOSEST(mul_u64_u32_shr(pixel_clk, MSEC_PER_SEC, 0), mul_u32_u32(htotal + 1, vtotal + 1)); if (*old_rate != new_rate) *old_rate = new_rate; From 9b924f4f0d8f9557f4ef8a8d1468d507a662cef1 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 29 Apr 2021 09:26:29 +0800 Subject: [PATCH 0739/1379] psci: Remove unneeded semicolon Eliminate the following coccicheck warning: ./drivers/firmware/psci/psci.c:141:2-3: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/1619659589-4775-1-git-send-email-yang.lee@linux.alibaba.com Signed-off-by: Catalin Marinas --- drivers/firmware/psci/psci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c index f5fc429cae3f..35b355e68e6d 100644 --- a/drivers/firmware/psci/psci.c +++ b/drivers/firmware/psci/psci.c @@ -138,7 +138,7 @@ static int psci_to_linux_errno(int errno) return -EINVAL; case PSCI_RET_DENIED: return -EPERM; - }; + } return -EINVAL; } From 75516c75a72b5629736c611cf45058d95978a9f2 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 23 Apr 2021 18:51:34 +0100 Subject: [PATCH 0740/1379] arm64: doc: Add brk/mmap/mremap() to the Tagged Address ABI Exceptions Prior to commit dcde237319e6 ("mm: Avoid creating virtual address aliases in brk()/mmap()/mremap()"), the kernel allowed tagged addresses to be passed to the brk/mmap/mremap() syscalls. This relaxation was tightened in 5.6 (backported to stable 5.4) but the tagged-address-abi.rst document was only partially updated. Signed-off-by: Catalin Marinas Fixes: dcde237319e6 ("mm: Avoid creating virtual address aliases in brk()/mmap()/mremap()") Reported-by: Peter Maydell Cc: Will Deacon Cc: Vincenzo Frascino Link: https://lore.kernel.org/r/20210423175134.14838-1-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- Documentation/arm64/tagged-address-abi.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/arm64/tagged-address-abi.rst b/Documentation/arm64/tagged-address-abi.rst index cbc4d4500241..459e6b66ff68 100644 --- a/Documentation/arm64/tagged-address-abi.rst +++ b/Documentation/arm64/tagged-address-abi.rst @@ -113,6 +113,12 @@ ABI relaxation: - ``shmat()`` and ``shmdt()``. +- ``brk()`` (since kernel v5.6). + +- ``mmap()`` (since kernel v5.6). + +- ``mremap()``, the ``new_address`` argument (since kernel v5.6). + Any attempt to use non-zero tagged pointers may result in an error code being returned, a (fatal) signal being raised, or other modes of failure. From 1aec7c3d05670b92b7339b19999009a93808efb9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Apr 2021 16:02:00 -0700 Subject: [PATCH 0741/1379] xfs: remove obsolete AGF counter debugging In commit f8f2835a9cf3 we changed the behavior of XFS to use EFIs to remove blocks from an overfilled AGFL because there were complaints about transaction overruns that stemmed from trying to free multiple blocks in a single transaction. Unfortunately, that commit missed a subtlety in the debug-mode transaction accounting when a realtime volume is attached. If a realtime file undergoes a data fork mapping change such that realtime extents are allocated (or freed) in the same transaction that a data device block is also allocated (or freed), we can trip a debugging assertion. This can happen (for example) if a realtime extent is allocated and it is necessary to reshape the bmbt to hold the new mapping. When we go to allocate a bmbt block from an AG, the first thing the data device block allocator does is ensure that the freelist is the proper length. If the freelist is too long, it will trim the freelist to the proper length. In debug mode, trimming the freelist calls xfs_trans_agflist_delta() to record the decrement in the AG free list count. Prior to f8f28 we would put the free block back in the free space btrees in the same transaction, which calls xfs_trans_agblocks_delta() to record the increment in the AG free block count. Since AGFL blocks are included in the global free block count (fdblocks), there is no corresponding fdblocks update, so the AGFL free satisfies the following condition in xfs_trans_apply_sb_deltas: /* * Check that superblock mods match the mods made to AGF counters. */ ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + tp->t_ag_btree_delta)); The comparison here used to be: (X + 0) == ((X+1) + -1 + 0), where X is the number blocks that were allocated. After commit f8f28 we defer the block freeing to the next chained transaction, which means that the calls to xfs_trans_agflist_delta and xfs_trans_agblocks_delta occur in separate transactions. The (first) transaction that shortens the free list trips on the comparison, which has now become: (X + 0) == ((X) + -1 + 0) because we haven't freed the AGFL block yet; we've only logged an intention to free it. When the second transaction (the deferred free) commits, it will evaluate the expression as: (0 + 0) == (1 + 0 + 0) and trip over that in turn. At this point, the astute reader may note that the two commits tagged by this patch have been in the kernel for a long time but haven't generated any bug reports. How is it that the author became aware of this bug? This originally surfaced as an intermittent failure when I was testing realtime rmap, but a different bug report by Zorro Lang reveals the same assertion occuring on !lazysbcount filesystems. The common factor to both reports (and why this problem wasn't previously reported) becomes apparent if we consider when xfs_trans_apply_sb_deltas is called by __xfs_trans_commit(): if (tp->t_flags & XFS_TRANS_SB_DIRTY) xfs_trans_apply_sb_deltas(tp); With a modern lazysbcount filesystem, transactions update only the percpu counters, so they don't need to set XFS_TRANS_SB_DIRTY, hence xfs_trans_apply_sb_deltas is rarely called. However, updates to the count of free realtime extents are not part of lazysbcount, so XFS_TRANS_SB_DIRTY will be set on transactions adding or removing data fork mappings to realtime files; similarly, XFS_TRANS_SB_DIRTY is always set on !lazysbcount filesystems. Dave mentioned in response to an earlier version of this patch: "IIUC, what you are saying is that this debug code is simply not exercised in normal testing and hasn't been for the past decade? And it still won't be exercised on anything other than realtime device testing? "...it was debugging code from 1994 that was largely turned into dead code when lazysbcounters were introduced in 2007. Hence I'm not sure it holds any value anymore." This debugging code isn't especially helpful - you can modify the flcount on one AG and the freeblks of another AG, and it won't trigger. Add the fact that nobody noticed for a decade, and let's just get rid of it (and start testing realtime :P). This bug was found by running generic/051 on either a V4 filesystem lacking lazysbcount; or a V5 filesystem with a realtime volume. Cc: bfoster@redhat.com, zlang@redhat.com Fixes: f8f2835a9cf3 ("xfs: defer agfl block frees when dfops is available") Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_alloc.c | 3 --- fs/xfs/libxfs/xfs_alloc_btree.c | 2 -- fs/xfs/libxfs/xfs_rmap_btree.c | 2 -- fs/xfs/xfs_fsops.c | 2 -- fs/xfs/xfs_trans.c | 7 ------- fs/xfs/xfs_trans.h | 15 --------------- 6 files changed, 31 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index aaa19101bb2a..f52b9e4a03f9 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -718,7 +718,6 @@ xfs_alloc_update_counters( agbp->b_pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len); - xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) { xfs_buf_mark_corrupt(agbp); @@ -2739,7 +2738,6 @@ xfs_alloc_get_freelist( pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); - xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; @@ -2846,7 +2844,6 @@ xfs_alloc_put_freelist( pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); - xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 8e01231b308e..dbe302d1cb8d 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -73,7 +73,6 @@ xfs_allocbt_alloc_block( xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); - xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); *stat = 1; @@ -97,7 +96,6 @@ xfs_allocbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index beb81c84a937..9f5bcbd834c3 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -103,7 +103,6 @@ xfs_rmapbt_alloc_block( xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); - xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); @@ -136,7 +135,6 @@ xfs_rmapbt_free_block( xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); - xfs_trans_agbtree_delta(cur->bc_tp, -1); pag = cur->bc_ag.agbp->b_pag; xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index b33c894b6cf3..be9cf88d2ad7 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -69,8 +69,6 @@ xfs_resizefs_init_new_ags( if (error) return error; - xfs_trans_agblocks_delta(tp, id->nfree); - if (delta) { *lastag_extended = true; error = xfs_ag_extend_space(mp, tp, id, delta); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bcc978011869..2b46b4f713d1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -487,13 +487,6 @@ xfs_trans_apply_sb_deltas( bp = xfs_trans_getsb(tp); sbp = bp->b_addr; - /* - * Check that superblock mods match the mods made to AGF counters. - */ - ASSERT((tp->t_fdblocks_delta + tp->t_res_fdblocks_delta) == - (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + - tp->t_ag_btree_delta)); - /* * Only update the superblock counters if we are logging them */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9dd745cf77c9..ee42d98d9011 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -140,11 +140,6 @@ typedef struct xfs_trans { int64_t t_res_fdblocks_delta; /* on-disk only chg */ int64_t t_frextents_delta;/* superblock freextents chg*/ int64_t t_res_frextents_delta; /* on-disk only chg */ -#if defined(DEBUG) || defined(XFS_WARN) - int64_t t_ag_freeblks_delta; /* debugging counter */ - int64_t t_ag_flist_delta; /* debugging counter */ - int64_t t_ag_btree_delta; /* debugging counter */ -#endif int64_t t_dblocks_delta;/* superblock dblocks change */ int64_t t_agcount_delta;/* superblock agcount change */ int64_t t_imaxpct_delta;/* superblock imaxpct change */ @@ -165,16 +160,6 @@ typedef struct xfs_trans { */ #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) -#if defined(DEBUG) || defined(XFS_WARN) -#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) -#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) -#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) -#else -#define xfs_trans_agblocks_delta(tp, d) -#define xfs_trans_agflist_delta(tp, d) -#define xfs_trans_agbtree_delta(tp, d) -#endif - /* * XFS transaction mechanism exported interfaces. */ From e6c01077ec2d28fe8b6e0bc79eddea8d788f6ea3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Apr 2021 16:02:01 -0700 Subject: [PATCH 0742/1379] xfs: don't check agf_btreeblks on pre-lazysbcount filesystems The AGF free space btree block counter wasn't added until the lazysbcount feature was added to XFS midway through the life of the V4 format, so ignore the field when checking. Online AGF repair requires rmapbt, so it doesn't need the feature check. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/scrub/agheader.c | 7 ++++++- fs/xfs/scrub/fscounters.c | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 749faa17f8e2..7a2f9b5f2db5 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -416,6 +416,10 @@ xchk_agf_xref_btreeblks( xfs_agblock_t btreeblks; int error; + /* agf_btreeblks didn't exist before lazysbcount */ + if (!xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + return; + /* Check agf_rmap_blocks; set up for agf_btreeblks check */ if (sc->sa.rmap_cur) { error = xfs_btree_count_blocks(sc->sa.rmap_cur, &blocks); @@ -581,7 +585,8 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) + if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb) && + pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); xfs_perag_put(pag); diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 7b4386c78fbf..318b81c0f90d 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -182,7 +182,8 @@ retry: /* Add up the free/freelist/bnobt/cntbt blocks */ fsc->fdblocks += pag->pagf_freeblks; fsc->fdblocks += pag->pagf_flcount; - fsc->fdblocks += pag->pagf_btreeblks; + if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + fsc->fdblocks += pag->pagf_btreeblks; /* * Per-AG reservations are taken out of the incore counters, From 6543990a168acf366f4b6174d7bd46ba15a8a2a6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 26 Apr 2021 18:28:31 -0700 Subject: [PATCH 0743/1379] xfs: update superblock counters correctly for !lazysbcount Keep the mount superblock counters up to date for !lazysbcount filesystems so that when we log the superblock they do not need updating in any way because they are already correct. It's found by what Zorro reported: 1. mkfs.xfs -f -l lazy-count=0 -m crc=0 $dev 2. mount $dev $mnt 3. fsstress -d $mnt -p 100 -n 1000 (maybe need more or less io load) 4. umount $mnt 5. xfs_repair -n $dev and I've seen no problem with this patch. Signed-off-by: Dave Chinner Reported-by: Zorro Lang Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_sb.c | 16 +++++++++++++--- fs/xfs/xfs_trans.c | 3 +++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 60e6d255e5e2..dfbbcbd448c1 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -926,9 +926,19 @@ xfs_log_sb( struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp); - mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); - mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); - mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + /* + * Lazy sb counters don't update the in-core superblock so do that now. + * If this is at unmount, the counters will be exactly correct, but at + * any other time they will only be ballpark correct because of + * reservations that have been taken out percpu counters. If we have an + * unclean shutdown, this will be corrected by log recovery rebuilding + * the counters from the AGF block counts. + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) { + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2b46b4f713d1..586f2992b789 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -622,6 +622,9 @@ xfs_trans_unreserve_and_mod_sb( /* apply remaining deltas */ spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta; + mp->m_sb.sb_icount += idelta; + mp->m_sb.sb_ifree += ifreedelta; mp->m_sb.sb_frextents += rtxdelta; mp->m_sb.sb_dblocks += tp->t_dblocks_delta; mp->m_sb.sb_agcount += tp->t_agcount_delta; From e147a756ab263f9d10eafd08b79b9fac1b08e56c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 26 Apr 2021 19:06:58 -0700 Subject: [PATCH 0744/1379] xfs: count free space btree blocks when scrubbing pre-lazysbcount fses Since agf_btreeblks didn't exist before the lazysbcount feature, the fs summary count scrubber needs to walk the free space btrees to determine the amount of space being used by those btrees. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Gao Xiang --- fs/xfs/scrub/fscounters.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 318b81c0f90d..f1d1a8c58853 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -13,6 +13,7 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_health.h" +#include "xfs_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -143,6 +144,35 @@ xchk_setup_fscounters( return xchk_trans_alloc(sc, 0); } +/* Count free space btree blocks manually for pre-lazysbcount filesystems. */ +static int +xchk_fscount_btreeblks( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc, + xfs_agnumber_t agno) +{ + xfs_extlen_t blocks; + int error; + + error = xchk_ag_init(sc, agno, &sc->sa); + if (error) + return error; + + error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + + error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); + if (error) + goto out_free; + fsc->fdblocks += blocks - 1; + +out_free: + xchk_ag_free(sc, &sc->sa); + return error; +} + /* * Calculate what the global in-core counters ought to be from the incore * per-AG structure. Callers can compare this to the actual in-core counters @@ -182,8 +212,15 @@ retry: /* Add up the free/freelist/bnobt/cntbt blocks */ fsc->fdblocks += pag->pagf_freeblks; fsc->fdblocks += pag->pagf_flcount; - if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) { fsc->fdblocks += pag->pagf_btreeblks; + } else { + error = xchk_fscount_btreeblks(sc, fsc, agno); + if (error) { + xfs_perag_put(pag); + break; + } + } /* * Per-AG reservations are taken out of the incore counters, From 2675ad3890db93e58f2264d07c2d1f615ec5adf7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 28 Apr 2021 15:05:41 -0700 Subject: [PATCH 0745/1379] xfs: unconditionally read all AGFs on mounts with perag reservation perag reservation is enabled at mount time on a per AG basis. The upcoming change to set aside allocbt blocks from block reservation requires a populated allocbt counter as soon as possible after mount to be fully effective against large perag reservations. Therefore as a preparation step, initialize the pagf on all mounts where at least one reservation is active. Note that this already occurs to some degree on most default format filesystems as reservation requirement calculations already depend on the AGF or AGI, depending on the reservation type. Signed-off-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag_resv.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 6c5f8d10589c..e32a1833d523 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -253,7 +253,8 @@ xfs_ag_resv_init( xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; - int error = 0; + int error = 0, error2; + bool has_resv = false; /* Create the metadata reservation. */ if (pag->pag_meta_resv.ar_asked == 0) { @@ -291,6 +292,8 @@ xfs_ag_resv_init( if (error) goto out; } + if (ask) + has_resv = true; } /* Create the RMAPBT metadata reservation */ @@ -304,19 +307,28 @@ xfs_ag_resv_init( error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used); if (error) goto out; + if (ask) + has_resv = true; } -#ifdef DEBUG - /* need to read in the AGF for the ASSERT below to work */ - error = xfs_alloc_pagf_init(pag->pag_mount, tp, pag->pag_agno, 0); - if (error) - return error; - - ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + - xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= - pag->pagf_freeblks + pag->pagf_flcount); -#endif out: + /* + * Initialize the pagf if we have at least one active reservation on the + * AG. This may have occurred already via reservation calculation, but + * fall back to an explicit init to ensure the in-core allocbt usage + * counters are initialized as soon as possible. This is important + * because filesystems with large perag reservations are susceptible to + * free space reservation problems that the allocbt counter is used to + * address. + */ + if (has_resv) { + error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0); + if (error2) + return error2; + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= + pag->pagf_freeblks + pag->pagf_flcount); + } return error; } From 16eaab839a9273ed156ebfccbd40c15d1e72f3d8 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 28 Apr 2021 15:05:50 -0700 Subject: [PATCH 0746/1379] xfs: introduce in-core global counter of allocbt blocks Introduce an in-core counter to track the sum of all allocbt blocks used by the filesystem. This value is currently tracked per-ag via the ->agf_btreeblks field in the AGF, which also happens to include rmapbt blocks. A global, in-core count of allocbt blocks is required to identify the subset of global ->m_fdblocks that consists of unavailable blocks currently used for allocation btrees. To support this calculation at block reservation time, construct a similar global counter for allocbt blocks, populate it on first read of each AGF and update it as allocbt blocks are used and released. Signed-off-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 14 ++++++++++++++ fs/xfs/libxfs/xfs_alloc_btree.c | 2 ++ fs/xfs/xfs_mount.h | 6 ++++++ 3 files changed, 22 insertions(+) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f52b9e4a03f9..82b7cbb1f24f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3033,6 +3033,7 @@ xfs_alloc_read_agf( struct xfs_agf *agf; /* ag freelist header */ struct xfs_perag *pag; /* per allocation group data */ int error; + int allocbt_blks; trace_xfs_alloc_read_agf(mp, agno); @@ -3063,6 +3064,19 @@ xfs_alloc_read_agf( pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); pag->pagf_init = 1; pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); + + /* + * Update the in-core allocbt counter. Filter out the rmapbt + * subset of the btreeblks counter because the rmapbt is managed + * by perag reservation. Subtract one for the rmapbt root block + * because the rmap counter includes it while the btreeblks + * counter only tracks non-root blocks. + */ + allocbt_blks = pag->pagf_btreeblks; + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; + if (allocbt_blks > 0) + atomic64_add(allocbt_blks, &mp->m_allocbt_blks); } #ifdef DEBUG else if (!XFS_FORCED_SHUTDOWN(mp)) { diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index dbe302d1cb8d..a43e4c50e69b 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -71,6 +71,7 @@ xfs_allocbt_alloc_block( return 0; } + atomic64_inc(&cur->bc_mp->m_allocbt_blks); xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); new->s = cpu_to_be32(bno); @@ -94,6 +95,7 @@ xfs_allocbt_free_block( if (error) return error; + atomic64_dec(&cur->bc_mp->m_allocbt_blks); xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, XFS_EXTENT_BUSY_SKIP_DISCARD); return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 81829d19596e..bb67274ee23f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -170,6 +170,12 @@ typedef struct xfs_mount { * extents or anything related to the rt device. */ struct percpu_counter m_delalloc_blks; + /* + * Global count of allocation btree blocks in use across all AGs. Only + * used when perag reservation is enabled. Helps prevent block + * reservation from attempting to reserve allocation btree blocks. + */ + atomic64_t m_allocbt_blks; struct radix_tree_root m_perag_tree; /* per-ag accounting info */ spinlock_t m_perag_lock; /* lock for m_perag_tree */ From fd43cf600cf61c66ae0a1021aca2f636115c7fcb Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 28 Apr 2021 15:06:05 -0700 Subject: [PATCH 0747/1379] xfs: set aside allocation btree blocks from block reservation The blocks used for allocation btrees (bnobt and countbt) are technically considered free space. This is because as free space is used, allocbt blocks are removed and naturally become available for traditional allocation. However, this means that a significant portion of free space may consist of in-use btree blocks if free space is severely fragmented. On large filesystems with large perag reservations, this can lead to a rare but nasty condition where a significant amount of physical free space is available, but the majority of actual usable blocks consist of in-use allocbt blocks. We have a record of a (~12TB, 32 AG) filesystem with multiple AGs in a state with ~2.5GB or so free blocks tracked across ~300 total allocbt blocks, but effectively at 100% full because the the free space is entirely consumed by refcountbt perag reservation. Such a large perag reservation is by design on large filesystems. The problem is that because the free space is so fragmented, this AG contributes the 300 or so allocbt blocks to the global counters as free space. If this pattern repeats across enough AGs, the filesystem lands in a state where global block reservation can outrun physical block availability. For example, a streaming buffered write on the affected filesystem continues to allow delayed allocation beyond the point where writeback starts to fail due to physical block allocation failures. The expected behavior is for the delalloc block reservation to fail gracefully with -ENOSPC before physical block allocation failure is a possibility. To address this problem, set aside in-use allocbt blocks at reservation time and thus ensure they cannot be reserved until truly available for physical allocation. This allows alloc btree metadata to continue to reside in free space, but dynamically adjusts reservation availability based on internal state. Note that the logic requires that the allocbt counter is fully populated at reservation time before it is fully effective. We currently rely on the mount time AGF scan in the perag reservation initialization code for this dependency on filesystems where it's most important (i.e. with active perag reservations). Signed-off-by: Brian Foster Reviewed-by: Chandan Babu R Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_mount.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index cb1e2c4702c3..bdfee1943796 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1188,6 +1188,7 @@ xfs_mod_fdblocks( int64_t lcounter; long long res_used; s32 batch; + uint64_t set_aside; if (delta > 0) { /* @@ -1227,8 +1228,20 @@ xfs_mod_fdblocks( else batch = XFS_FDBLOCKS_BATCH; + /* + * Set aside allocbt blocks because these blocks are tracked as free + * space but not available for allocation. Technically this means that a + * single reservation cannot consume all remaining free space, but the + * ratio of allocbt blocks to usable free blocks should be rather small. + * The tradeoff without this is that filesystems that maintain high + * perag block reservations can over reserve physical block availability + * and fail physical allocation, which leads to much more serious + * problems (i.e. transaction abort, pagecache discards, etc.) than + * slightly premature -ENOSPC. + */ + set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); - if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside, + if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; From d4f74e162d238ce00a640af5f0611c3f51dad70e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 28 Apr 2021 22:41:39 -0700 Subject: [PATCH 0748/1379] xfs: fix xfs_reflink_unshare usage of filemap_write_and_wait_range The final parameter of filemap_write_and_wait_range is the end of the range to flush, not the length of the range to flush. Fixes: 46afb0628b86 ("xfs: only flush the unshared range in xfs_reflink_unshare") Signed-off-by: Darrick J. Wong Reviewed-by: Chandan Babu R Reviewed-by: Brian Foster --- fs/xfs/xfs_reflink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4dd4af6ac2ef..060695d6d56a 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1522,7 +1522,8 @@ xfs_reflink_unshare( if (error) goto out; - error = filemap_write_and_wait_range(inode->i_mapping, offset, len); + error = filemap_write_and_wait_range(inode->i_mapping, offset, + offset + len - 1); if (error) goto out; From 07b4523e9e2fe9763e5c62da032d3c444e83d0fd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 28 Apr 2021 13:32:53 -0500 Subject: [PATCH 0749/1379] PCI/sysfs: Rename "vpd" attribute accessors Rename "vpd" attribute accessors so they fit with the BIN_ATTR_RW() macro usage. Currently there is no BIN_ATTR_ADMIN_RW() that uses 0600 permissions, but if there were, it would likely use "vpd_read()" and "vpd_write()". No functional change intended. Extracted from the patch mentioned below by Heiner Kallweit . Link: https://lore.kernel.org/linux-pci/7703024f-8882-9eec-a122-599871728a89@gmail.com/ Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 7915d10f9aa1..874bc155ffb5 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -397,9 +397,9 @@ void pci_vpd_release(struct pci_dev *dev) kfree(dev->vpd); } -static ssize_t read_vpd_attr(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t off, size_t count) +static ssize_t vpd_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, + size_t count) { struct pci_dev *dev = to_pci_dev(kobj_to_dev(kobj)); @@ -413,9 +413,9 @@ static ssize_t read_vpd_attr(struct file *filp, struct kobject *kobj, return pci_read_vpd(dev, off, count, buf); } -static ssize_t write_vpd_attr(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t off, size_t count) +static ssize_t vpd_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, loff_t off, + size_t count) { struct pci_dev *dev = to_pci_dev(kobj_to_dev(kobj)); @@ -445,8 +445,8 @@ void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev) attr->size = 0; attr->attr.name = "vpd"; attr->attr.mode = S_IRUSR | S_IWUSR; - attr->read = read_vpd_attr; - attr->write = write_vpd_attr; + attr->read = vpd_read; + attr->write = vpd_write; retval = sysfs_create_bin_file(&dev->dev.kobj, attr); if (retval) { kfree(attr); From d93f8399053dcf117ff56a3029ff08c0e36f4b75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Fri, 16 Apr 2021 20:58:40 +0000 Subject: [PATCH 0750/1379] PCI/sysfs: Convert "vpd" to static attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "vpd" sysfs attribute allows access to Vital Product Data (VPD). Previously it was dynamically created either by pci_bus_add_device() or the pci_sysfs_init() initcall, but since it doesn't need to be created or removed dynamically, we can use a static attribute so the device model takes care of addition and removal automatically. Convert "vpd" to a static attribute and use the .is_bin_visible() callback to check whether the device supports VPD. Remove pcie_vpd_create_sysfs_dev_files(), pcie_vpd_remove_sysfs_dev_files(), pci_create_capabilities_sysfs(), and pci_create_capabilities_sysfs(), which are no longer needed. [bhelgaas: This is substantially the same as the earlier patch from Heiner Kallweit . I included Krzysztof's change here so all the "convert to static attribute" changes are together.] [bhelgaas: rename to vpd_read()/vpd_write() and pci_dev_vpd_attr_group] Suggested-by: Oliver O'Halloran Based-on: https://lore.kernel.org/r/7703024f-8882-9eec-a122-599871728a89@gmail.com Based-on-patch-by: Heiner Kallweit Link: https://lore.kernel.org/r/20210416205856.3234481-5-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 15 +------------- drivers/pci/pci.h | 3 +-- drivers/pci/vpd.c | 46 ++++++++++++++--------------------------- 3 files changed, 18 insertions(+), 46 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index c469d9cad0a8..6dc01fe21fd2 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1378,12 +1378,6 @@ static const struct attribute_group pci_dev_reset_attr_group = { .is_visible = pci_dev_reset_attr_is_visible, }; - -static void pci_create_capabilities_sysfs(struct pci_dev *dev) -{ - pcie_vpd_create_sysfs_dev_files(dev); -} - int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) { int retval; @@ -1395,18 +1389,11 @@ int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) if (retval) return retval; - /* add sysfs entries for various capabilities */ - pci_create_capabilities_sysfs(pdev); pci_create_firmware_label_files(pdev); return 0; } -static void pci_remove_capabilities_sysfs(struct pci_dev *dev) -{ - pcie_vpd_remove_sysfs_dev_files(dev); -} - /** * pci_remove_sysfs_dev_files - cleanup PCI specific sysfs files * @pdev: device whose entries we should free @@ -1418,7 +1405,6 @@ void pci_remove_sysfs_dev_files(struct pci_dev *pdev) if (!sysfs_initialized) return; - pci_remove_capabilities_sysfs(pdev); pci_remove_resource_files(pdev); pci_remove_firmware_label_files(pdev); } @@ -1514,6 +1500,7 @@ const struct attribute_group *pci_dev_groups[] = { &pci_dev_config_attr_group, &pci_dev_rom_attr_group, &pci_dev_reset_attr_group, + &pci_dev_vpd_attr_group, NULL, }; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ef7c4661314f..14dcffc8a0ec 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -143,8 +143,7 @@ static inline bool pcie_downstream_port(const struct pci_dev *dev) int pci_vpd_init(struct pci_dev *dev); void pci_vpd_release(struct pci_dev *dev); -void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev); -void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev); +extern const struct attribute_group pci_dev_vpd_attr_group; /* PCI Virtual Channel */ int pci_save_vc_state(struct pci_dev *dev); diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 874bc155ffb5..6168cf208d63 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -21,7 +21,6 @@ struct pci_vpd_ops { struct pci_vpd { const struct pci_vpd_ops *ops; - struct bin_attribute *attr; /* Descriptor for sysfs VPD entry */ struct mutex lock; unsigned int len; u16 flag; @@ -428,41 +427,28 @@ static ssize_t vpd_write(struct file *filp, struct kobject *kobj, return pci_write_vpd(dev, off, count, buf); } +static BIN_ATTR(vpd, 0600, vpd_read, vpd_write, 0); -void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev) +static struct bin_attribute *vpd_attrs[] = { + &bin_attr_vpd, + NULL, +}; + +static umode_t vpd_attr_is_visible(struct kobject *kobj, + struct bin_attribute *a, int n) { - int retval; - struct bin_attribute *attr; + struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj)); - if (!dev->vpd) - return; + if (!pdev->vpd) + return 0; - attr = kzalloc(sizeof(*attr), GFP_ATOMIC); - if (!attr) - return; - - sysfs_bin_attr_init(attr); - attr->size = 0; - attr->attr.name = "vpd"; - attr->attr.mode = S_IRUSR | S_IWUSR; - attr->read = vpd_read; - attr->write = vpd_write; - retval = sysfs_create_bin_file(&dev->dev.kobj, attr); - if (retval) { - kfree(attr); - return; - } - - dev->vpd->attr = attr; + return a->attr.mode; } -void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev) -{ - if (dev->vpd && dev->vpd->attr) { - sysfs_remove_bin_file(&dev->dev.kobj, dev->vpd->attr); - kfree(dev->vpd->attr); - } -} +const struct attribute_group pci_dev_vpd_attr_group = { + .bin_attrs = vpd_attrs, + .is_bin_visible = vpd_attr_is_visible, +}; int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt) { From 1017275d2e43dba68527e0e69f4cc12d2b0f8966 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 27 Apr 2021 10:39:16 -0500 Subject: [PATCH 0751/1379] PCI/sysfs: Rename device_has_dsm() to device_has_acpi_name() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename device_has_dsm() to device_has_acpi_name() to better reflect its purpose and move it earlier so it's available for a future SMBIOS .is_visible() function. No functional change intended. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20210416205856.3234481-6-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-label.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index 781e45cf60d1..5fd5824db82f 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -33,6 +33,22 @@ #include #include "pci.h" +static bool device_has_acpi_name(struct device *dev) +{ +#ifdef CONFIG_ACPI + acpi_handle handle; + + handle = ACPI_HANDLE(dev); + if (!handle) + return false; + + return acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2, + 1 << DSM_PCI_DEVICE_NAME); +#else + return false; +#endif +} + #ifdef CONFIG_DMI enum smbios_attr_enum { SMBIOS_ATTR_NONE = 0, @@ -209,18 +225,6 @@ static int dsm_get_label(struct device *dev, char *buf, return len; } -static bool device_has_dsm(struct device *dev) -{ - acpi_handle handle; - - handle = ACPI_HANDLE(dev); - if (!handle) - return false; - - return !!acpi_check_dsm(handle, &pci_acpi_dsm_guid, 0x2, - 1 << DSM_PCI_DEVICE_NAME); -} - static umode_t acpi_index_string_exist(struct kobject *kobj, struct attribute *attr, int n) { @@ -228,7 +232,7 @@ static umode_t acpi_index_string_exist(struct kobject *kobj, dev = kobj_to_dev(kobj); - if (device_has_dsm(dev)) + if (device_has_acpi_name(dev)) return S_IRUGO; return 0; @@ -287,16 +291,11 @@ static inline int pci_remove_acpi_index_label_files(struct pci_dev *pdev) { return -1; } - -static inline bool device_has_dsm(struct device *dev) -{ - return false; -} #endif void pci_create_firmware_label_files(struct pci_dev *pdev) { - if (device_has_dsm(&pdev->dev)) + if (device_has_acpi_name(&pdev->dev)) pci_create_acpi_index_label_files(pdev); else pci_create_smbiosname_file(pdev); @@ -304,7 +303,7 @@ void pci_create_firmware_label_files(struct pci_dev *pdev) void pci_remove_firmware_label_files(struct pci_dev *pdev) { - if (device_has_dsm(&pdev->dev)) + if (device_has_acpi_name(&pdev->dev)) pci_remove_acpi_index_label_files(pdev); else pci_remove_smbiosname_file(pdev); From 2ed6494155444dd4d2005869edce1ae73b4f23ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 27 Apr 2021 13:48:51 -0500 Subject: [PATCH 0752/1379] PCI/sysfs: Define ACPI label attributes with DEVICE_ATTR*() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use DEVICE_ATTR*() to simplify definitions of the ACPI label attributes. No functional change intended. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20210416205856.3234481-6-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-label.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index 5fd5824db82f..4fef6fc0740f 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -225,50 +225,42 @@ static int dsm_get_label(struct device *dev, char *buf, return len; } -static umode_t acpi_index_string_exist(struct kobject *kobj, - struct attribute *attr, int n) +static umode_t acpi_attr_is_visible(struct kobject *kobj, struct attribute *a, + int n) { struct device *dev; dev = kobj_to_dev(kobj); - if (device_has_acpi_name(dev)) - return S_IRUGO; + if (!device_has_acpi_name(dev)) + return 0; - return 0; + return a->mode; } -static ssize_t acpilabel_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t label_show(struct device *dev, struct device_attribute *attr, + char *buf) { return dsm_get_label(dev, buf, ACPI_ATTR_LABEL_SHOW); } +static DEVICE_ATTR_RO(label); -static ssize_t acpiindex_show(struct device *dev, +static ssize_t acpi_index_show(struct device *dev, struct device_attribute *attr, char *buf) { return dsm_get_label(dev, buf, ACPI_ATTR_INDEX_SHOW); } +static DEVICE_ATTR_RO(acpi_index); -static struct device_attribute acpi_attr_label = { - .attr = {.name = "label", .mode = 0444}, - .show = acpilabel_show, -}; - -static struct device_attribute acpi_attr_index = { - .attr = {.name = "acpi_index", .mode = 0444}, - .show = acpiindex_show, -}; - -static struct attribute *acpi_attributes[] = { - &acpi_attr_label.attr, - &acpi_attr_index.attr, +static struct attribute *acpi_attrs[] = { + &dev_attr_label.attr, + &dev_attr_acpi_index.attr, NULL, }; static const struct attribute_group acpi_attr_group = { - .attrs = acpi_attributes, - .is_visible = acpi_index_string_exist, + .attrs = acpi_attrs, + .is_visible = acpi_attr_is_visible, }; static int pci_create_acpi_index_label_files(struct pci_dev *pdev) From 4dd7dfa166d220a245ee21f499bb1084bc249393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 27 Apr 2021 14:18:26 -0500 Subject: [PATCH 0753/1379] PCI/sysfs: Define SMBIOS label attributes with DEVICE_ATTR*() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use DEVICE_ATTR*() to simplify definition of the SMBIOS label attributes. No functional change intended. Note that dev_attr_smbios_label requires __ATTR() because the "label" attribute can be exposed via either ACPI or SMBIOS, and we already have the ACPI label_show() function in this file. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20210416205856.3234481-6-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-label.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index 4fef6fc0740f..6c0c4e700bc7 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -92,8 +92,8 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf, return 0; } -static umode_t smbios_instance_string_exist(struct kobject *kobj, - struct attribute *attr, int n) +static umode_t smbios_attr_is_visible(struct kobject *kobj, struct attribute *a, + int n) { struct device *dev; struct pci_dev *pdev; @@ -101,12 +101,14 @@ static umode_t smbios_instance_string_exist(struct kobject *kobj, dev = kobj_to_dev(kobj); pdev = to_pci_dev(dev); - return find_smbios_instance_string(pdev, NULL, SMBIOS_ATTR_NONE) ? - S_IRUGO : 0; + if (!find_smbios_instance_string(pdev, NULL, SMBIOS_ATTR_NONE)) + return 0; + + return a->mode; } -static ssize_t smbioslabel_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t smbios_label_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct pci_dev *pdev; pdev = to_pci_dev(dev); @@ -114,9 +116,11 @@ static ssize_t smbioslabel_show(struct device *dev, return find_smbios_instance_string(pdev, buf, SMBIOS_ATTR_LABEL_SHOW); } +static struct device_attribute dev_attr_smbios_label = __ATTR(label, 0444, + smbios_label_show, NULL); -static ssize_t smbiosinstance_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t index_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct pci_dev *pdev; pdev = to_pci_dev(dev); @@ -124,26 +128,17 @@ static ssize_t smbiosinstance_show(struct device *dev, return find_smbios_instance_string(pdev, buf, SMBIOS_ATTR_INSTANCE_SHOW); } +static DEVICE_ATTR_RO(index); -static struct device_attribute smbios_attr_label = { - .attr = {.name = "label", .mode = 0444}, - .show = smbioslabel_show, -}; - -static struct device_attribute smbios_attr_instance = { - .attr = {.name = "index", .mode = 0444}, - .show = smbiosinstance_show, -}; - -static struct attribute *smbios_attributes[] = { - &smbios_attr_label.attr, - &smbios_attr_instance.attr, +static struct attribute *smbios_attrs[] = { + &dev_attr_smbios_label.attr, + &dev_attr_index.attr, NULL, }; static const struct attribute_group smbios_attr_group = { - .attrs = smbios_attributes, - .is_visible = smbios_instance_string_exist, + .attrs = smbios_attrs, + .is_visible = smbios_attr_is_visible, }; static int pci_create_smbiosname_file(struct pci_dev *pdev) From 506140f9c06b0d136669ae7795e0264c9f21c1a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 27 Apr 2021 10:49:16 -0500 Subject: [PATCH 0754/1379] PCI/sysfs: Convert "index", "acpi_index", "label" to static attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "label", "index", and "acpi_index" sysfs attributes show firmware label information about the device. If the ACPI Device Name _DSM is implemented for the device, we have: label Device name (optional, may be null) acpi_index Instance number (unique under \_SB scope) When there is no ACPI _DSM and SMBIOS provides an Onboard Devices structure for the device, we have: label Reference Designation, e.g., a silkscreen label index Device Type Instance Previously these attributes were dynamically created either by pci_bus_add_device() or the pci_sysfs_init() initcall, but since they don't need to be created or removed dynamically, we can use a static attribute so the device model takes care of addition and removal automatically. Convert "label", "index", and "acpi_index" to static attributes. Presence of the ACPI _DSM (device_has_acpi_name()) determines whether the ACPI information (label, acpi_index) or the SMBIOS information (label, index) is visible. [bhelgaas: commit log, split to separate patch, add "pci_dev_" prefix] Suggested-by: Oliver O'Halloran Link: https://lore.kernel.org/r/20210416205856.3234481-6-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-label.c | 63 ++++------------------------------------- drivers/pci/pci-sysfs.c | 17 +++++------ drivers/pci/pci.h | 13 +++------ 3 files changed, 16 insertions(+), 77 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index 6c0c4e700bc7..cdcbe09d363e 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -101,6 +101,9 @@ static umode_t smbios_attr_is_visible(struct kobject *kobj, struct attribute *a, dev = kobj_to_dev(kobj); pdev = to_pci_dev(dev); + if (device_has_acpi_name(dev)) + return 0; + if (!find_smbios_instance_string(pdev, NULL, SMBIOS_ATTR_NONE)) return 0; @@ -136,29 +139,10 @@ static struct attribute *smbios_attrs[] = { NULL, }; -static const struct attribute_group smbios_attr_group = { +const struct attribute_group pci_dev_smbios_attr_group = { .attrs = smbios_attrs, .is_visible = smbios_attr_is_visible, }; - -static int pci_create_smbiosname_file(struct pci_dev *pdev) -{ - return sysfs_create_group(&pdev->dev.kobj, &smbios_attr_group); -} - -static void pci_remove_smbiosname_file(struct pci_dev *pdev) -{ - sysfs_remove_group(&pdev->dev.kobj, &smbios_attr_group); -} -#else -static inline int pci_create_smbiosname_file(struct pci_dev *pdev) -{ - return -1; -} - -static inline void pci_remove_smbiosname_file(struct pci_dev *pdev) -{ -} #endif #ifdef CONFIG_ACPI @@ -253,45 +237,8 @@ static struct attribute *acpi_attrs[] = { NULL, }; -static const struct attribute_group acpi_attr_group = { +const struct attribute_group pci_dev_acpi_attr_group = { .attrs = acpi_attrs, .is_visible = acpi_attr_is_visible, }; - -static int pci_create_acpi_index_label_files(struct pci_dev *pdev) -{ - return sysfs_create_group(&pdev->dev.kobj, &acpi_attr_group); -} - -static int pci_remove_acpi_index_label_files(struct pci_dev *pdev) -{ - sysfs_remove_group(&pdev->dev.kobj, &acpi_attr_group); - return 0; -} -#else -static inline int pci_create_acpi_index_label_files(struct pci_dev *pdev) -{ - return -1; -} - -static inline int pci_remove_acpi_index_label_files(struct pci_dev *pdev) -{ - return -1; -} #endif - -void pci_create_firmware_label_files(struct pci_dev *pdev) -{ - if (device_has_acpi_name(&pdev->dev)) - pci_create_acpi_index_label_files(pdev); - else - pci_create_smbiosname_file(pdev); -} - -void pci_remove_firmware_label_files(struct pci_dev *pdev) -{ - if (device_has_acpi_name(&pdev->dev)) - pci_remove_acpi_index_label_files(pdev); - else - pci_remove_smbiosname_file(pdev); -} diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 6dc01fe21fd2..befcd04ae0c8 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -1380,18 +1380,10 @@ static const struct attribute_group pci_dev_reset_attr_group = { int __must_check pci_create_sysfs_dev_files(struct pci_dev *pdev) { - int retval; - if (!sysfs_initialized) return -EACCES; - retval = pci_create_resource_files(pdev); - if (retval) - return retval; - - pci_create_firmware_label_files(pdev); - - return 0; + return pci_create_resource_files(pdev); } /** @@ -1406,7 +1398,6 @@ void pci_remove_sysfs_dev_files(struct pci_dev *pdev) return; pci_remove_resource_files(pdev); - pci_remove_firmware_label_files(pdev); } static int __init pci_sysfs_init(void) @@ -1501,6 +1492,12 @@ const struct attribute_group *pci_dev_groups[] = { &pci_dev_rom_attr_group, &pci_dev_reset_attr_group, &pci_dev_vpd_attr_group, +#ifdef CONFIG_DMI + &pci_dev_smbios_attr_group, +#endif +#ifdef CONFIG_ACPI + &pci_dev_acpi_attr_group, +#endif NULL, }; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 14dcffc8a0ec..34449e9a6f0a 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -21,16 +21,10 @@ bool pcie_cap_has_rtctl(const struct pci_dev *dev); int pci_create_sysfs_dev_files(struct pci_dev *pdev); void pci_remove_sysfs_dev_files(struct pci_dev *pdev); -#if !defined(CONFIG_DMI) && !defined(CONFIG_ACPI) -static inline void pci_create_firmware_label_files(struct pci_dev *pdev) -{ return; } -static inline void pci_remove_firmware_label_files(struct pci_dev *pdev) -{ return; } -#else -void pci_create_firmware_label_files(struct pci_dev *pdev); -void pci_remove_firmware_label_files(struct pci_dev *pdev); -#endif void pci_cleanup_rom(struct pci_dev *dev); +#ifdef CONFIG_DMI +extern const struct attribute_group pci_dev_smbios_attr_group; +#endif enum pci_mmap_api { PCI_MMAP_SYSFS, /* mmap on /sys/bus/pci/devices//resource */ @@ -695,6 +689,7 @@ static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL #ifdef CONFIG_ACPI int pci_acpi_program_hp_params(struct pci_dev *dev); +extern const struct attribute_group pci_dev_acpi_attr_group; #else static inline int pci_acpi_program_hp_params(struct pci_dev *dev) { From 362fb766264a1d62254ad950304fa1d97172bb44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 27 Apr 2021 15:04:44 -0500 Subject: [PATCH 0755/1379] PCI/sysfs: Tidy SMBIOS & ACPI label attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update coding style to reduce distraction. No functional change intended. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20210416205856.3234481-6-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-label.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index cdcbe09d363e..c4636f2e8545 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -36,9 +36,8 @@ static bool device_has_acpi_name(struct device *dev) { #ifdef CONFIG_ACPI - acpi_handle handle; + acpi_handle handle = ACPI_HANDLE(dev); - handle = ACPI_HANDLE(dev); if (!handle) return false; @@ -61,13 +60,9 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf, { const struct dmi_device *dmi; struct dmi_dev_onboard *donboard; - int domain_nr; - int bus; - int devfn; - - domain_nr = pci_domain_nr(pdev->bus); - bus = pdev->bus->number; - devfn = pdev->devfn; + int domain_nr = pci_domain_nr(pdev->bus); + int bus = pdev->bus->number; + int devfn = pdev->devfn; dmi = NULL; while ((dmi = dmi_find_device(DMI_DEV_TYPE_DEV_ONBOARD, @@ -95,11 +90,8 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf, static umode_t smbios_attr_is_visible(struct kobject *kobj, struct attribute *a, int n) { - struct device *dev; - struct pci_dev *pdev; - - dev = kobj_to_dev(kobj); - pdev = to_pci_dev(dev); + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); if (device_has_acpi_name(dev)) return 0; @@ -113,8 +105,7 @@ static umode_t smbios_attr_is_visible(struct kobject *kobj, struct attribute *a, static ssize_t smbios_label_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct pci_dev *pdev; - pdev = to_pci_dev(dev); + struct pci_dev *pdev = to_pci_dev(dev); return find_smbios_instance_string(pdev, buf, SMBIOS_ATTR_LABEL_SHOW); @@ -125,8 +116,7 @@ static struct device_attribute dev_attr_smbios_label = __ATTR(label, 0444, static ssize_t index_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct pci_dev *pdev; - pdev = to_pci_dev(dev); + struct pci_dev *pdev = to_pci_dev(dev); return find_smbios_instance_string(pdev, buf, SMBIOS_ATTR_INSTANCE_SHOW); @@ -164,11 +154,10 @@ static void dsm_label_utf16s_to_utf8s(union acpi_object *obj, char *buf) static int dsm_get_label(struct device *dev, char *buf, enum acpi_attr_enum attr) { - acpi_handle handle; + acpi_handle handle = ACPI_HANDLE(dev); union acpi_object *obj, *tmp; int len = -1; - handle = ACPI_HANDLE(dev); if (!handle) return -1; @@ -207,9 +196,7 @@ static int dsm_get_label(struct device *dev, char *buf, static umode_t acpi_attr_is_visible(struct kobject *kobj, struct attribute *a, int n) { - struct device *dev; - - dev = kobj_to_dev(kobj); + struct device *dev = kobj_to_dev(kobj); if (!device_has_acpi_name(dev)) return 0; From df1af7cbe7bc11720b3e915771d47acc3604eb44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 27 Apr 2021 15:18:47 -0500 Subject: [PATCH 0756/1379] PCI/sysfs: Rearrange smbios_attr_group and acpi_attr_group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collect the smbios_attr_group and acpi_attr_group together in the logical order. No functional change intended. [bhelgaas: split to separate patch] Link: https://lore.kernel.org/r/20210416205856.3234481-6-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-label.c | 52 ++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index c4636f2e8545..f94ce0328609 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -87,21 +87,6 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf, return 0; } -static umode_t smbios_attr_is_visible(struct kobject *kobj, struct attribute *a, - int n) -{ - struct device *dev = kobj_to_dev(kobj); - struct pci_dev *pdev = to_pci_dev(dev); - - if (device_has_acpi_name(dev)) - return 0; - - if (!find_smbios_instance_string(pdev, NULL, SMBIOS_ATTR_NONE)) - return 0; - - return a->mode; -} - static ssize_t smbios_label_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -129,6 +114,21 @@ static struct attribute *smbios_attrs[] = { NULL, }; +static umode_t smbios_attr_is_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct pci_dev *pdev = to_pci_dev(dev); + + if (device_has_acpi_name(dev)) + return 0; + + if (!find_smbios_instance_string(pdev, NULL, SMBIOS_ATTR_NONE)) + return 0; + + return a->mode; +} + const struct attribute_group pci_dev_smbios_attr_group = { .attrs = smbios_attrs, .is_visible = smbios_attr_is_visible, @@ -193,17 +193,6 @@ static int dsm_get_label(struct device *dev, char *buf, return len; } -static umode_t acpi_attr_is_visible(struct kobject *kobj, struct attribute *a, - int n) -{ - struct device *dev = kobj_to_dev(kobj); - - if (!device_has_acpi_name(dev)) - return 0; - - return a->mode; -} - static ssize_t label_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -224,6 +213,17 @@ static struct attribute *acpi_attrs[] = { NULL, }; +static umode_t acpi_attr_is_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + + if (!device_has_acpi_name(dev)) + return 0; + + return a->mode; +} + const struct attribute_group pci_dev_acpi_attr_group = { .attrs = acpi_attrs, .is_visible = acpi_attr_is_visible, From ad025f8e46f3dbf09b1bf8d7a5b4ce858df74544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Fri, 16 Apr 2021 20:58:45 +0000 Subject: [PATCH 0757/1379] PCI/sysfs: Use sysfs_emit() and sysfs_emit_at() in "show" functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs_emit() and sysfs_emit_at() functions were introduced to make it less ambiguous which function is preferred when writing to the output buffer in a device attribute's "show" callback [1]. Convert the PCI sysfs object "show" functions from sprintf(), snprintf() and scnprintf() to sysfs_emit() and sysfs_emit_at() accordingly, as the latter is aware of the PAGE_SIZE buffer and correctly returns the number of bytes written into the buffer. No functional change intended. [1] Documentation/filesystems/sysfs.rst [bhelgaas: drop dsm_label_utf16s_to_utf8s(), link speed/width changes] Link: https://lore.kernel.org/r/20210416205856.3234481-10-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-label.c | 10 +++--- drivers/pci/pci-sysfs.c | 72 ++++++++++++++++++++--------------------- 2 files changed, 40 insertions(+), 42 deletions(-) diff --git a/drivers/pci/pci-label.c b/drivers/pci/pci-label.c index f94ce0328609..c32f3b7540e8 100644 --- a/drivers/pci/pci-label.c +++ b/drivers/pci/pci-label.c @@ -73,13 +73,11 @@ static size_t find_smbios_instance_string(struct pci_dev *pdev, char *buf, donboard->devfn == devfn) { if (buf) { if (attribute == SMBIOS_ATTR_INSTANCE_SHOW) - return scnprintf(buf, PAGE_SIZE, - "%d\n", - donboard->instance); + return sysfs_emit(buf, "%d\n", + donboard->instance); else if (attribute == SMBIOS_ATTR_LABEL_SHOW) - return scnprintf(buf, PAGE_SIZE, - "%s\n", - dmi->name); + return sysfs_emit(buf, "%s\n", + dmi->name); } return strlen(dmi->name); } diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index befcd04ae0c8..e216d715d26b 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -39,7 +39,7 @@ field##_show(struct device *dev, struct device_attribute *attr, char *buf) \ struct pci_dev *pdev; \ \ pdev = to_pci_dev(dev); \ - return sprintf(buf, format_string, pdev->field); \ + return sysfs_emit(buf, format_string, pdev->field); \ } \ static DEVICE_ATTR_RO(field) @@ -56,7 +56,7 @@ static ssize_t broken_parity_status_show(struct device *dev, char *buf) { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pdev->broken_parity_status); + return sysfs_emit(buf, "%u\n", pdev->broken_parity_status); } static ssize_t broken_parity_status_store(struct device *dev, @@ -129,7 +129,7 @@ static ssize_t power_state_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%s\n", pci_power_name(pdev->current_state)); + return sysfs_emit(buf, "%s\n", pci_power_name(pdev->current_state)); } static DEVICE_ATTR_RO(power_state); @@ -138,10 +138,10 @@ static ssize_t resource_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pci_dev *pci_dev = to_pci_dev(dev); - char *str = buf; int i; int max; resource_size_t start, end; + size_t len = 0; if (pci_dev->subordinate) max = DEVICE_COUNT_RESOURCE; @@ -151,12 +151,12 @@ static ssize_t resource_show(struct device *dev, struct device_attribute *attr, for (i = 0; i < max; i++) { struct resource *res = &pci_dev->resource[i]; pci_resource_to_user(pci_dev, i, res, &start, &end); - str += sprintf(str, "0x%016llx 0x%016llx 0x%016llx\n", - (unsigned long long)start, - (unsigned long long)end, - (unsigned long long)res->flags); + len += sysfs_emit_at(buf, len, "0x%016llx 0x%016llx 0x%016llx\n", + (unsigned long long)start, + (unsigned long long)end, + (unsigned long long)res->flags); } - return (str - buf); + return len; } static DEVICE_ATTR_RO(resource); @@ -165,8 +165,8 @@ static ssize_t max_link_speed_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%s\n", - pci_speed_string(pcie_get_speed_cap(pdev))); + return sysfs_emit(buf, "%s\n", + pci_speed_string(pcie_get_speed_cap(pdev))); } static DEVICE_ATTR_RO(max_link_speed); @@ -175,7 +175,7 @@ static ssize_t max_link_width_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pcie_get_width_cap(pdev)); + return sysfs_emit(buf, "%u\n", pcie_get_width_cap(pdev)); } static DEVICE_ATTR_RO(max_link_width); @@ -193,7 +193,7 @@ static ssize_t current_link_speed_show(struct device *dev, speed = pcie_link_speed[linkstat & PCI_EXP_LNKSTA_CLS]; - return sprintf(buf, "%s\n", pci_speed_string(speed)); + return sysfs_emit(buf, "%s\n", pci_speed_string(speed)); } static DEVICE_ATTR_RO(current_link_speed); @@ -208,7 +208,7 @@ static ssize_t current_link_width_show(struct device *dev, if (err) return -EINVAL; - return sprintf(buf, "%u\n", + return sysfs_emit(buf, "%u\n", (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT); } static DEVICE_ATTR_RO(current_link_width); @@ -225,7 +225,7 @@ static ssize_t secondary_bus_number_show(struct device *dev, if (err) return -EINVAL; - return sprintf(buf, "%u\n", sec_bus); + return sysfs_emit(buf, "%u\n", sec_bus); } static DEVICE_ATTR_RO(secondary_bus_number); @@ -241,7 +241,7 @@ static ssize_t subordinate_bus_number_show(struct device *dev, if (err) return -EINVAL; - return sprintf(buf, "%u\n", sub_bus); + return sysfs_emit(buf, "%u\n", sub_bus); } static DEVICE_ATTR_RO(subordinate_bus_number); @@ -251,7 +251,7 @@ static ssize_t ari_enabled_show(struct device *dev, { struct pci_dev *pci_dev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pci_ari_enabled(pci_dev->bus)); + return sysfs_emit(buf, "%u\n", pci_ari_enabled(pci_dev->bus)); } static DEVICE_ATTR_RO(ari_enabled); @@ -260,11 +260,11 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, { struct pci_dev *pci_dev = to_pci_dev(dev); - return sprintf(buf, "pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02X\n", - pci_dev->vendor, pci_dev->device, - pci_dev->subsystem_vendor, pci_dev->subsystem_device, - (u8)(pci_dev->class >> 16), (u8)(pci_dev->class >> 8), - (u8)(pci_dev->class)); + return sysfs_emit(buf, "pci:v%08Xd%08Xsv%08Xsd%08Xbc%02Xsc%02Xi%02X\n", + pci_dev->vendor, pci_dev->device, + pci_dev->subsystem_vendor, pci_dev->subsystem_device, + (u8)(pci_dev->class >> 16), (u8)(pci_dev->class >> 8), + (u8)(pci_dev->class)); } static DEVICE_ATTR_RO(modalias); @@ -302,7 +302,7 @@ static ssize_t enable_show(struct device *dev, struct device_attribute *attr, struct pci_dev *pdev; pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", atomic_read(&pdev->enable_cnt)); + return sysfs_emit(buf, "%u\n", atomic_read(&pdev->enable_cnt)); } static DEVICE_ATTR_RW(enable); @@ -338,7 +338,7 @@ static ssize_t numa_node_store(struct device *dev, static ssize_t numa_node_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev->numa_node); + return sysfs_emit(buf, "%d\n", dev->numa_node); } static DEVICE_ATTR_RW(numa_node); #endif @@ -348,7 +348,7 @@ static ssize_t dma_mask_bits_show(struct device *dev, { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%d\n", fls64(pdev->dma_mask)); + return sysfs_emit(buf, "%d\n", fls64(pdev->dma_mask)); } static DEVICE_ATTR_RO(dma_mask_bits); @@ -356,7 +356,7 @@ static ssize_t consistent_dma_mask_bits_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", fls64(dev->coherent_dma_mask)); + return sysfs_emit(buf, "%d\n", fls64(dev->coherent_dma_mask)); } static DEVICE_ATTR_RO(consistent_dma_mask_bits); @@ -366,9 +366,9 @@ static ssize_t msi_bus_show(struct device *dev, struct device_attribute *attr, struct pci_dev *pdev = to_pci_dev(dev); struct pci_bus *subordinate = pdev->subordinate; - return sprintf(buf, "%u\n", subordinate ? - !(subordinate->bus_flags & PCI_BUS_FLAGS_NO_MSI) - : !pdev->no_msi); + return sysfs_emit(buf, "%u\n", subordinate ? + !(subordinate->bus_flags & PCI_BUS_FLAGS_NO_MSI) + : !pdev->no_msi); } static ssize_t msi_bus_store(struct device *dev, struct device_attribute *attr, @@ -523,7 +523,7 @@ static ssize_t d3cold_allowed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pci_dev *pdev = to_pci_dev(dev); - return sprintf(buf, "%u\n", pdev->d3cold_allowed); + return sysfs_emit(buf, "%u\n", pdev->d3cold_allowed); } static DEVICE_ATTR_RW(d3cold_allowed); #endif @@ -537,7 +537,7 @@ static ssize_t devspec_show(struct device *dev, if (np == NULL) return 0; - return sprintf(buf, "%pOF", np); + return sysfs_emit(buf, "%pOF", np); } static DEVICE_ATTR_RO(devspec); #endif @@ -583,7 +583,7 @@ static ssize_t driver_override_show(struct device *dev, ssize_t len; device_lock(dev); - len = scnprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + len = sysfs_emit(buf, "%s\n", pdev->driver_override); device_unlock(dev); return len; } @@ -658,11 +658,11 @@ static ssize_t boot_vga_show(struct device *dev, struct device_attribute *attr, struct pci_dev *vga_dev = vga_default_device(); if (vga_dev) - return sprintf(buf, "%u\n", (pdev == vga_dev)); + return sysfs_emit(buf, "%u\n", (pdev == vga_dev)); - return sprintf(buf, "%u\n", - !!(pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW)); + return sysfs_emit(buf, "%u\n", + !!(pdev->resource[PCI_ROM_RESOURCE].flags & + IORESOURCE_ROM_SHADOW)); } static DEVICE_ATTR_RO(boot_vga); From 294353d950ab3e47d7694d382e50c887206f541a Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 25 Mar 2021 15:26:04 +0800 Subject: [PATCH 0758/1379] PCI: dwc: Move dw_pcie_msi_init() to dw_pcie_setup_rc() If the host which makes use of IP's integrated MSI Receiver losts power during suspend, we need to reinit the RC and MSI Receiver in resume. But after we move dw_pcie_msi_init() into the core, we have no API to do so. Usually the dwc users need to call dw_pcie_setup_rc() to reinit the RC, we can solve this problem by moving dw_pcie_msi_init() to dw_pcie_setup_rc(). Link: https://lore.kernel.org/r/20210325152604.6e79deba@xhacker.debian Signed-off-by: Jisheng Zhang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- drivers/pci/controller/dwc/pcie-designware-host.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index 7e55b2b66182..e6c274f4485c 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -400,7 +400,6 @@ int dw_pcie_host_init(struct pcie_port *pp) } dw_pcie_setup_rc(pp); - dw_pcie_msi_init(pp); if (!dw_pcie_link_up(pci) && pci->ops && pci->ops->start_link) { ret = pci->ops->start_link(pci); @@ -551,6 +550,8 @@ void dw_pcie_setup_rc(struct pcie_port *pp) } } + dw_pcie_msi_init(pp); + /* Setup RC BARs */ dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_0, 0x00000004); dw_pcie_writel_dbi(pci, PCI_BASE_ADDRESS_1, 0x00000000); From 7d499169f793083c83bcc6e31170be8f36087075 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 15 Apr 2021 16:32:57 +0800 Subject: [PATCH 0759/1379] PCI: dwc/intel-gw: Remove unused function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following clang warning: drivers/pci/controller/dwc/pcie-intel-gw.c:84:19: warning: unused function 'pcie_app_rd' [-Wunused-function]. Link: https://lore.kernel.org/r/1618475577-99198-1-git-send-email-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Lorenzo Pieralisi Reviewed-by: Krzysztof Wilczyński --- drivers/pci/controller/dwc/pcie-intel-gw.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-intel-gw.c b/drivers/pci/controller/dwc/pcie-intel-gw.c index 0cedd1f95f37..f89a7d24ba28 100644 --- a/drivers/pci/controller/dwc/pcie-intel-gw.c +++ b/drivers/pci/controller/dwc/pcie-intel-gw.c @@ -81,11 +81,6 @@ static void pcie_update_bits(void __iomem *base, u32 ofs, u32 mask, u32 val) writel(val, base + ofs); } -static inline u32 pcie_app_rd(struct intel_pcie_port *lpp, u32 ofs) -{ - return readl(lpp->app_base + ofs); -} - static inline void pcie_app_wr(struct intel_pcie_port *lpp, u32 ofs, u32 val) { writel(val, lpp->app_base + ofs); From 8bcca26585585ae4b44d25d30f351ad0afa4976b Mon Sep 17 00:00:00 2001 From: Hou Zhiqiang Date: Tue, 13 Apr 2021 17:22:19 +0300 Subject: [PATCH 0760/1379] PCI: dwc: Move iATU detection earlier dw_pcie_ep_init() depends on the detected iATU region numbers to allocate the in/outbound window management bitmap. It fails after 281f1f99cf3a ("PCI: dwc: Detect number of iATU windows"). Move the iATU region detection into a new function, move the detection to the very beginning of dw_pcie_host_init() and dw_pcie_ep_init(). Also remove it from the dw_pcie_setup(), since it's more like a software initialization step than hardware setup. Link: https://lore.kernel.org/r/20210125044803.4310-1-Zhiqiang.Hou@nxp.com Link: https://lore.kernel.org/linux-pci/20210407131255.702054-1-dmitry.baryshkov@linaro.org Link: https://lore.kernel.org/r/20210413142219.2301430-1-dmitry.baryshkov@linaro.org Fixes: 281f1f99cf3a ("PCI: dwc: Detect number of iATU windows") Tested-by: Kunihiko Hayashi Tested-by: Marek Szyprowski Tested-by: Manivannan Sadhasivam Signed-off-by: Hou Zhiqiang [DB: moved dw_pcie_iatu_detect to happen after host_init callback] Signed-off-by: Dmitry Baryshkov Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Cc: stable@vger.kernel.org # v5.11+ Cc: Marek Szyprowski --- drivers/pci/controller/dwc/pcie-designware-ep.c | 2 ++ drivers/pci/controller/dwc/pcie-designware-host.c | 1 + drivers/pci/controller/dwc/pcie-designware.c | 11 ++++++++--- drivers/pci/controller/dwc/pcie-designware.h | 1 + 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 1c25d8337151..8d028a88b375 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -705,6 +705,8 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) } } + dw_pcie_iatu_detect(pci); + res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "addr_space"); if (!res) return -EINVAL; diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index e6c274f4485c..a608ae1fad57 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -398,6 +398,7 @@ int dw_pcie_host_init(struct pcie_port *pp) if (ret) goto err_free_msi; } + dw_pcie_iatu_detect(pci); dw_pcie_setup_rc(pp); diff --git a/drivers/pci/controller/dwc/pcie-designware.c b/drivers/pci/controller/dwc/pcie-designware.c index 004cb860e266..a945f0c0e73d 100644 --- a/drivers/pci/controller/dwc/pcie-designware.c +++ b/drivers/pci/controller/dwc/pcie-designware.c @@ -660,11 +660,9 @@ static void dw_pcie_iatu_detect_regions(struct dw_pcie *pci) pci->num_ob_windows = ob; } -void dw_pcie_setup(struct dw_pcie *pci) +void dw_pcie_iatu_detect(struct dw_pcie *pci) { - u32 val; struct device *dev = pci->dev; - struct device_node *np = dev->of_node; struct platform_device *pdev = to_platform_device(dev); if (pci->version >= 0x480A || (!pci->version && @@ -693,6 +691,13 @@ void dw_pcie_setup(struct dw_pcie *pci) dev_info(pci->dev, "Detected iATU regions: %u outbound, %u inbound", pci->num_ob_windows, pci->num_ib_windows); +} + +void dw_pcie_setup(struct dw_pcie *pci) +{ + u32 val; + struct device *dev = pci->dev; + struct device_node *np = dev->of_node; if (pci->link_gen > 0) dw_pcie_link_set_max_speed(pci, pci->link_gen); diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 7247c8b01f04..7d6e9b7576be 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -306,6 +306,7 @@ int dw_pcie_prog_inbound_atu(struct dw_pcie *pci, u8 func_no, int index, void dw_pcie_disable_atu(struct dw_pcie *pci, int index, enum dw_pcie_region_type type); void dw_pcie_setup(struct dw_pcie *pci); +void dw_pcie_iatu_detect(struct dw_pcie *pci); static inline void dw_pcie_writel_dbi(struct dw_pcie *pci, u32 reg, u32 val) { From d3bf75b579b980b9d83a76d3b4d8bfb9f55b24ca Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Tue, 20 Apr 2021 14:17:19 +0800 Subject: [PATCH 0761/1379] PCI: mediatek-gen3: Add MediaTek Gen3 driver for MT8192 MediaTek's PCIe host controller has three generation HWs, the new generation HW is an individual bridge, it supports Gen3 speed and compatible with Gen2, Gen1 speed. Add support for new Gen3 controller which can be found on MT8192. Link: https://lore.kernel.org/r/20210420061723.989-4-jianjun.wang@mediatek.com Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Acked-by: Ryder Lee --- drivers/pci/controller/Kconfig | 13 + drivers/pci/controller/Makefile | 1 + drivers/pci/controller/pcie-mediatek-gen3.c | 466 ++++++++++++++++++++ 3 files changed, 480 insertions(+) create mode 100644 drivers/pci/controller/pcie-mediatek-gen3.c diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index 5aa8977d7b0f..1e925ac47279 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -233,6 +233,19 @@ config PCIE_MEDIATEK Say Y here if you want to enable PCIe controller support on MediaTek SoCs. +config PCIE_MEDIATEK_GEN3 + tristate "MediaTek Gen3 PCIe controller" + depends on ARCH_MEDIATEK || COMPILE_TEST + depends on PCI_MSI_IRQ_DOMAIN + help + Adds support for PCIe Gen3 MAC controller for MediaTek SoCs. + This PCIe controller is compatible with Gen3, Gen2 and Gen1 speed, + and support up to 256 MSI interrupt numbers for + multi-function devices. + + Say Y here if you want to enable Gen3 PCIe controller support on + MediaTek SoCs. + config VMD depends on PCI_MSI && X86_64 && SRCU tristate "Intel Volume Management Device Driver" diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index e4559f2182f2..579973327815 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_PCIE_ROCKCHIP) += pcie-rockchip.o obj-$(CONFIG_PCIE_ROCKCHIP_EP) += pcie-rockchip-ep.o obj-$(CONFIG_PCIE_ROCKCHIP_HOST) += pcie-rockchip-host.o obj-$(CONFIG_PCIE_MEDIATEK) += pcie-mediatek.o +obj-$(CONFIG_PCIE_MEDIATEK_GEN3) += pcie-mediatek-gen3.o obj-$(CONFIG_PCIE_MICROCHIP_HOST) += pcie-microchip-host.o obj-$(CONFIG_VMD) += vmd.o obj-$(CONFIG_PCIE_BRCMSTB) += pcie-brcmstb.o diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c new file mode 100644 index 000000000000..078d8408150f --- /dev/null +++ b/drivers/pci/controller/pcie-mediatek-gen3.c @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * MediaTek PCIe host controller driver. + * + * Copyright (c) 2020 MediaTek Inc. + * Author: Jianjun Wang + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../pci.h" + +#define PCIE_SETTING_REG 0x80 +#define PCIE_PCI_IDS_1 0x9c +#define PCI_CLASS(class) (class << 8) +#define PCIE_RC_MODE BIT(0) + +#define PCIE_CFGNUM_REG 0x140 +#define PCIE_CFG_DEVFN(devfn) ((devfn) & GENMASK(7, 0)) +#define PCIE_CFG_BUS(bus) (((bus) << 8) & GENMASK(15, 8)) +#define PCIE_CFG_BYTE_EN(bytes) (((bytes) << 16) & GENMASK(19, 16)) +#define PCIE_CFG_FORCE_BYTE_EN BIT(20) +#define PCIE_CFG_OFFSET_ADDR 0x1000 +#define PCIE_CFG_HEADER(bus, devfn) \ + (PCIE_CFG_BUS(bus) | PCIE_CFG_DEVFN(devfn)) + +#define PCIE_RST_CTRL_REG 0x148 +#define PCIE_MAC_RSTB BIT(0) +#define PCIE_PHY_RSTB BIT(1) +#define PCIE_BRG_RSTB BIT(2) +#define PCIE_PE_RSTB BIT(3) + +#define PCIE_LTSSM_STATUS_REG 0x150 + +#define PCIE_LINK_STATUS_REG 0x154 +#define PCIE_PORT_LINKUP BIT(8) + +#define PCIE_TRANS_TABLE_BASE_REG 0x800 +#define PCIE_ATR_SRC_ADDR_MSB_OFFSET 0x4 +#define PCIE_ATR_TRSL_ADDR_LSB_OFFSET 0x8 +#define PCIE_ATR_TRSL_ADDR_MSB_OFFSET 0xc +#define PCIE_ATR_TRSL_PARAM_OFFSET 0x10 +#define PCIE_ATR_TLB_SET_OFFSET 0x20 + +#define PCIE_MAX_TRANS_TABLES 8 +#define PCIE_ATR_EN BIT(0) +#define PCIE_ATR_SIZE(size) \ + (((((size) - 1) << 1) & GENMASK(6, 1)) | PCIE_ATR_EN) +#define PCIE_ATR_ID(id) ((id) & GENMASK(3, 0)) +#define PCIE_ATR_TYPE_MEM PCIE_ATR_ID(0) +#define PCIE_ATR_TYPE_IO PCIE_ATR_ID(1) +#define PCIE_ATR_TLP_TYPE(type) (((type) << 16) & GENMASK(18, 16)) +#define PCIE_ATR_TLP_TYPE_MEM PCIE_ATR_TLP_TYPE(0) +#define PCIE_ATR_TLP_TYPE_IO PCIE_ATR_TLP_TYPE(2) + +/** + * struct mtk_pcie_port - PCIe port information + * @dev: pointer to PCIe device + * @base: IO mapped register base + * @reg_base: physical register base + * @mac_reset: MAC reset control + * @phy_reset: PHY reset control + * @phy: PHY controller block + * @clks: PCIe clocks + * @num_clks: PCIe clocks count for this port + */ +struct mtk_pcie_port { + struct device *dev; + void __iomem *base; + phys_addr_t reg_base; + struct reset_control *mac_reset; + struct reset_control *phy_reset; + struct phy *phy; + struct clk_bulk_data *clks; + int num_clks; +}; + +/** + * mtk_pcie_config_tlp_header() - Configure a configuration TLP header + * @bus: PCI bus to query + * @devfn: device/function number + * @where: offset in config space + * @size: data size in TLP header + * + * Set byte enable field and device information in configuration TLP header. + */ +static void mtk_pcie_config_tlp_header(struct pci_bus *bus, unsigned int devfn, + int where, int size) +{ + struct mtk_pcie_port *port = bus->sysdata; + int bytes; + u32 val; + + bytes = (GENMASK(size - 1, 0) & 0xf) << (where & 0x3); + + val = PCIE_CFG_FORCE_BYTE_EN | PCIE_CFG_BYTE_EN(bytes) | + PCIE_CFG_HEADER(bus->number, devfn); + + writel_relaxed(val, port->base + PCIE_CFGNUM_REG); +} + +static void __iomem *mtk_pcie_map_bus(struct pci_bus *bus, unsigned int devfn, + int where) +{ + struct mtk_pcie_port *port = bus->sysdata; + + return port->base + PCIE_CFG_OFFSET_ADDR + where; +} + +static int mtk_pcie_config_read(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val) +{ + mtk_pcie_config_tlp_header(bus, devfn, where, size); + + return pci_generic_config_read32(bus, devfn, where, size, val); +} + +static int mtk_pcie_config_write(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + mtk_pcie_config_tlp_header(bus, devfn, where, size); + + if (size <= 2) + val <<= (where & 0x3) * 8; + + return pci_generic_config_write32(bus, devfn, where, 4, val); +} + +static struct pci_ops mtk_pcie_ops = { + .map_bus = mtk_pcie_map_bus, + .read = mtk_pcie_config_read, + .write = mtk_pcie_config_write, +}; + +static int mtk_pcie_set_trans_table(struct mtk_pcie_port *port, + resource_size_t cpu_addr, + resource_size_t pci_addr, + resource_size_t size, + unsigned long type, int num) +{ + void __iomem *table; + u32 val; + + if (num >= PCIE_MAX_TRANS_TABLES) { + dev_err(port->dev, "not enough translate table for addr: %#llx, limited to [%d]\n", + (unsigned long long)cpu_addr, PCIE_MAX_TRANS_TABLES); + return -ENODEV; + } + + table = port->base + PCIE_TRANS_TABLE_BASE_REG + + num * PCIE_ATR_TLB_SET_OFFSET; + + writel_relaxed(lower_32_bits(cpu_addr) | PCIE_ATR_SIZE(fls(size) - 1), + table); + writel_relaxed(upper_32_bits(cpu_addr), + table + PCIE_ATR_SRC_ADDR_MSB_OFFSET); + writel_relaxed(lower_32_bits(pci_addr), + table + PCIE_ATR_TRSL_ADDR_LSB_OFFSET); + writel_relaxed(upper_32_bits(pci_addr), + table + PCIE_ATR_TRSL_ADDR_MSB_OFFSET); + + if (type == IORESOURCE_IO) + val = PCIE_ATR_TYPE_IO | PCIE_ATR_TLP_TYPE_IO; + else + val = PCIE_ATR_TYPE_MEM | PCIE_ATR_TLP_TYPE_MEM; + + writel_relaxed(val, table + PCIE_ATR_TRSL_PARAM_OFFSET); + + return 0; +} + +static int mtk_pcie_startup_port(struct mtk_pcie_port *port) +{ + struct resource_entry *entry; + struct pci_host_bridge *host = pci_host_bridge_from_priv(port); + unsigned int table_index = 0; + int err; + u32 val; + + /* Set as RC mode */ + val = readl_relaxed(port->base + PCIE_SETTING_REG); + val |= PCIE_RC_MODE; + writel_relaxed(val, port->base + PCIE_SETTING_REG); + + /* Set class code */ + val = readl_relaxed(port->base + PCIE_PCI_IDS_1); + val &= ~GENMASK(31, 8); + val |= PCI_CLASS(PCI_CLASS_BRIDGE_PCI << 8); + writel_relaxed(val, port->base + PCIE_PCI_IDS_1); + + /* Assert all reset signals */ + val = readl_relaxed(port->base + PCIE_RST_CTRL_REG); + val |= PCIE_MAC_RSTB | PCIE_PHY_RSTB | PCIE_BRG_RSTB | PCIE_PE_RSTB; + writel_relaxed(val, port->base + PCIE_RST_CTRL_REG); + + /* + * Described in PCIe CEM specification setctions 2.2 (PERST# Signal) + * and 2.2.1 (Initial Power-Up (G3 to S0)). + * The deassertion of PERST# should be delayed 100ms (TPVPERL) + * for the power and clock to become stable. + */ + msleep(100); + + /* De-assert reset signals */ + val &= ~(PCIE_MAC_RSTB | PCIE_PHY_RSTB | PCIE_BRG_RSTB | PCIE_PE_RSTB); + writel_relaxed(val, port->base + PCIE_RST_CTRL_REG); + + /* Check if the link is up or not */ + err = readl_poll_timeout(port->base + PCIE_LINK_STATUS_REG, val, + !!(val & PCIE_PORT_LINKUP), 20, + PCI_PM_D3COLD_WAIT * USEC_PER_MSEC); + if (err) { + val = readl_relaxed(port->base + PCIE_LTSSM_STATUS_REG); + dev_err(port->dev, "PCIe link down, ltssm reg val: %#x\n", val); + return err; + } + + /* Set PCIe translation windows */ + resource_list_for_each_entry(entry, &host->windows) { + struct resource *res = entry->res; + unsigned long type = resource_type(res); + resource_size_t cpu_addr; + resource_size_t pci_addr; + resource_size_t size; + const char *range_type; + + if (type == IORESOURCE_IO) { + cpu_addr = pci_pio_to_address(res->start); + range_type = "IO"; + } else if (type == IORESOURCE_MEM) { + cpu_addr = res->start; + range_type = "MEM"; + } else { + continue; + } + + pci_addr = res->start - entry->offset; + size = resource_size(res); + err = mtk_pcie_set_trans_table(port, cpu_addr, pci_addr, size, + type, table_index); + if (err) + return err; + + dev_dbg(port->dev, "set %s trans window[%d]: cpu_addr = %#llx, pci_addr = %#llx, size = %#llx\n", + range_type, table_index, (unsigned long long)cpu_addr, + (unsigned long long)pci_addr, (unsigned long long)size); + + table_index++; + } + + return 0; +} + +static int mtk_pcie_parse_port(struct mtk_pcie_port *port) +{ + struct device *dev = port->dev; + struct platform_device *pdev = to_platform_device(dev); + struct resource *regs; + int ret; + + regs = platform_get_resource_byname(pdev, IORESOURCE_MEM, "pcie-mac"); + if (!regs) + return -EINVAL; + port->base = devm_ioremap_resource(dev, regs); + if (IS_ERR(port->base)) { + dev_err(dev, "failed to map register base\n"); + return PTR_ERR(port->base); + } + + port->reg_base = regs->start; + + port->phy_reset = devm_reset_control_get_optional_exclusive(dev, "phy"); + if (IS_ERR(port->phy_reset)) { + ret = PTR_ERR(port->phy_reset); + if (ret != -EPROBE_DEFER) + dev_err(dev, "failed to get PHY reset\n"); + + return ret; + } + + port->mac_reset = devm_reset_control_get_optional_exclusive(dev, "mac"); + if (IS_ERR(port->mac_reset)) { + ret = PTR_ERR(port->mac_reset); + if (ret != -EPROBE_DEFER) + dev_err(dev, "failed to get MAC reset\n"); + + return ret; + } + + port->phy = devm_phy_optional_get(dev, "pcie-phy"); + if (IS_ERR(port->phy)) { + ret = PTR_ERR(port->phy); + if (ret != -EPROBE_DEFER) + dev_err(dev, "failed to get PHY\n"); + + return ret; + } + + port->num_clks = devm_clk_bulk_get_all(dev, &port->clks); + if (port->num_clks < 0) { + dev_err(dev, "failed to get clocks\n"); + return port->num_clks; + } + + return 0; +} + +static int mtk_pcie_power_up(struct mtk_pcie_port *port) +{ + struct device *dev = port->dev; + int err; + + /* PHY power on and enable pipe clock */ + reset_control_deassert(port->phy_reset); + + err = phy_init(port->phy); + if (err) { + dev_err(dev, "failed to initialize PHY\n"); + goto err_phy_init; + } + + err = phy_power_on(port->phy); + if (err) { + dev_err(dev, "failed to power on PHY\n"); + goto err_phy_on; + } + + /* MAC power on and enable transaction layer clocks */ + reset_control_deassert(port->mac_reset); + + pm_runtime_enable(dev); + pm_runtime_get_sync(dev); + + err = clk_bulk_prepare_enable(port->num_clks, port->clks); + if (err) { + dev_err(dev, "failed to enable clocks\n"); + goto err_clk_init; + } + + return 0; + +err_clk_init: + pm_runtime_put_sync(dev); + pm_runtime_disable(dev); + reset_control_assert(port->mac_reset); + phy_power_off(port->phy); +err_phy_on: + phy_exit(port->phy); +err_phy_init: + reset_control_assert(port->phy_reset); + + return err; +} + +static void mtk_pcie_power_down(struct mtk_pcie_port *port) +{ + clk_bulk_disable_unprepare(port->num_clks, port->clks); + + pm_runtime_put_sync(port->dev); + pm_runtime_disable(port->dev); + reset_control_assert(port->mac_reset); + + phy_power_off(port->phy); + phy_exit(port->phy); + reset_control_assert(port->phy_reset); +} + +static int mtk_pcie_setup(struct mtk_pcie_port *port) +{ + int err; + + err = mtk_pcie_parse_port(port); + if (err) + return err; + + /* Don't touch the hardware registers before power up */ + err = mtk_pcie_power_up(port); + if (err) + return err; + + /* Try link up */ + err = mtk_pcie_startup_port(port); + if (err) + goto err_setup; + + return 0; + +err_setup: + mtk_pcie_power_down(port); + + return err; +} + +static int mtk_pcie_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct mtk_pcie_port *port; + struct pci_host_bridge *host; + int err; + + host = devm_pci_alloc_host_bridge(dev, sizeof(*port)); + if (!host) + return -ENOMEM; + + port = pci_host_bridge_priv(host); + + port->dev = dev; + platform_set_drvdata(pdev, port); + + err = mtk_pcie_setup(port); + if (err) + return err; + + host->ops = &mtk_pcie_ops; + host->sysdata = port; + + err = pci_host_probe(host); + if (err) { + mtk_pcie_power_down(port); + return err; + } + + return 0; +} + +static int mtk_pcie_remove(struct platform_device *pdev) +{ + struct mtk_pcie_port *port = platform_get_drvdata(pdev); + struct pci_host_bridge *host = pci_host_bridge_from_priv(port); + + pci_lock_rescan_remove(); + pci_stop_root_bus(host->bus); + pci_remove_root_bus(host->bus); + pci_unlock_rescan_remove(); + + mtk_pcie_power_down(port); + + return 0; +} + +static const struct of_device_id mtk_pcie_of_match[] = { + { .compatible = "mediatek,mt8192-pcie" }, + {}, +}; + +static struct platform_driver mtk_pcie_driver = { + .probe = mtk_pcie_probe, + .remove = mtk_pcie_remove, + .driver = { + .name = "mtk-pcie", + .of_match_table = mtk_pcie_of_match, + }, +}; + +module_platform_driver(mtk_pcie_driver); +MODULE_LICENSE("GPL v2"); From 814cceebba9b7d1306b8d49587ffb0e81f7b73af Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Tue, 20 Apr 2021 14:17:20 +0800 Subject: [PATCH 0762/1379] PCI: mediatek-gen3: Add INTx support Add INTx support for MediaTek Gen3 PCIe controller. Link: https://lore.kernel.org/r/20210420061723.989-5-jianjun.wang@mediatek.com Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Acked-by: Ryder Lee --- drivers/pci/controller/pcie-mediatek-gen3.c | 172 ++++++++++++++++++++ 1 file changed, 172 insertions(+) diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c index 078d8408150f..9134e871bff9 100644 --- a/drivers/pci/controller/pcie-mediatek-gen3.c +++ b/drivers/pci/controller/pcie-mediatek-gen3.c @@ -9,6 +9,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -45,6 +48,13 @@ #define PCIE_LINK_STATUS_REG 0x154 #define PCIE_PORT_LINKUP BIT(8) +#define PCIE_INT_ENABLE_REG 0x180 +#define PCIE_INTX_SHIFT 24 +#define PCIE_INTX_ENABLE \ + GENMASK(PCIE_INTX_SHIFT + PCI_NUM_INTX - 1, PCIE_INTX_SHIFT) + +#define PCIE_INT_STATUS_REG 0x184 + #define PCIE_TRANS_TABLE_BASE_REG 0x800 #define PCIE_ATR_SRC_ADDR_MSB_OFFSET 0x4 #define PCIE_ATR_TRSL_ADDR_LSB_OFFSET 0x8 @@ -73,6 +83,9 @@ * @phy: PHY controller block * @clks: PCIe clocks * @num_clks: PCIe clocks count for this port + * @irq: PCIe controller interrupt number + * @irq_lock: lock protecting IRQ register access + * @intx_domain: legacy INTx IRQ domain */ struct mtk_pcie_port { struct device *dev; @@ -83,6 +96,10 @@ struct mtk_pcie_port { struct phy *phy; struct clk_bulk_data *clks; int num_clks; + + int irq; + raw_spinlock_t irq_lock; + struct irq_domain *intx_domain; }; /** @@ -198,6 +215,11 @@ static int mtk_pcie_startup_port(struct mtk_pcie_port *port) val |= PCI_CLASS(PCI_CLASS_BRIDGE_PCI << 8); writel_relaxed(val, port->base + PCIE_PCI_IDS_1); + /* Mask all INTx interrupts */ + val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG); + val &= ~PCIE_INTX_ENABLE; + writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG); + /* Assert all reset signals */ val = readl_relaxed(port->base + PCIE_RST_CTRL_REG); val |= PCIE_MAC_RSTB | PCIE_PHY_RSTB | PCIE_BRG_RSTB | PCIE_PE_RSTB; @@ -261,6 +283,150 @@ static int mtk_pcie_startup_port(struct mtk_pcie_port *port) return 0; } +static int mtk_pcie_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + return -EINVAL; +} + +static void mtk_intx_mask(struct irq_data *data) +{ + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); + unsigned long flags; + u32 val; + + raw_spin_lock_irqsave(&port->irq_lock, flags); + val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG); + val &= ~BIT(data->hwirq + PCIE_INTX_SHIFT); + writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG); + raw_spin_unlock_irqrestore(&port->irq_lock, flags); +} + +static void mtk_intx_unmask(struct irq_data *data) +{ + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); + unsigned long flags; + u32 val; + + raw_spin_lock_irqsave(&port->irq_lock, flags); + val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG); + val |= BIT(data->hwirq + PCIE_INTX_SHIFT); + writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG); + raw_spin_unlock_irqrestore(&port->irq_lock, flags); +} + +/** + * mtk_intx_eoi() - Clear INTx IRQ status at the end of interrupt + * @data: pointer to chip specific data + * + * As an emulated level IRQ, its interrupt status will remain + * until the corresponding de-assert message is received; hence that + * the status can only be cleared when the interrupt has been serviced. + */ +static void mtk_intx_eoi(struct irq_data *data) +{ + struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); + unsigned long hwirq; + + hwirq = data->hwirq + PCIE_INTX_SHIFT; + writel_relaxed(BIT(hwirq), port->base + PCIE_INT_STATUS_REG); +} + +static struct irq_chip mtk_intx_irq_chip = { + .irq_mask = mtk_intx_mask, + .irq_unmask = mtk_intx_unmask, + .irq_eoi = mtk_intx_eoi, + .irq_set_affinity = mtk_pcie_set_affinity, + .name = "INTx", +}; + +static int mtk_pcie_intx_map(struct irq_domain *domain, unsigned int irq, + irq_hw_number_t hwirq) +{ + irq_set_chip_data(irq, domain->host_data); + irq_set_chip_and_handler_name(irq, &mtk_intx_irq_chip, + handle_fasteoi_irq, "INTx"); + return 0; +} + +static const struct irq_domain_ops intx_domain_ops = { + .map = mtk_pcie_intx_map, +}; + +static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port) +{ + struct device *dev = port->dev; + struct device_node *intc_node, *node = dev->of_node; + + raw_spin_lock_init(&port->irq_lock); + + /* Setup INTx */ + intc_node = of_get_child_by_name(node, "interrupt-controller"); + if (!intc_node) { + dev_err(dev, "missing interrupt-controller node\n"); + return -ENODEV; + } + + port->intx_domain = irq_domain_add_linear(intc_node, PCI_NUM_INTX, + &intx_domain_ops, port); + if (!port->intx_domain) { + dev_err(dev, "failed to create INTx IRQ domain\n"); + return -ENODEV; + } + + return 0; +} + +static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port) +{ + irq_set_chained_handler_and_data(port->irq, NULL, NULL); + + if (port->intx_domain) + irq_domain_remove(port->intx_domain); + + irq_dispose_mapping(port->irq); +} + +static void mtk_pcie_irq_handler(struct irq_desc *desc) +{ + struct mtk_pcie_port *port = irq_desc_get_handler_data(desc); + struct irq_chip *irqchip = irq_desc_get_chip(desc); + unsigned long status; + unsigned int virq; + irq_hw_number_t irq_bit = PCIE_INTX_SHIFT; + + chained_irq_enter(irqchip, desc); + + status = readl_relaxed(port->base + PCIE_INT_STATUS_REG); + for_each_set_bit_from(irq_bit, &status, PCI_NUM_INTX + + PCIE_INTX_SHIFT) { + virq = irq_find_mapping(port->intx_domain, + irq_bit - PCIE_INTX_SHIFT); + generic_handle_irq(virq); + } + + chained_irq_exit(irqchip, desc); +} + +static int mtk_pcie_setup_irq(struct mtk_pcie_port *port) +{ + struct device *dev = port->dev; + struct platform_device *pdev = to_platform_device(dev); + int err; + + err = mtk_pcie_init_irq_domains(port); + if (err) + return err; + + port->irq = platform_get_irq(pdev, 0); + if (port->irq < 0) + return port->irq; + + irq_set_chained_handler_and_data(port->irq, mtk_pcie_irq_handler, port); + + return 0; +} + static int mtk_pcie_parse_port(struct mtk_pcie_port *port) { struct device *dev = port->dev; @@ -393,6 +559,10 @@ static int mtk_pcie_setup(struct mtk_pcie_port *port) if (err) goto err_setup; + err = mtk_pcie_setup_irq(port); + if (err) + goto err_setup; + return 0; err_setup: @@ -426,6 +596,7 @@ static int mtk_pcie_probe(struct platform_device *pdev) err = pci_host_probe(host); if (err) { + mtk_pcie_irq_teardown(port); mtk_pcie_power_down(port); return err; } @@ -443,6 +614,7 @@ static int mtk_pcie_remove(struct platform_device *pdev) pci_remove_root_bus(host->bus); pci_unlock_rescan_remove(); + mtk_pcie_irq_teardown(port); mtk_pcie_power_down(port); return 0; From 1bdafba538be706b185c7aded0d42327702d92b7 Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Tue, 20 Apr 2021 14:17:21 +0800 Subject: [PATCH 0763/1379] PCI: mediatek-gen3: Add MSI support Add MSI support for MediaTek Gen3 PCIe controller. This PCIe controller supports up to 256 MSI vectors, the MSI hardware block diagram is as follows: +-----+ | GIC | +-----+ ^ | port->irq | +-+-+-+-+-+-+-+-+ |0|1|2|3|4|5|6|7| (PCIe intc) +-+-+-+-+-+-+-+-+ ^ ^ ^ | | ... | +-------+ +------+ +-----------+ | | | +-+-+---+--+--+ +-+-+---+--+--+ +-+-+---+--+--+ |0|1|...|30|31| |0|1|...|30|31| |0|1|...|30|31| (MSI sets) +-+-+---+--+--+ +-+-+---+--+--+ +-+-+---+--+--+ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ | | | | | | | | | | | | (MSI vectors) | | | | | | | | | | | | (MSI SET0) (MSI SET1) ... (MSI SET7) With 256 MSI vectors supported, the MSI vectors are composed of 8 sets, each set has its own address for MSI message, and supports 32 MSI vectors to generate interrupt. Link: https://lore.kernel.org/r/20210420061723.989-6-jianjun.wang@mediatek.com Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Reviewed-by: Marc Zyngier Acked-by: Ryder Lee --- drivers/pci/controller/pcie-mediatek-gen3.c | 276 ++++++++++++++++++++ 1 file changed, 276 insertions(+) diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c index 9134e871bff9..98a2eee75a0a 100644 --- a/drivers/pci/controller/pcie-mediatek-gen3.c +++ b/drivers/pci/controller/pcie-mediatek-gen3.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -48,12 +49,29 @@ #define PCIE_LINK_STATUS_REG 0x154 #define PCIE_PORT_LINKUP BIT(8) +#define PCIE_MSI_SET_NUM 8 +#define PCIE_MSI_IRQS_PER_SET 32 +#define PCIE_MSI_IRQS_NUM \ + (PCIE_MSI_IRQS_PER_SET * PCIE_MSI_SET_NUM) + #define PCIE_INT_ENABLE_REG 0x180 +#define PCIE_MSI_ENABLE GENMASK(PCIE_MSI_SET_NUM + 8 - 1, 8) +#define PCIE_MSI_SHIFT 8 #define PCIE_INTX_SHIFT 24 #define PCIE_INTX_ENABLE \ GENMASK(PCIE_INTX_SHIFT + PCI_NUM_INTX - 1, PCIE_INTX_SHIFT) #define PCIE_INT_STATUS_REG 0x184 +#define PCIE_MSI_SET_ENABLE_REG 0x190 +#define PCIE_MSI_SET_ENABLE GENMASK(PCIE_MSI_SET_NUM - 1, 0) + +#define PCIE_MSI_SET_BASE_REG 0xc00 +#define PCIE_MSI_SET_OFFSET 0x10 +#define PCIE_MSI_SET_STATUS_OFFSET 0x04 +#define PCIE_MSI_SET_ENABLE_OFFSET 0x08 + +#define PCIE_MSI_SET_ADDR_HI_BASE 0xc80 +#define PCIE_MSI_SET_ADDR_HI_OFFSET 0x04 #define PCIE_TRANS_TABLE_BASE_REG 0x800 #define PCIE_ATR_SRC_ADDR_MSB_OFFSET 0x4 @@ -73,6 +91,16 @@ #define PCIE_ATR_TLP_TYPE_MEM PCIE_ATR_TLP_TYPE(0) #define PCIE_ATR_TLP_TYPE_IO PCIE_ATR_TLP_TYPE(2) +/** + * struct mtk_msi_set - MSI information for each set + * @base: IO mapped register base + * @msg_addr: MSI message address + */ +struct mtk_msi_set { + void __iomem *base; + phys_addr_t msg_addr; +}; + /** * struct mtk_pcie_port - PCIe port information * @dev: pointer to PCIe device @@ -86,6 +114,11 @@ * @irq: PCIe controller interrupt number * @irq_lock: lock protecting IRQ register access * @intx_domain: legacy INTx IRQ domain + * @msi_domain: MSI IRQ domain + * @msi_bottom_domain: MSI IRQ bottom domain + * @msi_sets: MSI sets information + * @lock: lock protecting IRQ bit map + * @msi_irq_in_use: bit map for assigned MSI IRQ */ struct mtk_pcie_port { struct device *dev; @@ -100,6 +133,11 @@ struct mtk_pcie_port { int irq; raw_spinlock_t irq_lock; struct irq_domain *intx_domain; + struct irq_domain *msi_domain; + struct irq_domain *msi_bottom_domain; + struct mtk_msi_set msi_sets[PCIE_MSI_SET_NUM]; + struct mutex lock; + DECLARE_BITMAP(msi_irq_in_use, PCIE_MSI_IRQS_NUM); }; /** @@ -196,6 +234,35 @@ static int mtk_pcie_set_trans_table(struct mtk_pcie_port *port, return 0; } +static void mtk_pcie_enable_msi(struct mtk_pcie_port *port) +{ + int i; + u32 val; + + for (i = 0; i < PCIE_MSI_SET_NUM; i++) { + struct mtk_msi_set *msi_set = &port->msi_sets[i]; + + msi_set->base = port->base + PCIE_MSI_SET_BASE_REG + + i * PCIE_MSI_SET_OFFSET; + msi_set->msg_addr = port->reg_base + PCIE_MSI_SET_BASE_REG + + i * PCIE_MSI_SET_OFFSET; + + /* Configure the MSI capture address */ + writel_relaxed(lower_32_bits(msi_set->msg_addr), msi_set->base); + writel_relaxed(upper_32_bits(msi_set->msg_addr), + port->base + PCIE_MSI_SET_ADDR_HI_BASE + + i * PCIE_MSI_SET_ADDR_HI_OFFSET); + } + + val = readl_relaxed(port->base + PCIE_MSI_SET_ENABLE_REG); + val |= PCIE_MSI_SET_ENABLE; + writel_relaxed(val, port->base + PCIE_MSI_SET_ENABLE_REG); + + val = readl_relaxed(port->base + PCIE_INT_ENABLE_REG); + val |= PCIE_MSI_ENABLE; + writel_relaxed(val, port->base + PCIE_INT_ENABLE_REG); +} + static int mtk_pcie_startup_port(struct mtk_pcie_port *port) { struct resource_entry *entry; @@ -247,6 +314,8 @@ static int mtk_pcie_startup_port(struct mtk_pcie_port *port) return err; } + mtk_pcie_enable_msi(port); + /* Set PCIe translation windows */ resource_list_for_each_entry(entry, &host->windows) { struct resource *res = entry->res; @@ -289,6 +358,147 @@ static int mtk_pcie_set_affinity(struct irq_data *data, return -EINVAL; } +static void mtk_pcie_msi_irq_mask(struct irq_data *data) +{ + pci_msi_mask_irq(data); + irq_chip_mask_parent(data); +} + +static void mtk_pcie_msi_irq_unmask(struct irq_data *data) +{ + pci_msi_unmask_irq(data); + irq_chip_unmask_parent(data); +} + +static struct irq_chip mtk_msi_irq_chip = { + .irq_ack = irq_chip_ack_parent, + .irq_mask = mtk_pcie_msi_irq_mask, + .irq_unmask = mtk_pcie_msi_irq_unmask, + .name = "MSI", +}; + +static struct msi_domain_info mtk_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX | MSI_FLAG_MULTI_PCI_MSI), + .chip = &mtk_msi_irq_chip, +}; + +static void mtk_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data); + struct mtk_pcie_port *port = data->domain->host_data; + unsigned long hwirq; + + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; + + msg->address_hi = upper_32_bits(msi_set->msg_addr); + msg->address_lo = lower_32_bits(msi_set->msg_addr); + msg->data = hwirq; + dev_dbg(port->dev, "msi#%#lx address_hi %#x address_lo %#x data %d\n", + hwirq, msg->address_hi, msg->address_lo, msg->data); +} + +static void mtk_msi_bottom_irq_ack(struct irq_data *data) +{ + struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data); + unsigned long hwirq; + + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; + + writel_relaxed(BIT(hwirq), msi_set->base + PCIE_MSI_SET_STATUS_OFFSET); +} + +static void mtk_msi_bottom_irq_mask(struct irq_data *data) +{ + struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data); + struct mtk_pcie_port *port = data->domain->host_data; + unsigned long hwirq, flags; + u32 val; + + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; + + raw_spin_lock_irqsave(&port->irq_lock, flags); + val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET); + val &= ~BIT(hwirq); + writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET); + raw_spin_unlock_irqrestore(&port->irq_lock, flags); +} + +static void mtk_msi_bottom_irq_unmask(struct irq_data *data) +{ + struct mtk_msi_set *msi_set = irq_data_get_irq_chip_data(data); + struct mtk_pcie_port *port = data->domain->host_data; + unsigned long hwirq, flags; + u32 val; + + hwirq = data->hwirq % PCIE_MSI_IRQS_PER_SET; + + raw_spin_lock_irqsave(&port->irq_lock, flags); + val = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET); + val |= BIT(hwirq); + writel_relaxed(val, msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET); + raw_spin_unlock_irqrestore(&port->irq_lock, flags); +} + +static struct irq_chip mtk_msi_bottom_irq_chip = { + .irq_ack = mtk_msi_bottom_irq_ack, + .irq_mask = mtk_msi_bottom_irq_mask, + .irq_unmask = mtk_msi_bottom_irq_unmask, + .irq_compose_msi_msg = mtk_compose_msi_msg, + .irq_set_affinity = mtk_pcie_set_affinity, + .name = "MSI", +}; + +static int mtk_msi_bottom_domain_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *arg) +{ + struct mtk_pcie_port *port = domain->host_data; + struct mtk_msi_set *msi_set; + int i, hwirq, set_idx; + + mutex_lock(&port->lock); + + hwirq = bitmap_find_free_region(port->msi_irq_in_use, PCIE_MSI_IRQS_NUM, + order_base_2(nr_irqs)); + + mutex_unlock(&port->lock); + + if (hwirq < 0) + return -ENOSPC; + + set_idx = hwirq / PCIE_MSI_IRQS_PER_SET; + msi_set = &port->msi_sets[set_idx]; + + for (i = 0; i < nr_irqs; i++) + irq_domain_set_info(domain, virq + i, hwirq + i, + &mtk_msi_bottom_irq_chip, msi_set, + handle_edge_irq, NULL, NULL); + + return 0; +} + +static void mtk_msi_bottom_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct mtk_pcie_port *port = domain->host_data; + struct irq_data *data = irq_domain_get_irq_data(domain, virq); + + mutex_lock(&port->lock); + + bitmap_release_region(port->msi_irq_in_use, data->hwirq, + order_base_2(nr_irqs)); + + mutex_unlock(&port->lock); + + irq_domain_free_irqs_common(domain, virq, nr_irqs); +} + +static const struct irq_domain_ops mtk_msi_bottom_domain_ops = { + .alloc = mtk_msi_bottom_domain_alloc, + .free = mtk_msi_bottom_domain_free, +}; + static void mtk_intx_mask(struct irq_data *data) { struct mtk_pcie_port *port = irq_data_get_irq_chip_data(data); @@ -357,6 +567,7 @@ static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port) { struct device *dev = port->dev; struct device_node *intc_node, *node = dev->of_node; + int ret; raw_spin_lock_init(&port->irq_lock); @@ -374,7 +585,34 @@ static int mtk_pcie_init_irq_domains(struct mtk_pcie_port *port) return -ENODEV; } + /* Setup MSI */ + mutex_init(&port->lock); + + port->msi_bottom_domain = irq_domain_add_linear(node, PCIE_MSI_IRQS_NUM, + &mtk_msi_bottom_domain_ops, port); + if (!port->msi_bottom_domain) { + dev_err(dev, "failed to create MSI bottom domain\n"); + ret = -ENODEV; + goto err_msi_bottom_domain; + } + + port->msi_domain = pci_msi_create_irq_domain(dev->fwnode, + &mtk_msi_domain_info, + port->msi_bottom_domain); + if (!port->msi_domain) { + dev_err(dev, "failed to create MSI domain\n"); + ret = -ENODEV; + goto err_msi_domain; + } + return 0; + +err_msi_domain: + irq_domain_remove(port->msi_bottom_domain); +err_msi_bottom_domain: + irq_domain_remove(port->intx_domain); + + return ret; } static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port) @@ -384,9 +622,39 @@ static void mtk_pcie_irq_teardown(struct mtk_pcie_port *port) if (port->intx_domain) irq_domain_remove(port->intx_domain); + if (port->msi_domain) + irq_domain_remove(port->msi_domain); + + if (port->msi_bottom_domain) + irq_domain_remove(port->msi_bottom_domain); + irq_dispose_mapping(port->irq); } +static void mtk_pcie_msi_handler(struct mtk_pcie_port *port, int set_idx) +{ + struct mtk_msi_set *msi_set = &port->msi_sets[set_idx]; + unsigned long msi_enable, msi_status; + unsigned int virq; + irq_hw_number_t bit, hwirq; + + msi_enable = readl_relaxed(msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET); + + do { + msi_status = readl_relaxed(msi_set->base + + PCIE_MSI_SET_STATUS_OFFSET); + msi_status &= msi_enable; + if (!msi_status) + break; + + for_each_set_bit(bit, &msi_status, PCIE_MSI_IRQS_PER_SET) { + hwirq = bit + set_idx * PCIE_MSI_IRQS_PER_SET; + virq = irq_find_mapping(port->msi_bottom_domain, hwirq); + generic_handle_irq(virq); + } + } while (true); +} + static void mtk_pcie_irq_handler(struct irq_desc *desc) { struct mtk_pcie_port *port = irq_desc_get_handler_data(desc); @@ -405,6 +673,14 @@ static void mtk_pcie_irq_handler(struct irq_desc *desc) generic_handle_irq(virq); } + irq_bit = PCIE_MSI_SHIFT; + for_each_set_bit_from(irq_bit, &status, PCIE_MSI_SET_NUM + + PCIE_MSI_SHIFT) { + mtk_pcie_msi_handler(port, irq_bit - PCIE_MSI_SHIFT); + + writel_relaxed(BIT(irq_bit), port->base + PCIE_INT_STATUS_REG); + } + chained_irq_exit(irqchip, desc); } From d537dc125f0756f7eb9f3a2f878fbe2e3179c452 Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Tue, 20 Apr 2021 14:17:22 +0800 Subject: [PATCH 0764/1379] PCI: mediatek-gen3: Add system PM support Add suspend_noirq and resume_noirq callback functions to implement PM system suspend and resume hooks for the MediaTek Gen3 PCIe controller. When the system suspends, trigger the PCIe link to enter the L2 state and pull down the PERST# pin, gating the clocks of the MAC layer, and then power-off the physical layer to provide power-saving. When the system resumes, the PCIe link should be re-established and the related control register values should be restored. Link: https://lore.kernel.org/r/20210420061723.989-7-jianjun.wang@mediatek.com Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Acked-by: Ryder Lee --- drivers/pci/controller/pcie-mediatek-gen3.c | 113 ++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/drivers/pci/controller/pcie-mediatek-gen3.c b/drivers/pci/controller/pcie-mediatek-gen3.c index 98a2eee75a0a..3c5b97716d40 100644 --- a/drivers/pci/controller/pcie-mediatek-gen3.c +++ b/drivers/pci/controller/pcie-mediatek-gen3.c @@ -45,6 +45,9 @@ #define PCIE_PE_RSTB BIT(3) #define PCIE_LTSSM_STATUS_REG 0x150 +#define PCIE_LTSSM_STATE_MASK GENMASK(28, 24) +#define PCIE_LTSSM_STATE(val) ((val & PCIE_LTSSM_STATE_MASK) >> 24) +#define PCIE_LTSSM_STATE_L2_IDLE 0x14 #define PCIE_LINK_STATUS_REG 0x154 #define PCIE_PORT_LINKUP BIT(8) @@ -73,6 +76,9 @@ #define PCIE_MSI_SET_ADDR_HI_BASE 0xc80 #define PCIE_MSI_SET_ADDR_HI_OFFSET 0x04 +#define PCIE_ICMD_PM_REG 0x198 +#define PCIE_TURN_OFF_LINK BIT(4) + #define PCIE_TRANS_TABLE_BASE_REG 0x800 #define PCIE_ATR_SRC_ADDR_MSB_OFFSET 0x4 #define PCIE_ATR_TRSL_ADDR_LSB_OFFSET 0x8 @@ -95,10 +101,12 @@ * struct mtk_msi_set - MSI information for each set * @base: IO mapped register base * @msg_addr: MSI message address + * @saved_irq_state: IRQ enable state saved at suspend time */ struct mtk_msi_set { void __iomem *base; phys_addr_t msg_addr; + u32 saved_irq_state; }; /** @@ -112,6 +120,7 @@ struct mtk_msi_set { * @clks: PCIe clocks * @num_clks: PCIe clocks count for this port * @irq: PCIe controller interrupt number + * @saved_irq_state: IRQ enable state saved at suspend time * @irq_lock: lock protecting IRQ register access * @intx_domain: legacy INTx IRQ domain * @msi_domain: MSI IRQ domain @@ -131,6 +140,7 @@ struct mtk_pcie_port { int num_clks; int irq; + u32 saved_irq_state; raw_spinlock_t irq_lock; struct irq_domain *intx_domain; struct irq_domain *msi_domain; @@ -896,6 +906,108 @@ static int mtk_pcie_remove(struct platform_device *pdev) return 0; } +static void __maybe_unused mtk_pcie_irq_save(struct mtk_pcie_port *port) +{ + int i; + + raw_spin_lock(&port->irq_lock); + + port->saved_irq_state = readl_relaxed(port->base + PCIE_INT_ENABLE_REG); + + for (i = 0; i < PCIE_MSI_SET_NUM; i++) { + struct mtk_msi_set *msi_set = &port->msi_sets[i]; + + msi_set->saved_irq_state = readl_relaxed(msi_set->base + + PCIE_MSI_SET_ENABLE_OFFSET); + } + + raw_spin_unlock(&port->irq_lock); +} + +static void __maybe_unused mtk_pcie_irq_restore(struct mtk_pcie_port *port) +{ + int i; + + raw_spin_lock(&port->irq_lock); + + writel_relaxed(port->saved_irq_state, port->base + PCIE_INT_ENABLE_REG); + + for (i = 0; i < PCIE_MSI_SET_NUM; i++) { + struct mtk_msi_set *msi_set = &port->msi_sets[i]; + + writel_relaxed(msi_set->saved_irq_state, + msi_set->base + PCIE_MSI_SET_ENABLE_OFFSET); + } + + raw_spin_unlock(&port->irq_lock); +} + +static int __maybe_unused mtk_pcie_turn_off_link(struct mtk_pcie_port *port) +{ + u32 val; + + val = readl_relaxed(port->base + PCIE_ICMD_PM_REG); + val |= PCIE_TURN_OFF_LINK; + writel_relaxed(val, port->base + PCIE_ICMD_PM_REG); + + /* Check the link is L2 */ + return readl_poll_timeout(port->base + PCIE_LTSSM_STATUS_REG, val, + (PCIE_LTSSM_STATE(val) == + PCIE_LTSSM_STATE_L2_IDLE), 20, + 50 * USEC_PER_MSEC); +} + +static int __maybe_unused mtk_pcie_suspend_noirq(struct device *dev) +{ + struct mtk_pcie_port *port = dev_get_drvdata(dev); + int err; + u32 val; + + /* Trigger link to L2 state */ + err = mtk_pcie_turn_off_link(port); + if (err) { + dev_err(port->dev, "cannot enter L2 state\n"); + return err; + } + + /* Pull down the PERST# pin */ + val = readl_relaxed(port->base + PCIE_RST_CTRL_REG); + val |= PCIE_PE_RSTB; + writel_relaxed(val, port->base + PCIE_RST_CTRL_REG); + + dev_dbg(port->dev, "entered L2 states successfully"); + + mtk_pcie_irq_save(port); + mtk_pcie_power_down(port); + + return 0; +} + +static int __maybe_unused mtk_pcie_resume_noirq(struct device *dev) +{ + struct mtk_pcie_port *port = dev_get_drvdata(dev); + int err; + + err = mtk_pcie_power_up(port); + if (err) + return err; + + err = mtk_pcie_startup_port(port); + if (err) { + mtk_pcie_power_down(port); + return err; + } + + mtk_pcie_irq_restore(port); + + return 0; +} + +static const struct dev_pm_ops mtk_pcie_pm_ops = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(mtk_pcie_suspend_noirq, + mtk_pcie_resume_noirq) +}; + static const struct of_device_id mtk_pcie_of_match[] = { { .compatible = "mediatek,mt8192-pcie" }, {}, @@ -907,6 +1019,7 @@ static struct platform_driver mtk_pcie_driver = { .driver = { .name = "mtk-pcie", .of_match_table = mtk_pcie_of_match, + .pm = &mtk_pcie_pm_ops, }, }; From 0739191b848136f733978eae9c37e34435c906af Mon Sep 17 00:00:00 2001 From: Jianjun Wang Date: Tue, 20 Apr 2021 14:17:23 +0800 Subject: [PATCH 0765/1379] MAINTAINERS: Add Jianjun Wang as MediaTek PCI co-maintainer Update entry for MediaTek PCIe controller, add Jianjun Wang as MediaTek PCI co-maintainer. Link: https://lore.kernel.org/r/20210420061723.989-8-jianjun.wang@mediatek.com Signed-off-by: Jianjun Wang Signed-off-by: Lorenzo Pieralisi Acked-by: Ryder Lee --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index d92f85ca831d..8050c14e6a7a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13919,6 +13919,7 @@ F: drivers/pci/controller/dwc/pcie-histb.c PCIE DRIVER FOR MEDIATEK M: Ryder Lee +M: Jianjun Wang L: linux-pci@vger.kernel.org L: linux-mediatek@lists.infradead.org S: Supported From f1ce3986baa62cffc3c5be156994de87524bab99 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 29 Apr 2021 19:59:41 +0300 Subject: [PATCH 0766/1379] nitro_enclaves: Fix stale file descriptors on failed usercopy A failing usercopy of the slot uid will lead to a stale entry in the file descriptor table as put_unused_fd() won't release it. This enables userland to refer to a dangling 'file' object through that still valid file descriptor, leading to all kinds of use-after-free exploitation scenarios. Exchanging put_unused_fd() for close_fd(), ksys_close() or alike won't solve the underlying issue, as the file descriptor might have been replaced in the meantime, e.g. via userland calling close() on it (leading to a NULL pointer dereference in the error handling code as 'fget(enclave_fd)' will return a NULL pointer) or by dup2()'ing a completely different file object to that very file descriptor, leading to the same situation: a dangling file descriptor pointing to a freed object -- just in this case to a file object of user's choosing. Generally speaking, after the call to fd_install() the file descriptor is live and userland is free to do whatever with it. We cannot rely on it to still refer to our enclave object afterwards. In fact, by abusing userfaultfd() userland can hit the condition without any racing and abuse the error handling in the nitro code as it pleases. To fix the above issues, defer the call to fd_install() until all possible errors are handled. In this case it's just the usercopy, so do it directly in ne_create_vm_ioctl() itself. Signed-off-by: Mathias Krause Signed-off-by: Andra Paraschiv Cc: stable Link: https://lore.kernel.org/r/20210429165941.27020-2-andraprs@amazon.com Signed-off-by: Greg Kroah-Hartman --- drivers/virt/nitro_enclaves/ne_misc_dev.c | 43 +++++++++-------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/drivers/virt/nitro_enclaves/ne_misc_dev.c b/drivers/virt/nitro_enclaves/ne_misc_dev.c index f1964ea4b826..e21e1e86ad15 100644 --- a/drivers/virt/nitro_enclaves/ne_misc_dev.c +++ b/drivers/virt/nitro_enclaves/ne_misc_dev.c @@ -1524,7 +1524,8 @@ static const struct file_operations ne_enclave_fops = { * enclave file descriptor to be further used for enclave * resources handling e.g. memory regions and CPUs. * @ne_pci_dev : Private data associated with the PCI device. - * @slot_uid: Generated unique slot id associated with an enclave. + * @slot_uid: User pointer to store the generated unique slot id + * associated with an enclave to. * * Context: Process context. This function is called with the ne_pci_dev enclave * mutex held. @@ -1532,7 +1533,7 @@ static const struct file_operations ne_enclave_fops = { * * Enclave fd on success. * * Negative return value on failure. */ -static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 *slot_uid) +static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_uid) { struct ne_pci_dev_cmd_reply cmd_reply = {}; int enclave_fd = -1; @@ -1634,7 +1635,18 @@ static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 *slot_uid) list_add(&ne_enclave->enclave_list_entry, &ne_pci_dev->enclaves_list); - *slot_uid = ne_enclave->slot_uid; + if (copy_to_user(slot_uid, &ne_enclave->slot_uid, sizeof(ne_enclave->slot_uid))) { + /* + * As we're holding the only reference to 'enclave_file', fput() + * will call ne_enclave_release() which will do a proper cleanup + * of all so far allocated resources, leaving only the unused fd + * for us to free. + */ + fput(enclave_file); + put_unused_fd(enclave_fd); + + return -EFAULT; + } fd_install(enclave_fd, enclave_file); @@ -1671,34 +1683,13 @@ static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case NE_CREATE_VM: { int enclave_fd = -1; - struct file *enclave_file = NULL; struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev; - int rc = -EINVAL; - u64 slot_uid = 0; + u64 __user *slot_uid = (void __user *)arg; mutex_lock(&ne_pci_dev->enclaves_list_mutex); - - enclave_fd = ne_create_vm_ioctl(ne_pci_dev, &slot_uid); - if (enclave_fd < 0) { - rc = enclave_fd; - - mutex_unlock(&ne_pci_dev->enclaves_list_mutex); - - return rc; - } - + enclave_fd = ne_create_vm_ioctl(ne_pci_dev, slot_uid); mutex_unlock(&ne_pci_dev->enclaves_list_mutex); - if (copy_to_user((void __user *)arg, &slot_uid, sizeof(slot_uid))) { - enclave_file = fget(enclave_fd); - /* Decrement file refs to have release() called. */ - fput(enclave_file); - fput(enclave_file); - put_unused_fd(enclave_fd); - - return -EFAULT; - } - return enclave_fd; } From dfc06b389a4f54e78c03abecd5b42ab6ea8d492a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Apr 2021 08:28:59 +0200 Subject: [PATCH 0767/1379] swiotlb: don't override user specified size in swiotlb_adjust_size If the user already specified a swiotlb size on the command line, swiotlb_adjust_size should not overwrite it. Fixes: 2cbc2776efe4 ("swiotlb: remove swiotlb_nr_tbl") Reported-by: Tom Lendacky Tested-by: Tom Lendacky Signed-off-by: Christoph Hellwig Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 8635a57f88e9..8ca7d505d61c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -118,6 +118,8 @@ void __init swiotlb_adjust_size(unsigned long size) * architectures such as those supporting memory encryption to * adjust/expand SWIOTLB size for their use. */ + if (default_nslabs != IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT) + return; size = ALIGN(size, IO_TLB_SIZE); default_nslabs = ALIGN(size >> IO_TLB_SHIFT, IO_TLB_SEGSIZE); pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); From 2840f710f23a3a867426637393acbdfa1f4f1d59 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Apr 2021 16:13:51 +0100 Subject: [PATCH 0768/1379] io_uring: fix drain with rsrc CQEs Resource emitted CQEs are not bound to requests, so fix up counters used for DRAIN/defer logic. Fixes: b60c8dce33895 ("io_uring: preparation for rsrc tagging") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2b32f5f0a40d5928c3466d028f936e167f0654be.1619536280.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 360f81395d81..b0706f047c50 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7539,6 +7539,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) io_ring_submit_lock(ctx, lock_ring); spin_lock_irqsave(&ctx->completion_lock, flags); io_cqring_fill_event(ctx, prsrc->tag, 0, 0); + ctx->cq_extra++; io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); From dddca22636c9062f284e755e2a49fb8863db8a82 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Apr 2021 16:13:52 +0100 Subject: [PATCH 0769/1379] io_uring: dont overlap internal and user req flags CQE flags take one byte that we store in req->flags together with other REQ_F_* internal flags. CQE flags are copied directly into req and then verified that requires some handling on failures, e.g. to make sure that that copy doesn't set some of the internal flags. Move all internal flags to take bits after the first byte, so we don't need extra handling and make it safer overall. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b8b5b02d1ab9d786fcc7db4a3fe86db6b70b8987.1619536280.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b0706f047c50..2872f1db6458 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -702,7 +702,8 @@ enum { REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_FAIL_LINK_BIT, + /* first byte is taken by user flags, shift it to not overlap */ + REQ_F_FAIL_LINK_BIT = 8, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, @@ -6503,14 +6504,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->work.creds = NULL; /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { - req->flags = 0; + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) return -EINVAL; - } - if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) return -EACCES; From b0d658ec88a695861c3fd78ef783c1181f81a6e2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Apr 2021 16:13:53 +0100 Subject: [PATCH 0770/1379] io_uring: add more build check for uapi Add a couple of BUILD_BUG_ON() checking some rsrc uapi structs and SQE flags. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ff960df4d5026b9fb5bfd80994b9d3667d3926da.1619536280.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2872f1db6458..aa5854cd3942 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10134,6 +10134,13 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); + BUILD_BUG_ON(sizeof(struct io_uring_files_update) != + sizeof(struct io_uring_rsrc_update)); + BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > + sizeof(struct io_uring_rsrc_update2)); + /* should fit into one byte */ + BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | From 6224843d56e0c29c0357e86b02b95801897c2caf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 28 Apr 2021 13:11:29 +0100 Subject: [PATCH 0771/1379] io_uring: allow empty slots for reg buffers Allow empty reg buffer slots any request using which should fail. This allows users to not register all buffers in advance, but do it lazily and/or on demand via updates. That is achieved by setting iov_base and iov_len to zero for registration and/or buffer updates. Empty buffer can't have a non-zero tag. Implementation details: to not add extra overhead to io_import_fixed(), create a dummy buffer crafted to fail any request using it, and set it to all empty buffer slots. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7e95e4d700082baaf010c648c72ac764c9cc8826.1619611868.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index aa5854cd3942..47c2f126f885 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -456,6 +456,7 @@ struct io_ring_ctx { spinlock_t rsrc_ref_lock; struct io_rsrc_node *rsrc_node; struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; struct io_restriction restrictions; @@ -1158,6 +1159,12 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; __hash_init(ctx->cancel_hash, 1U << hash_bits); + ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); + if (!ctx->dummy_ubuf) + goto err; + /* set invalid range, so io_import_fixed() fails meeting it */ + ctx->dummy_ubuf->ubuf = -1UL; + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1185,6 +1192,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list); return ctx; err: + kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); kfree(ctx); return NULL; @@ -8109,11 +8117,13 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo struct io_mapped_ubuf *imu = *slot; unsigned int i; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); + if (imu != ctx->dummy_ubuf) { + for (i = 0; i < imu->nr_bvecs; i++) + unpin_user_page(imu->bvec[i].bv_page); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + kvfree(imu); + } *slot = NULL; } @@ -8253,6 +8263,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size_t size; int ret, pret, nr_pages, i; + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } + ubuf = (unsigned long) iov->iov_base; end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; @@ -8350,7 +8365,9 @@ static int io_buffer_validate(struct iovec *iov) * constraints here, we'll -EINVAL later when IO is * submitted if they are wrong. */ - if (!iov->iov_base || !iov->iov_len) + if (!iov->iov_base) + return iov->iov_len ? -EFAULT : 0; + if (!iov->iov_len) return -EFAULT; /* arbitrary limit, but we need something */ @@ -8400,6 +8417,8 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_buffer_validate(&iov); if (ret) break; + if (!iov.iov_base && tag) + return -EINVAL; ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], &last_hpage); @@ -8449,12 +8468,14 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = io_buffer_validate(&iov); if (err) break; + if (!iov.iov_base && tag) + return -EINVAL; err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); if (err) break; i = array_index_nospec(offset, ctx->nr_user_bufs); - if (ctx->user_bufs[i]) { + if (ctx->user_bufs[i] != ctx->dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, offset, ctx->rsrc_node, ctx->user_bufs[i]); if (unlikely(err)) { @@ -8602,6 +8623,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); + kfree(ctx->dummy_ubuf); kfree(ctx); } From 47b228ce6f66830768eac145efa7746637969101 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 29 Apr 2021 11:46:48 +0100 Subject: [PATCH 0772/1379] io_uring: fix unchecked error in switch_start() io_rsrc_node_switch_start() can fail, don't forget to check returned error code. Reported-by: syzbot+a4715dd4b7c866136f79@syzkaller.appspotmail.com Fixes: eae071c9b4cef ("io_uring: prepare fixed rw for dynanic buffers") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c4c06e2f3f0c8e43bd8d0a266c79055bcc6b6e60.1619693112.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 47c2f126f885..82117e6bf45d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9627,7 +9627,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; /* always set a rsrc node */ - io_rsrc_node_switch_start(ctx); + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto err; io_rsrc_node_switch(ctx, NULL); memset(&p->sq_off, 0, sizeof(p->sq_off)); From cf3770e78421f268dee3c1eef5e8a5d284ec3416 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 29 Apr 2021 11:46:02 +0100 Subject: [PATCH 0773/1379] io_uring: Fix premature return from loop and memory leak Currently the -EINVAL error return path is leaking memory allocated to data. Fix this by not returning immediately but instead setting the error return variable to -EINVAL and breaking out of the loop. Kudos to Pavel Begunkov for suggesting a correct fix. Signed-off-by: Colin Ian King Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210429104602.62676-1-colin.king@canonical.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 82117e6bf45d..a880edb90d0c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8417,8 +8417,10 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_buffer_validate(&iov); if (ret) break; - if (!iov.iov_base && tag) - return -EINVAL; + if (!iov.iov_base && tag) { + ret = -EINVAL; + break; + } ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], &last_hpage); @@ -8468,8 +8470,10 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = io_buffer_validate(&iov); if (err) break; - if (!iov.iov_base && tag) - return -EINVAL; + if (!iov.iov_base && tag) { + err = -EINVAL; + break; + } err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); if (err) break; From 7942121b8ca073932529e7122a573ec2d1ed0d93 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sun, 18 Apr 2021 01:52:05 +0200 Subject: [PATCH 0774/1379] rtc: imx-sc: remove .read_alarm The RTC core properly handles RTC without .read_alarm and doesn't use it to set alarms. .read_alarm can be safely dropped. Signed-off-by: Alexandre Belloni Reviewed-by: Fabio Estevam Link: https://lore.kernel.org/r/20210417235205.994119-1-alexandre.belloni@bootlin.com --- drivers/rtc/rtc-imx-sc.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/drivers/rtc/rtc-imx-sc.c b/drivers/rtc/rtc-imx-sc.c index cc9fbab49999..814d516645e2 100644 --- a/drivers/rtc/rtc-imx-sc.c +++ b/drivers/rtc/rtc-imx-sc.c @@ -80,16 +80,6 @@ static int imx_sc_rtc_alarm_irq_enable(struct device *dev, unsigned int enable) return imx_scu_irq_group_enable(SC_IRQ_GROUP_RTC, SC_IRQ_RTC, enable); } -static int imx_sc_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) -{ - /* - * SCU firmware does NOT provide read alarm API, but .read_alarm - * callback is required by RTC framework to support alarm function, - * so just return here. - */ - return 0; -} - static int imx_sc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) { struct imx_sc_msg_timer_rtc_set_alarm msg; @@ -127,7 +117,6 @@ static int imx_sc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) static const struct rtc_class_ops imx_sc_rtc_ops = { .read_time = imx_sc_rtc_read_time, .set_time = imx_sc_rtc_set_time, - .read_alarm = imx_sc_rtc_read_alarm, .set_alarm = imx_sc_rtc_set_alarm, .alarm_irq_enable = imx_sc_rtc_alarm_irq_enable, }; From 64e9d8e4dbc4e9173589ed8d61ea423466172396 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sun, 18 Apr 2021 02:00:21 +0200 Subject: [PATCH 0775/1379] rtc: ds1307: replace HAS_ALARM by RTC_FEATURE_ALARM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The core now has RTC_FEATURE_ALARM for the driver to indicate whether alarms are available. Use that instead of HAS_ALARM to ensure the alarm callbacks are not even called. Tested-by: Łukasz Stelmach Reviewed-by: Łukasz Stelmach Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210418000023.995758-1-alexandre.belloni@bootlin.com --- drivers/rtc/rtc-ds1307.c | 42 +++++++--------------------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index cd8e438bc9c4..76d67c419f7d 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -171,7 +171,6 @@ struct ds1307 { enum ds_type type; unsigned long flags; #define HAS_NVRAM 0 /* bit 0 == sysfs file active */ -#define HAS_ALARM 1 /* bit 1 == irq claimed */ struct device *dev; struct regmap *regmap; const char *name; @@ -411,9 +410,6 @@ static int ds1337_read_alarm(struct device *dev, struct rtc_wkalrm *t) int ret; u8 regs[9]; - if (!test_bit(HAS_ALARM, &ds1307->flags)) - return -EINVAL; - /* read all ALARM1, ALARM2, and status registers at once */ ret = regmap_bulk_read(ds1307->regmap, DS1339_REG_ALARM1_SECS, regs, sizeof(regs)); @@ -454,9 +450,6 @@ static int ds1337_set_alarm(struct device *dev, struct rtc_wkalrm *t) u8 control, status; int ret; - if (!test_bit(HAS_ALARM, &ds1307->flags)) - return -EINVAL; - dev_dbg(dev, "%s secs=%d, mins=%d, " "hours=%d, mday=%d, enabled=%d, pending=%d\n", "alarm set", t->time.tm_sec, t->time.tm_min, @@ -512,9 +505,6 @@ static int ds1307_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct ds1307 *ds1307 = dev_get_drvdata(dev); - if (!test_bit(HAS_ALARM, &ds1307->flags)) - return -ENOTTY; - return regmap_update_bits(ds1307->regmap, DS1337_REG_CONTROL, DS1337_BIT_A1IE, enabled ? DS1337_BIT_A1IE : 0); @@ -592,9 +582,6 @@ static int rx8130_read_alarm(struct device *dev, struct rtc_wkalrm *t) u8 ald[3], ctl[3]; int ret; - if (!test_bit(HAS_ALARM, &ds1307->flags)) - return -EINVAL; - /* Read alarm registers. */ ret = regmap_bulk_read(ds1307->regmap, RX8130_REG_ALARM_MIN, ald, sizeof(ald)); @@ -634,9 +621,6 @@ static int rx8130_set_alarm(struct device *dev, struct rtc_wkalrm *t) u8 ald[3], ctl[3]; int ret; - if (!test_bit(HAS_ALARM, &ds1307->flags)) - return -EINVAL; - dev_dbg(dev, "%s, sec=%d min=%d hour=%d wday=%d mday=%d mon=%d " "enabled=%d pending=%d\n", __func__, t->time.tm_sec, t->time.tm_min, t->time.tm_hour, @@ -681,9 +665,6 @@ static int rx8130_alarm_irq_enable(struct device *dev, unsigned int enabled) struct ds1307 *ds1307 = dev_get_drvdata(dev); int ret, reg; - if (!test_bit(HAS_ALARM, &ds1307->flags)) - return -EINVAL; - ret = regmap_read(ds1307->regmap, RX8130_REG_CONTROL0, ®); if (ret < 0) return ret; @@ -735,9 +716,6 @@ static int mcp794xx_read_alarm(struct device *dev, struct rtc_wkalrm *t) u8 regs[10]; int ret; - if (!test_bit(HAS_ALARM, &ds1307->flags)) - return -EINVAL; - /* Read control and alarm 0 registers. */ ret = regmap_bulk_read(ds1307->regmap, MCP794XX_REG_CONTROL, regs, sizeof(regs)); @@ -793,9 +771,6 @@ static int mcp794xx_set_alarm(struct device *dev, struct rtc_wkalrm *t) unsigned char regs[10]; int wday, ret; - if (!test_bit(HAS_ALARM, &ds1307->flags)) - return -EINVAL; - wday = mcp794xx_alm_weekday(dev, &t->time); if (wday < 0) return wday; @@ -842,9 +817,6 @@ static int mcp794xx_alarm_irq_enable(struct device *dev, unsigned int enabled) { struct ds1307 *ds1307 = dev_get_drvdata(dev); - if (!test_bit(HAS_ALARM, &ds1307->flags)) - return -EINVAL; - return regmap_update_bits(ds1307->regmap, MCP794XX_REG_CONTROL, MCP794XX_BIT_ALM0_EN, enabled ? MCP794XX_BIT_ALM0_EN : 0); @@ -1641,7 +1613,7 @@ static int ds3231_clks_register(struct ds1307 *ds1307) * Interrupt signal due to alarm conditions and square-wave * output share same pin, so don't initialize both. */ - if (i == DS3231_CLK_SQW && test_bit(HAS_ALARM, &ds1307->flags)) + if (i == DS3231_CLK_SQW && test_bit(RTC_FEATURE_ALARM, ds1307->rtc->features)) continue; init.name = ds3231_clks_names[i]; @@ -1964,15 +1936,15 @@ static int ds1307_probe(struct i2c_client *client, bin2bcd(tmp)); } - if (want_irq || ds1307_can_wakeup_device) { - device_set_wakeup_capable(ds1307->dev, true); - set_bit(HAS_ALARM, &ds1307->flags); - } - ds1307->rtc = devm_rtc_allocate_device(ds1307->dev); if (IS_ERR(ds1307->rtc)) return PTR_ERR(ds1307->rtc); + if (want_irq || ds1307_can_wakeup_device) + device_set_wakeup_capable(ds1307->dev, true); + else + clear_bit(RTC_FEATURE_ALARM, ds1307->rtc->features); + if (ds1307_can_wakeup_device && !want_irq) { dev_info(ds1307->dev, "'wakeup-source' is set, request for an IRQ is disabled!\n"); @@ -1988,7 +1960,7 @@ static int ds1307_probe(struct i2c_client *client, if (err) { client->irq = 0; device_set_wakeup_capable(ds1307->dev, false); - clear_bit(HAS_ALARM, &ds1307->flags); + clear_bit(RTC_FEATURE_ALARM, ds1307->rtc->features); dev_err(ds1307->dev, "unable to request IRQ!\n"); } else { dev_dbg(ds1307->dev, "got IRQ %d\n", client->irq); From 4bf84b449a0ea3885397bb5540a8fc68a78edb9d Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sun, 18 Apr 2021 02:00:22 +0200 Subject: [PATCH 0776/1379] rtc: ds1307: remove flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flags is now unused, drop it. Tested-by: Łukasz Stelmach Reviewed-by: Łukasz Stelmach Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210418000023.995758-2-alexandre.belloni@bootlin.com --- drivers/rtc/rtc-ds1307.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 76d67c419f7d..089509d0a3a0 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -169,8 +169,6 @@ enum ds_type { struct ds1307 { enum ds_type type; - unsigned long flags; -#define HAS_NVRAM 0 /* bit 0 == sysfs file active */ struct device *dev; struct regmap *regmap; const char *name; From c55c3a516ceff3a041d5e3253d4d9a1b75fbb1d8 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sun, 18 Apr 2021 02:00:23 +0200 Subject: [PATCH 0777/1379] rtc: rtc_update_irq_enable: rework UIE emulation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the core is aware of whether alarms are available, it is possible to decide whether UIE emulation is required before actually trying to set the alarm. This greatly simplifies rtc_update_irq_enable because there is now only one error value to track and is not relying on the return value of __rtc_set_alarm anymore. Tested-by: Łukasz Stelmach Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210418000023.995758-3-alexandre.belloni@bootlin.com --- drivers/rtc/interface.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index dcb34c73319e..9a2bd4947007 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -545,7 +545,7 @@ EXPORT_SYMBOL_GPL(rtc_alarm_irq_enable); int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) { - int rc = 0, err; + int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) @@ -561,17 +561,21 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) if (rtc->uie_rtctimer.enabled == enabled) goto out; - if (rtc->uie_unsupported) { - err = -EINVAL; - goto out; + if (rtc->uie_unsupported || !test_bit(RTC_FEATURE_ALARM, rtc->features)) { + mutex_unlock(&rtc->ops_lock); +#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL + return rtc_dev_update_irq_enable_emul(rtc, enabled); +#else + return -EINVAL; +#endif } if (enabled) { struct rtc_time tm; ktime_t now, onesec; - rc = __rtc_read_time(rtc, &tm); - if (rc) + err = __rtc_read_time(rtc, &tm); + if (err) goto out; onesec = ktime_set(1, 0); now = rtc_tm_to_ktime(tm); @@ -585,24 +589,6 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) out: mutex_unlock(&rtc->ops_lock); - /* - * __rtc_read_time() failed, this probably means that the RTC time has - * never been set or less probably there is a transient error on the - * bus. In any case, avoid enabling emulation has this will fail when - * reading the time too. - */ - if (rc) - return rc; - -#ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL - /* - * Enable emulation if the driver returned -EINVAL to signal that it has - * been configured without interrupts or they are not available at the - * moment. - */ - if (err == -EINVAL) - err = rtc_dev_update_irq_enable_emul(rtc, enabled); -#endif return err; } EXPORT_SYMBOL_GPL(rtc_update_irq_enable); From 94959a3a04a574b6234df8ff165bf70135b0bb2b Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sun, 18 Apr 2021 02:20:21 +0200 Subject: [PATCH 0778/1379] rtc: pcf8523: remove useless define Drop DRIVER_NAME as it is only used once Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210418002023.1000265-1-alexandre.belloni@bootlin.com --- drivers/rtc/rtc-pcf8523.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-pcf8523.c b/drivers/rtc/rtc-pcf8523.c index 5e1e7b2a8c9a..fe3ab41d8326 100644 --- a/drivers/rtc/rtc-pcf8523.c +++ b/drivers/rtc/rtc-pcf8523.c @@ -9,8 +9,6 @@ #include #include -#define DRIVER_NAME "rtc-pcf8523" - #define REG_CONTROL1 0x00 #define REG_CONTROL1_CAP_SEL BIT(7) #define REG_CONTROL1_STOP BIT(5) @@ -373,7 +371,7 @@ MODULE_DEVICE_TABLE(of, pcf8523_of_match); static struct i2c_driver pcf8523_driver = { .driver = { - .name = DRIVER_NAME, + .name = "rtc-pcf8523", .of_match_table = of_match_ptr(pcf8523_of_match), }, .probe = pcf8523_probe, From 13e37b7fb75dfaeb4f5a72468f0bd32853628d28 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sun, 18 Apr 2021 02:20:22 +0200 Subject: [PATCH 0779/1379] rtc: pcf8523: add alarm support Alarm support requires unconditionally disabling clock out because it is using the int1 pin. Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210418002023.1000265-2-alexandre.belloni@bootlin.com --- drivers/rtc/rtc-pcf8523.c | 179 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) diff --git a/drivers/rtc/rtc-pcf8523.c b/drivers/rtc/rtc-pcf8523.c index fe3ab41d8326..4e5e5caeb8ce 100644 --- a/drivers/rtc/rtc-pcf8523.c +++ b/drivers/rtc/rtc-pcf8523.c @@ -8,10 +8,15 @@ #include #include #include +#include #define REG_CONTROL1 0x00 #define REG_CONTROL1_CAP_SEL BIT(7) #define REG_CONTROL1_STOP BIT(5) +#define REG_CONTROL1_AIE BIT(1) + +#define REG_CONTROL2 0x01 +#define REG_CONTROL2_AF BIT(3) #define REG_CONTROL3 0x02 #define REG_CONTROL3_PM_BLD BIT(7) /* battery low detection disabled */ @@ -30,9 +35,22 @@ #define REG_MONTHS 0x08 #define REG_YEARS 0x09 +#define REG_MINUTE_ALARM 0x0a +#define REG_HOUR_ALARM 0x0b +#define REG_DAY_ALARM 0x0c +#define REG_WEEKDAY_ALARM 0x0d +#define ALARM_DIS BIT(7) + #define REG_OFFSET 0x0e #define REG_OFFSET_MODE BIT(7) +#define REG_TMR_CLKOUT_CTRL 0x0f + +struct pcf8523 { + struct rtc_device *rtc; + struct i2c_client *client; +}; + static int pcf8523_read(struct i2c_client *client, u8 reg, u8 *valuep) { struct i2c_msg msgs[2]; @@ -138,6 +156,27 @@ static int pcf8523_set_pm(struct i2c_client *client, u8 pm) return 0; } +static irqreturn_t pcf8523_irq(int irq, void *dev_id) +{ + struct pcf8523 *pcf8523 = i2c_get_clientdata(dev_id); + u8 value; + int err; + + err = pcf8523_read(pcf8523->client, REG_CONTROL2, &value); + if (err < 0) + return IRQ_HANDLED; + + if (value & REG_CONTROL2_AF) { + value &= ~REG_CONTROL2_AF; + pcf8523_write(pcf8523->client, REG_CONTROL2, value); + rtc_update_irq(pcf8523->rtc, 1, RTC_IRQF | RTC_AF); + + return IRQ_HANDLED; + } + + return IRQ_NONE; +} + static int pcf8523_stop_rtc(struct i2c_client *client) { u8 value; @@ -257,6 +296,111 @@ static int pcf8523_rtc_set_time(struct device *dev, struct rtc_time *tm) return pcf8523_start_rtc(client); } +static int pcf8523_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *tm) +{ + struct i2c_client *client = to_i2c_client(dev); + u8 start = REG_MINUTE_ALARM, regs[4]; + struct i2c_msg msgs[2]; + u8 value; + int err; + + msgs[0].addr = client->addr; + msgs[0].flags = 0; + msgs[0].len = 1; + msgs[0].buf = &start; + + msgs[1].addr = client->addr; + msgs[1].flags = I2C_M_RD; + msgs[1].len = sizeof(regs); + msgs[1].buf = regs; + + err = i2c_transfer(client->adapter, msgs, ARRAY_SIZE(msgs)); + if (err < 0) + return err; + + tm->time.tm_sec = 0; + tm->time.tm_min = bcd2bin(regs[0] & 0x7F); + tm->time.tm_hour = bcd2bin(regs[1] & 0x3F); + tm->time.tm_mday = bcd2bin(regs[2] & 0x3F); + tm->time.tm_wday = bcd2bin(regs[3] & 0x7); + + err = pcf8523_read(client, REG_CONTROL1, &value); + if (err < 0) + return err; + tm->enabled = !!(value & REG_CONTROL1_AIE); + + err = pcf8523_read(client, REG_CONTROL2, &value); + if (err < 0) + return err; + tm->pending = !!(value & REG_CONTROL2_AF); + + return 0; +} + +static int pcf8523_irq_enable(struct device *dev, unsigned int enabled) +{ + struct i2c_client *client = to_i2c_client(dev); + u8 value; + int err; + + err = pcf8523_read(client, REG_CONTROL1, &value); + if (err < 0) + return err; + + value &= REG_CONTROL1_AIE; + + if (enabled) + value |= REG_CONTROL1_AIE; + + err = pcf8523_write(client, REG_CONTROL1, value); + if (err < 0) + return err; + + return 0; +} + +static int pcf8523_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *tm) +{ + struct i2c_client *client = to_i2c_client(dev); + struct i2c_msg msg; + u8 regs[5]; + int err; + + err = pcf8523_irq_enable(dev, 0); + if (err) + return err; + + err = pcf8523_write(client, REG_CONTROL2, 0); + if (err < 0) + return err; + + /* The alarm has no seconds, round up to nearest minute */ + if (tm->time.tm_sec) { + time64_t alarm_time = rtc_tm_to_time64(&tm->time); + + alarm_time += 60 - tm->time.tm_sec; + rtc_time64_to_tm(alarm_time, &tm->time); + } + + regs[0] = REG_MINUTE_ALARM; + regs[1] = bin2bcd(tm->time.tm_min); + regs[2] = bin2bcd(tm->time.tm_hour); + regs[3] = bin2bcd(tm->time.tm_mday); + regs[4] = ALARM_DIS; + msg.addr = client->addr; + msg.flags = 0; + msg.len = sizeof(regs); + msg.buf = regs; + err = i2c_transfer(client->adapter, &msg, 1); + if (err < 0) + return err; + + if (tm->enabled) + return pcf8523_irq_enable(dev, tm->enabled); + + return 0; +} + #ifdef CONFIG_RTC_INTF_DEV static int pcf8523_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) @@ -320,6 +464,9 @@ static int pcf8523_rtc_set_offset(struct device *dev, long offset) static const struct rtc_class_ops pcf8523_rtc_ops = { .read_time = pcf8523_rtc_read_time, .set_time = pcf8523_rtc_set_time, + .read_alarm = pcf8523_rtc_read_alarm, + .set_alarm = pcf8523_rtc_set_alarm, + .alarm_irq_enable = pcf8523_irq_enable, .ioctl = pcf8523_rtc_ioctl, .read_offset = pcf8523_rtc_read_offset, .set_offset = pcf8523_rtc_set_offset, @@ -328,12 +475,21 @@ static const struct rtc_class_ops pcf8523_rtc_ops = { static int pcf8523_probe(struct i2c_client *client, const struct i2c_device_id *id) { + struct pcf8523 *pcf8523; struct rtc_device *rtc; + bool wakeup_source = false; int err; if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) return -ENODEV; + pcf8523 = devm_kzalloc(&client->dev, sizeof(struct pcf8523), GFP_KERNEL); + if (!pcf8523) + return -ENOMEM; + + i2c_set_clientdata(client, pcf8523); + pcf8523->client = client; + err = pcf8523_load_capacitance(client); if (err < 0) dev_warn(&client->dev, "failed to set xtal load capacitance: %d", @@ -347,9 +503,32 @@ static int pcf8523_probe(struct i2c_client *client, if (IS_ERR(rtc)) return PTR_ERR(rtc); + pcf8523->rtc = rtc; rtc->ops = &pcf8523_rtc_ops; rtc->range_min = RTC_TIMESTAMP_BEGIN_2000; rtc->range_max = RTC_TIMESTAMP_END_2099; + rtc->uie_unsupported = 1; + + if (client->irq > 0) { + err = pcf8523_write(client, REG_TMR_CLKOUT_CTRL, 0x38); + if (err < 0) + return err; + + err = devm_request_threaded_irq(&client->dev, client->irq, + NULL, pcf8523_irq, + IRQF_SHARED | IRQF_ONESHOT | IRQF_TRIGGER_LOW, + dev_name(&rtc->dev), client); + if (err) + return err; + + dev_pm_set_wake_irq(&client->dev, client->irq); + } + +#ifdef CONFIG_OF + wakeup_source = of_property_read_bool(client->dev.of_node, "wakeup-source"); +#endif + if (client->irq > 0 || wakeup_source) + device_init_wakeup(&client->dev, true); return devm_rtc_register_device(rtc); } From a1cfe7cc3873baf83a26356cb5e10409c6fb942c Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sun, 18 Apr 2021 02:20:23 +0200 Subject: [PATCH 0780/1379] rtc: pcf8523: report oscillator failures Report oscillator failures and invalid date/time on RTC_VL_READ. Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210418002023.1000265-3-alexandre.belloni@bootlin.com --- drivers/rtc/rtc-pcf8523.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-pcf8523.c b/drivers/rtc/rtc-pcf8523.c index 4e5e5caeb8ce..740e2136ca98 100644 --- a/drivers/rtc/rtc-pcf8523.c +++ b/drivers/rtc/rtc-pcf8523.c @@ -406,6 +406,8 @@ static int pcf8523_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) { struct i2c_client *client = to_i2c_client(dev); + unsigned int flags = 0; + u8 value; int ret; switch (cmd) { @@ -414,9 +416,16 @@ static int pcf8523_rtc_ioctl(struct device *dev, unsigned int cmd, if (ret < 0) return ret; if (ret) - ret = RTC_VL_BACKUP_LOW; + flags |= RTC_VL_BACKUP_LOW; - return put_user(ret, (unsigned int __user *)arg); + ret = pcf8523_read(client, REG_SECONDS, &value); + if (ret < 0) + return ret; + + if (value & REG_SECONDS_OS) + flags |= RTC_VL_DATA_INVALID; + + return put_user(flags, (unsigned int __user *)arg); default: return -ENOIOCTLCMD; From 204756f016726a380bafe619438ed979088bd04a Mon Sep 17 00:00:00 2001 From: Nobuhiro Iwamatsu Date: Tue, 20 Apr 2021 11:39:17 +0900 Subject: [PATCH 0781/1379] rtc: ds1307: Fix wday settings for rx8130 rx8130 wday specifies the bit position, not BCD. Fixes: ee0981be7704 ("rtc: ds1307: Add support for Epson RX8130CE") Signed-off-by: Nobuhiro Iwamatsu Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210420023917.1949066-1-nobuhiro1.iwamatsu@toshiba.co.jp --- drivers/rtc/rtc-ds1307.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index 089509d0a3a0..336cb9aa5e33 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -293,7 +293,11 @@ static int ds1307_get_time(struct device *dev, struct rtc_time *t) t->tm_min = bcd2bin(regs[DS1307_REG_MIN] & 0x7f); tmp = regs[DS1307_REG_HOUR] & 0x3f; t->tm_hour = bcd2bin(tmp); - t->tm_wday = bcd2bin(regs[DS1307_REG_WDAY] & 0x07) - 1; + /* rx8130 is bit position, not BCD */ + if (ds1307->type == rx_8130) + t->tm_wday = fls(regs[DS1307_REG_WDAY] & 0x7f); + else + t->tm_wday = bcd2bin(regs[DS1307_REG_WDAY] & 0x07) - 1; t->tm_mday = bcd2bin(regs[DS1307_REG_MDAY] & 0x3f); tmp = regs[DS1307_REG_MONTH] & 0x1f; t->tm_mon = bcd2bin(tmp) - 1; @@ -340,7 +344,11 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t) regs[DS1307_REG_SECS] = bin2bcd(t->tm_sec); regs[DS1307_REG_MIN] = bin2bcd(t->tm_min); regs[DS1307_REG_HOUR] = bin2bcd(t->tm_hour); - regs[DS1307_REG_WDAY] = bin2bcd(t->tm_wday + 1); + /* rx8130 is bit position, not BCD */ + if (ds1307->type == rx_8130) + regs[DS1307_REG_WDAY] = 1 << t->tm_wday; + else + regs[DS1307_REG_WDAY] = bin2bcd(t->tm_wday + 1); regs[DS1307_REG_MDAY] = bin2bcd(t->tm_mday); regs[DS1307_REG_MONTH] = bin2bcd(t->tm_mon + 1); From fefbec3a741831bc7791a94a483ad55665160b50 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 20 Apr 2021 19:02:42 +0200 Subject: [PATCH 0782/1379] rtc: s5m: Remove reference to parent's device pdata The S5M RTC driver does not use parent's device (sec-core PMIC driver) platform data so there is no need to check for it. Signed-off-by: Krzysztof Kozlowski Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210420170244.13467-3-krzysztof.kozlowski@canonical.com --- drivers/rtc/rtc-s5m.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-s5m.c b/drivers/rtc/rtc-s5m.c index 80b66f16db89..038269a6b08c 100644 --- a/drivers/rtc/rtc-s5m.c +++ b/drivers/rtc/rtc-s5m.c @@ -713,16 +713,10 @@ static int s5m8767_rtc_init_reg(struct s5m_rtc_info *info) static int s5m_rtc_probe(struct platform_device *pdev) { struct sec_pmic_dev *s5m87xx = dev_get_drvdata(pdev->dev.parent); - struct sec_platform_data *pdata = s5m87xx->pdata; struct s5m_rtc_info *info; const struct regmap_config *regmap_cfg; int ret, alarm_irq; - if (!pdata) { - dev_err(pdev->dev.parent, "Platform data not supplied\n"); - return -ENODEV; - } - info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; From e463786f380ab28f1ca6e34ea65bbc2e03b2d325 Mon Sep 17 00:00:00 2001 From: Dario Binacchi Date: Sun, 25 Apr 2021 16:59:23 +0200 Subject: [PATCH 0783/1379] rtc: omap: use rtc_write to access OMAP_RTC_OSC_REG The RTC_OSC_REG register is 32-bit, but the useful information is found in the 7 least significant bits (bits 7-31 are reserved). And in fact, as you can see from the code, all read accesses are 8-bit, as well as some writes. Let's make sure all writes are 8-bit. Moreover, in contexts where consecutive reads / writes after the busy check must take place within 15 us, it is better not to waste time on useless accesses. Signed-off-by: Dario Binacchi Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210425145924.23353-1-dariobin@libero.it --- drivers/rtc/rtc-omap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index dc7db2477f88..d46e0f0cc502 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -786,8 +786,7 @@ static int omap_rtc_probe(struct platform_device *pdev) /* enable RTC functional clock */ if (rtc->type->has_32kclk_en) { reg = rtc_read(rtc, OMAP_RTC_OSC_REG); - rtc_writel(rtc, OMAP_RTC_OSC_REG, - reg | OMAP_RTC_OSC_32KCLK_EN); + rtc_write(rtc, OMAP_RTC_OSC_REG, reg | OMAP_RTC_OSC_32KCLK_EN); } /* clear old status */ @@ -845,7 +844,7 @@ static int omap_rtc_probe(struct platform_device *pdev) reg = rtc_read(rtc, OMAP_RTC_OSC_REG); reg &= ~OMAP_RTC_OSC_OSC32K_GZ_DISABLE; reg |= OMAP_RTC_OSC_32KCLK_EN | OMAP_RTC_OSC_SEL_32KCLK_SRC; - rtc_writel(rtc, OMAP_RTC_OSC_REG, reg); + rtc_write(rtc, OMAP_RTC_OSC_REG, reg); } rtc->type->lock(rtc); From 4d0185e67806a233c423c1668e87e137fbda192c Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 29 Apr 2021 23:44:03 +0200 Subject: [PATCH 0784/1379] rtc: sysfs: check features instead of ops Test RTC_FEATURE_ALARM instead of relying on .set_alarm to know whether alarms are available. Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210429214403.2610952-1-alexandre.belloni@bootlin.com --- drivers/rtc/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/sysfs.c b/drivers/rtc/sysfs.c index 8a957d31a1a4..74026f67fdfb 100644 --- a/drivers/rtc/sysfs.c +++ b/drivers/rtc/sysfs.c @@ -273,7 +273,7 @@ static bool rtc_does_wakealarm(struct rtc_device *rtc) if (!device_can_wakeup(rtc->dev.parent)) return false; - return rtc->ops->set_alarm != NULL; + return !!test_bit(RTC_FEATURE_ALARM, rtc->features); } static umode_t rtc_attr_is_visible(struct kobject *kobj, From 94604548aa7163fa14b837149bb0cb708bc613bc Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Tue, 27 Apr 2021 17:44:04 +0200 Subject: [PATCH 0785/1379] seg6: add counters support for SRv6 Behaviors This patch provides counters for SRv6 Behaviors as defined in [1], section 6. For each SRv6 Behavior instance, counters defined in [1] are: - the total number of packets that have been correctly processed; - the total amount of traffic in bytes of all packets that have been correctly processed; In addition, this patch introduces a new counter that counts the number of packets that have NOT been properly processed (i.e. errors) by an SRv6 Behavior instance. Counters are not only interesting for network monitoring purposes (i.e. counting the number of packets processed by a given behavior) but they also provide a simple tool for checking whether a behavior instance is working as we expect or not. Counters can be useful for troubleshooting misconfigured SRv6 networks. Indeed, an SRv6 Behavior can silently drop packets for very different reasons (i.e. wrong SID configuration, interfaces set with SID addresses, etc) without any notification/message to the user. Due to the nature of SRv6 networks, diagnostic tools such as ping and traceroute may be ineffective: paths used for reaching a given router can be totally different from the ones followed by probe packets. In addition, paths are often asymmetrical and this makes it even more difficult to keep up with the journey of the packets and to understand which behaviors are actually processing our traffic. When counters are enabled on an SRv6 Behavior instance, it is possible to verify if packets are actually processed by such behavior and what is the outcome of the processing. Therefore, the counters for SRv6 Behaviors offer an non-invasive observability point which can be leveraged for both traffic monitoring and troubleshooting purposes. [1] https://www.rfc-editor.org/rfc/rfc8986.html#name-counters Troubleshooting using SRv6 Behavior counters -------------------------------------------- Let's make a brief example to see how helpful counters can be for SRv6 networks. Let's consider a node where an SRv6 End Behavior receives an SRv6 packet whose Segment Left (SL) is equal to 0. In this case, the End Behavior (which accepts only packets with SL >= 1) discards the packet and increases the error counter. This information can be leveraged by the network operator for troubleshooting. Indeed, the error counter is telling the user that the packet: (i) arrived at the node; (ii) the packet has been taken into account by the SRv6 End behavior; (iii) but an error has occurred during the processing. The error (iii) could be caused by different reasons, such as wrong route settings on the node or due to an invalid SID List carried by the SRv6 packet. Anyway, the error counter is used to exclude that the packet did not arrive at the node or it has not been processed by the behavior at all. Turning on/off counters for SRv6 Behaviors ------------------------------------------ Each SRv6 Behavior instance can be configured, at the time of its creation, to make use of counters. This is done through iproute2 which allows the user to create an SRv6 Behavior instance specifying the optional "count" attribute as shown in the following example: $ ip -6 route add 2001:db8::1 encap seg6local action End count dev eth0 per-behavior counters can be shown by adding "-s" to the iproute2 command line, i.e.: $ ip -s -6 route show 2001:db8::1 2001:db8::1 encap seg6local action End packets 0 bytes 0 errors 0 dev eth0 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Impact of counters for SRv6 Behaviors on performance ==================================================== To determine the performance impact due to the introduction of counters in the SRv6 Behavior subsystem, we have carried out extensive tests. We chose to test the throughput achieved by the SRv6 End.DX2 Behavior because, among all the other behaviors implemented so far, it reaches the highest throughput which is around 1.5 Mpps (per core at 2.4 GHz on a Xeon(R) CPU E5-2630 v3) on kernel 5.12-rc2 using packets of size ~ 100 bytes. Three different tests were conducted in order to evaluate the overall throughput of the SRv6 End.DX2 Behavior in the following scenarios: 1) vanilla kernel (without the SRv6 Behavior counters patch) and a single instance of an SRv6 End.DX2 Behavior; 2) patched kernel with SRv6 Behavior counters and a single instance of an SRv6 End.DX2 Behavior with counters turned off; 3) patched kernel with SRv6 Behavior counters and a single instance of SRv6 End.DX2 Behavior with counters turned on. All tests were performed on a testbed deployed on the CloudLab facilities [2], a flexible infrastructure dedicated to scientific research on the future of Cloud Computing. Results of tests are shown in the following table: Scenario (1): average 1504764,81 pps (~1504,76 kpps); std. dev 3956,82 pps Scenario (2): average 1501469,78 pps (~1501,47 kpps); std. dev 2979,85 pps Scenario (3): average 1501315,13 pps (~1501,32 kpps); std. dev 2956,00 pps As can be observed, throughputs achieved in scenarios (2),(3) did not suffer any observable degradation compared to scenario (1). Thanks to Jakub Kicinski and David Ahern for their valuable suggestions and comments provided during the discussion of the proposed RFCs. [2] https://www.cloudlab.us Signed-off-by: Andrea Mayer Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/seg6_local.h | 30 +++++ net/ipv6/seg6_local.c | 198 +++++++++++++++++++++++++++++++- 2 files changed, 226 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index 3b39ef1dbb46..5ae3ace84de0 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -27,6 +27,7 @@ enum { SEG6_LOCAL_OIF, SEG6_LOCAL_BPF, SEG6_LOCAL_VRFTABLE, + SEG6_LOCAL_COUNTERS, __SEG6_LOCAL_MAX, }; #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) @@ -78,4 +79,33 @@ enum { #define SEG6_LOCAL_BPF_PROG_MAX (__SEG6_LOCAL_BPF_PROG_MAX - 1) +/* SRv6 Behavior counters are encoded as netlink attributes guaranteeing the + * correct alignment. + * Each counter is identified by a different attribute type (i.e. + * SEG6_LOCAL_CNT_PACKETS). + * + * - SEG6_LOCAL_CNT_PACKETS: identifies a counter that counts the number of + * packets that have been CORRECTLY processed by an SRv6 Behavior instance + * (i.e., packets that generate errors or are dropped are NOT counted). + * + * - SEG6_LOCAL_CNT_BYTES: identifies a counter that counts the total amount + * of traffic in bytes of all packets that have been CORRECTLY processed by + * an SRv6 Behavior instance (i.e., packets that generate errors or are + * dropped are NOT counted). + * + * - SEG6_LOCAL_CNT_ERRORS: identifies a counter that counts the number of + * packets that have NOT been properly processed by an SRv6 Behavior instance + * (i.e., packets that generate errors or are dropped). + */ +enum { + SEG6_LOCAL_CNT_UNSPEC, + SEG6_LOCAL_CNT_PAD, /* pad for 64 bits values */ + SEG6_LOCAL_CNT_PACKETS, + SEG6_LOCAL_CNT_BYTES, + SEG6_LOCAL_CNT_ERRORS, + __SEG6_LOCAL_CNT_MAX, +}; + +#define SEG6_LOCAL_CNT_MAX (__SEG6_LOCAL_CNT_MAX - 1) + #endif diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index bd7140885e60..4ff38cb08f4b 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -93,6 +93,35 @@ struct seg6_end_dt_info { int hdrlen; }; +struct pcpu_seg6_local_counters { + u64_stats_t packets; + u64_stats_t bytes; + u64_stats_t errors; + + struct u64_stats_sync syncp; +}; + +/* This struct groups all the SRv6 Behavior counters supported so far. + * + * put_nla_counters() makes use of this data structure to collect all counter + * values after the per-CPU counter evaluation has been performed. + * Finally, each counter value (in seg6_local_counters) is stored in the + * corresponding netlink attribute and sent to user space. + * + * NB: we don't want to expose this structure to user space! + */ +struct seg6_local_counters { + __u64 packets; + __u64 bytes; + __u64 errors; +}; + +#define seg6_local_alloc_pcpu_counters(__gfp) \ + __netdev_alloc_pcpu_stats(struct pcpu_seg6_local_counters, \ + ((__gfp) | __GFP_ZERO)) + +#define SEG6_F_LOCAL_COUNTERS SEG6_F_ATTR(SEG6_LOCAL_COUNTERS) + struct seg6_local_lwt { int action; struct ipv6_sr_hdr *srh; @@ -105,6 +134,7 @@ struct seg6_local_lwt { #ifdef CONFIG_NET_L3_MASTER_DEV struct seg6_end_dt_info dt_info; #endif + struct pcpu_seg6_local_counters __percpu *pcpu_counters; int headroom; struct seg6_action_desc *desc; @@ -878,36 +908,43 @@ static struct seg6_action_desc seg6_action_table[] = { { .action = SEG6_LOCAL_ACTION_END, .attrs = 0, + .optattrs = SEG6_F_LOCAL_COUNTERS, .input = input_action_end, }, { .action = SEG6_LOCAL_ACTION_END_X, .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6), + .optattrs = SEG6_F_LOCAL_COUNTERS, .input = input_action_end_x, }, { .action = SEG6_LOCAL_ACTION_END_T, .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE), + .optattrs = SEG6_F_LOCAL_COUNTERS, .input = input_action_end_t, }, { .action = SEG6_LOCAL_ACTION_END_DX2, .attrs = SEG6_F_ATTR(SEG6_LOCAL_OIF), + .optattrs = SEG6_F_LOCAL_COUNTERS, .input = input_action_end_dx2, }, { .action = SEG6_LOCAL_ACTION_END_DX6, .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6), + .optattrs = SEG6_F_LOCAL_COUNTERS, .input = input_action_end_dx6, }, { .action = SEG6_LOCAL_ACTION_END_DX4, .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH4), + .optattrs = SEG6_F_LOCAL_COUNTERS, .input = input_action_end_dx4, }, { .action = SEG6_LOCAL_ACTION_END_DT4, .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE), + .optattrs = SEG6_F_LOCAL_COUNTERS, #ifdef CONFIG_NET_L3_MASTER_DEV .input = input_action_end_dt4, .slwt_ops = { @@ -919,30 +956,35 @@ static struct seg6_action_desc seg6_action_table[] = { .action = SEG6_LOCAL_ACTION_END_DT6, #ifdef CONFIG_NET_L3_MASTER_DEV .attrs = 0, - .optattrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE) | + .optattrs = SEG6_F_LOCAL_COUNTERS | + SEG6_F_ATTR(SEG6_LOCAL_TABLE) | SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE), .slwt_ops = { .build_state = seg6_end_dt6_build, }, #else .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE), + .optattrs = SEG6_F_LOCAL_COUNTERS, #endif .input = input_action_end_dt6, }, { .action = SEG6_LOCAL_ACTION_END_B6, .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH), + .optattrs = SEG6_F_LOCAL_COUNTERS, .input = input_action_end_b6, }, { .action = SEG6_LOCAL_ACTION_END_B6_ENCAP, .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH), + .optattrs = SEG6_F_LOCAL_COUNTERS, .input = input_action_end_b6_encap, .static_headroom = sizeof(struct ipv6hdr), }, { .action = SEG6_LOCAL_ACTION_END_BPF, .attrs = SEG6_F_ATTR(SEG6_LOCAL_BPF), + .optattrs = SEG6_F_LOCAL_COUNTERS, .input = input_action_end_bpf, }, @@ -963,11 +1005,36 @@ static struct seg6_action_desc *__get_action_desc(int action) return NULL; } +static bool seg6_lwtunnel_counters_enabled(struct seg6_local_lwt *slwt) +{ + return slwt->parsed_optattrs & SEG6_F_LOCAL_COUNTERS; +} + +static void seg6_local_update_counters(struct seg6_local_lwt *slwt, + unsigned int len, int err) +{ + struct pcpu_seg6_local_counters *pcounters; + + pcounters = this_cpu_ptr(slwt->pcpu_counters); + u64_stats_update_begin(&pcounters->syncp); + + if (likely(!err)) { + u64_stats_inc(&pcounters->packets); + u64_stats_add(&pcounters->bytes, len); + } else { + u64_stats_inc(&pcounters->errors); + } + + u64_stats_update_end(&pcounters->syncp); +} + static int seg6_local_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct seg6_action_desc *desc; struct seg6_local_lwt *slwt; + unsigned int len = skb->len; + int rc; if (skb->protocol != htons(ETH_P_IPV6)) { kfree_skb(skb); @@ -977,7 +1044,14 @@ static int seg6_local_input(struct sk_buff *skb) slwt = seg6_local_lwtunnel(orig_dst->lwtstate); desc = slwt->desc; - return desc->input(skb, slwt); + rc = desc->input(skb, slwt); + + if (!seg6_lwtunnel_counters_enabled(slwt)) + return rc; + + seg6_local_update_counters(slwt, len, rc); + + return rc; } static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { @@ -992,6 +1066,7 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, + [SEG6_LOCAL_COUNTERS] = { .type = NLA_NESTED }, }; static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) @@ -1296,6 +1371,112 @@ static void destroy_attr_bpf(struct seg6_local_lwt *slwt) bpf_prog_put(slwt->bpf.prog); } +static const struct +nla_policy seg6_local_counters_policy[SEG6_LOCAL_CNT_MAX + 1] = { + [SEG6_LOCAL_CNT_PACKETS] = { .type = NLA_U64 }, + [SEG6_LOCAL_CNT_BYTES] = { .type = NLA_U64 }, + [SEG6_LOCAL_CNT_ERRORS] = { .type = NLA_U64 }, +}; + +static int parse_nla_counters(struct nlattr **attrs, + struct seg6_local_lwt *slwt) +{ + struct pcpu_seg6_local_counters __percpu *pcounters; + struct nlattr *tb[SEG6_LOCAL_CNT_MAX + 1]; + int ret; + + ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_CNT_MAX, + attrs[SEG6_LOCAL_COUNTERS], + seg6_local_counters_policy, NULL); + if (ret < 0) + return ret; + + /* basic support for SRv6 Behavior counters requires at least: + * packets, bytes and errors. + */ + if (!tb[SEG6_LOCAL_CNT_PACKETS] || !tb[SEG6_LOCAL_CNT_BYTES] || + !tb[SEG6_LOCAL_CNT_ERRORS]) + return -EINVAL; + + /* counters are always zero initialized */ + pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL); + if (!pcounters) + return -ENOMEM; + + slwt->pcpu_counters = pcounters; + + return 0; +} + +static int seg6_local_fill_nla_counters(struct sk_buff *skb, + struct seg6_local_counters *counters) +{ + if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_PACKETS, counters->packets, + SEG6_LOCAL_CNT_PAD)) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_BYTES, counters->bytes, + SEG6_LOCAL_CNT_PAD)) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_ERRORS, counters->errors, + SEG6_LOCAL_CNT_PAD)) + return -EMSGSIZE; + + return 0; +} + +static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct seg6_local_counters counters = { 0, 0, 0 }; + struct nlattr *nest; + int rc, i; + + nest = nla_nest_start(skb, SEG6_LOCAL_COUNTERS); + if (!nest) + return -EMSGSIZE; + + for_each_possible_cpu(i) { + struct pcpu_seg6_local_counters *pcounters; + u64 packets, bytes, errors; + unsigned int start; + + pcounters = per_cpu_ptr(slwt->pcpu_counters, i); + do { + start = u64_stats_fetch_begin_irq(&pcounters->syncp); + + packets = u64_stats_read(&pcounters->packets); + bytes = u64_stats_read(&pcounters->bytes); + errors = u64_stats_read(&pcounters->errors); + + } while (u64_stats_fetch_retry_irq(&pcounters->syncp, start)); + + counters.packets += packets; + counters.bytes += bytes; + counters.errors += errors; + } + + rc = seg6_local_fill_nla_counters(skb, &counters); + if (rc < 0) { + nla_nest_cancel(skb, nest); + return rc; + } + + return nla_nest_end(skb, nest); +} + +static int cmp_nla_counters(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + /* a and b are equal if both have pcpu_counters set or not */ + return (!!((unsigned long)a->pcpu_counters)) ^ + (!!((unsigned long)b->pcpu_counters)); +} + +static void destroy_attr_counters(struct seg6_local_lwt *slwt) +{ + free_percpu(slwt->pcpu_counters); +} + struct seg6_action_param { int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); @@ -1343,6 +1524,10 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { .put = put_nla_vrftable, .cmp = cmp_nla_vrftable }, + [SEG6_LOCAL_COUNTERS] = { .parse = parse_nla_counters, + .put = put_nla_counters, + .cmp = cmp_nla_counters, + .destroy = destroy_attr_counters }, }; /* call the destroy() callback (if available) for each set attribute in @@ -1645,6 +1830,15 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) if (attrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE)) nlsize += nla_total_size(4); + if (attrs & SEG6_F_LOCAL_COUNTERS) + nlsize += nla_total_size(0) + /* nest SEG6_LOCAL_COUNTERS */ + /* SEG6_LOCAL_CNT_PACKETS */ + nla_total_size_64bit(sizeof(__u64)) + + /* SEG6_LOCAL_CNT_BYTES */ + nla_total_size_64bit(sizeof(__u64)) + + /* SEG6_LOCAL_CNT_ERRORS */ + nla_total_size_64bit(sizeof(__u64)); + return nlsize; } From 7c0ea5930c1c211931819d83cfb157bff1539a4c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 28 Apr 2021 15:23:07 +0200 Subject: [PATCH 0786/1379] openvswitch: fix stack OOB read while fragmenting IPv4 packets running openvswitch on kernels built with KASAN, it's possible to see the following splat while testing fragmentation of IPv4 packets: BUG: KASAN: stack-out-of-bounds in ip_do_fragment+0x1b03/0x1f60 Read of size 1 at addr ffff888112fc713c by task handler2/1367 CPU: 0 PID: 1367 Comm: handler2 Not tainted 5.12.0-rc6+ #418 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 Call Trace: dump_stack+0x92/0xc1 print_address_description.constprop.7+0x1a/0x150 kasan_report.cold.13+0x7f/0x111 ip_do_fragment+0x1b03/0x1f60 ovs_fragment+0x5bf/0x840 [openvswitch] do_execute_actions+0x1bd5/0x2400 [openvswitch] ovs_execute_actions+0xc8/0x3d0 [openvswitch] ovs_packet_cmd_execute+0xa39/0x1150 [openvswitch] genl_family_rcv_msg_doit.isra.15+0x227/0x2d0 genl_rcv_msg+0x287/0x490 netlink_rcv_skb+0x120/0x380 genl_rcv+0x24/0x40 netlink_unicast+0x439/0x630 netlink_sendmsg+0x719/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5ba/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f957079db07 Code: c3 66 90 41 54 41 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 e8 eb ec ff ff 44 89 e2 48 89 ee 89 df 41 89 c0 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 35 44 89 c7 48 89 44 24 08 e8 24 ed ff ff 48 RSP: 002b:00007f956ce35a50 EFLAGS: 00000293 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000019 RCX: 00007f957079db07 RDX: 0000000000000000 RSI: 00007f956ce35ae0 RDI: 0000000000000019 RBP: 00007f956ce35ae0 R08: 0000000000000000 R09: 00007f9558006730 R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000 R13: 00007f956ce37308 R14: 00007f956ce35f80 R15: 00007f956ce35ae0 The buggy address belongs to the page: page:00000000af2a1d93 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x112fc7 flags: 0x17ffffc0000000() raw: 0017ffffc0000000 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected addr ffff888112fc713c is located in stack of task handler2/1367 at offset 180 in frame: ovs_fragment+0x0/0x840 [openvswitch] this frame has 2 objects: [32, 144) 'ovs_dst' [192, 424) 'ovs_rt' Memory state around the buggy address: ffff888112fc7000: f3 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff888112fc7080: 00 f1 f1 f1 f1 00 00 00 00 00 00 00 00 00 00 00 >ffff888112fc7100: 00 00 00 f2 f2 f2 f2 f2 f2 00 00 00 00 00 00 00 ^ ffff888112fc7180: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff888112fc7200: 00 00 00 00 00 00 f2 f2 f2 00 00 00 00 00 00 00 for IPv4 packets, ovs_fragment() uses a temporary struct dst_entry. Then, in the following call graph: ip_do_fragment() ip_skb_dst_mtu() ip_dst_mtu_maybe_forward() ip_mtu_locked() the pointer to struct dst_entry is used as pointer to struct rtable: this turns the access to struct members like rt_mtu_locked into an OOB read in the stack. Fix this changing the temporary variable used for IPv4 packets in ovs_fragment(), similarly to what is done for IPv6 few lines below. Fixes: d52e5a7e7ca4 ("ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmt") Cc: Acked-by: Eelco Chaudron Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 92a0b67b2728..77d924ab8cdb 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -827,17 +827,17 @@ static void ovs_fragment(struct net *net, struct vport *vport, } if (key->eth.type == htons(ETH_P_IP)) { - struct dst_entry ovs_dst; + struct rtable ovs_rt = { 0 }; unsigned long orig_dst; prepare_frag(vport, skb, orig_network_offset, ovs_key_mac_proto(key)); - dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOCOUNT); - ovs_dst.dev = vport->dev; + ovs_rt.dst.dev = vport->dev; orig_dst = skb->_skb_refdst; - skb_dst_set_noref(skb, &ovs_dst); + skb_dst_set_noref(skb, &ovs_rt.dst); IPCB(skb)->frag_max_size = mru; ip_do_fragment(net, skb->sk, skb, ovs_vport_output); From 31fe34a0118e0acc958c802e830ad5d37ef6b1d3 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 28 Apr 2021 15:23:14 +0200 Subject: [PATCH 0787/1379] net/sched: sch_frag: fix stack OOB read while fragmenting IPv4 packets when 'act_mirred' tries to fragment IPv4 packets that had been previously re-assembled using 'act_ct', splats like the following can be observed on kernels built with KASAN: BUG: KASAN: stack-out-of-bounds in ip_do_fragment+0x1b03/0x1f60 Read of size 1 at addr ffff888147009574 by task ping/947 CPU: 0 PID: 947 Comm: ping Not tainted 5.12.0-rc6+ #418 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 Call Trace: dump_stack+0x92/0xc1 print_address_description.constprop.7+0x1a/0x150 kasan_report.cold.13+0x7f/0x111 ip_do_fragment+0x1b03/0x1f60 sch_fragment+0x4bf/0xe40 tcf_mirred_act+0xc3d/0x11a0 [act_mirred] tcf_action_exec+0x104/0x3e0 fl_classify+0x49a/0x5e0 [cls_flower] tcf_classify_ingress+0x18a/0x820 __netif_receive_skb_core+0xae7/0x3340 __netif_receive_skb_one_core+0xb6/0x1b0 process_backlog+0x1ef/0x6c0 __napi_poll+0xaa/0x500 net_rx_action+0x702/0xac0 __do_softirq+0x1e4/0x97f do_softirq+0x71/0x90 __local_bh_enable_ip+0xdb/0xf0 ip_finish_output2+0x760/0x2120 ip_do_fragment+0x15a5/0x1f60 __ip_finish_output+0x4c2/0xea0 ip_output+0x1ca/0x4d0 ip_send_skb+0x37/0xa0 raw_sendmsg+0x1c4b/0x2d00 sock_sendmsg+0xdb/0x110 __sys_sendto+0x1d7/0x2b0 __x64_sys_sendto+0xdd/0x1b0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f82e13853eb Code: 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 f3 0f 1e fa 48 8d 05 75 42 2c 00 41 89 ca 8b 00 85 c0 75 14 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 75 c3 0f 1f 40 00 41 57 4d 89 c7 41 56 41 89 RSP: 002b:00007ffe01fad888 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00005571aac13700 RCX: 00007f82e13853eb RDX: 0000000000002330 RSI: 00005571aac13700 RDI: 0000000000000003 RBP: 0000000000002330 R08: 00005571aac10500 R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe01faefb0 R13: 00007ffe01fad890 R14: 00007ffe01fad980 R15: 00005571aac0f0a0 The buggy address belongs to the page: page:000000001dff2e03 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x147009 flags: 0x17ffffc0001000(reserved) raw: 0017ffffc0001000 ffffea00051c0248 ffffea00051c0248 0000000000000000 raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888147009400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff888147009480: f1 f1 f1 f1 04 f2 f2 f2 f2 f2 f2 f2 00 00 00 00 >ffff888147009500: 00 00 00 00 00 00 00 00 00 00 f2 f2 f2 f2 f2 f2 ^ ffff888147009580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffff888147009600: 00 00 00 00 00 00 00 00 00 00 00 00 00 f2 f2 f2 for IPv4 packets, sch_fragment() uses a temporary struct dst_entry. Then, in the following call graph: ip_do_fragment() ip_skb_dst_mtu() ip_dst_mtu_maybe_forward() ip_mtu_locked() the pointer to struct dst_entry is used as pointer to struct rtable: this turns the access to struct members like rt_mtu_locked into an OOB read in the stack. Fix this changing the temporary variable used for IPv4 packets in sch_fragment(), similarly to what is done for IPv6 few lines below. Fixes: c129412f74e9 ("net/sched: sch_frag: add generic packet fragment support.") Cc: # 5.11 Reported-by: Shuang Li Acked-by: Marcelo Ricardo Leitner Acked-by: Cong Wang Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/sch_frag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c index e1e77d3fb6c0..8c06381391d6 100644 --- a/net/sched/sch_frag.c +++ b/net/sched/sch_frag.c @@ -90,16 +90,16 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, } if (skb_protocol(skb, true) == htons(ETH_P_IP)) { - struct dst_entry sch_frag_dst; + struct rtable sch_frag_rt = { 0 }; unsigned long orig_dst; sch_frag_prepare_frag(skb, xmit); - dst_init(&sch_frag_dst, &sch_frag_dst_ops, NULL, 1, + dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOCOUNT); - sch_frag_dst.dev = skb->dev; + sch_frag_rt.dst.dev = skb->dev; orig_dst = skb->_skb_refdst; - skb_dst_set_noref(skb, &sch_frag_dst); + skb_dst_set_noref(skb, &sch_frag_rt.dst); IPCB(skb)->frag_max_size = mru; ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit); From 59259ff7a81b9eb6213891c6451221e567f8f22f Mon Sep 17 00:00:00 2001 From: Zhang Zhengming Date: Wed, 28 Apr 2021 22:38:14 +0800 Subject: [PATCH 0788/1379] bridge: Fix possible races between assigning rx_handler_data and setting IFF_BRIDGE_PORT bit There is a crash in the function br_get_link_af_size_filtered, as the port_exists(dev) is true and the rx_handler_data of dev is NULL. But the rx_handler_data of dev is correct saved in vmcore. The oops looks something like: ... pc : br_get_link_af_size_filtered+0x28/0x1c8 [bridge] ... Call trace: br_get_link_af_size_filtered+0x28/0x1c8 [bridge] if_nlmsg_size+0x180/0x1b0 rtnl_calcit.isra.12+0xf8/0x148 rtnetlink_rcv_msg+0x334/0x370 netlink_rcv_skb+0x64/0x130 rtnetlink_rcv+0x28/0x38 netlink_unicast+0x1f0/0x250 netlink_sendmsg+0x310/0x378 sock_sendmsg+0x4c/0x70 __sys_sendto+0x120/0x150 __arm64_sys_sendto+0x30/0x40 el0_svc_common+0x78/0x130 el0_svc_handler+0x38/0x78 el0_svc+0x8/0xc In br_add_if(), we found there is no guarantee that assigning rx_handler_data to dev->rx_handler_data will before setting the IFF_BRIDGE_PORT bit of priv_flags. So there is a possible data competition: CPU 0: CPU 1: (RCU read lock) (RTNL lock) rtnl_calcit() br_add_slave() if_nlmsg_size() br_add_if() br_get_link_af_size_filtered() -> netdev_rx_handler_register ... // The order is not guaranteed ... -> dev->priv_flags |= IFF_BRIDGE_PORT; // The IFF_BRIDGE_PORT bit of priv_flags has been set -> if (br_port_exists(dev)) { // The dev->rx_handler_data has NOT been assigned -> p = br_port_get_rcu(dev); .... -> rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); ... Fix it in br_get_link_af_size_filtered, using br_port_get_check_rcu() and checking the return value. Signed-off-by: Zhang Zhengming Reviewed-by: Zhao Lei Reviewed-by: Wang Xiaogang Suggested-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 0456593aceec..e4e6e991313e 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -103,8 +103,9 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, rcu_read_lock(); if (netif_is_bridge_port(dev)) { - p = br_port_get_rcu(dev); - vg = nbp_vlan_group_rcu(p); + p = br_port_get_check_rcu(dev); + if (p) + vg = nbp_vlan_group_rcu(p); } else if (dev->priv_flags & IFF_EBRIDGE) { br = netdev_priv(dev); vg = br_vlan_group_rcu(br); From 1a70f6597d5f8abf6cea8e2df213740a18746194 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 29 Apr 2021 09:32:36 +0800 Subject: [PATCH 0789/1379] net: Remove redundant assignment to err Variable 'err' is set to -ENOMEM but this value is never read as it is overwritten with a new value later on, hence the 'If statements' and assignments are redundantand and can be removed. Cleans up the following clang-analyzer warning: net/ipv6/seg6.c:126:4: warning: Value stored to 'err' is never read [clang-analyzer-deadcode.DeadStores] Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: David S. Miller --- net/ipv6/seg6.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index d2f8138e5a73..e412817fba2f 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -122,9 +122,6 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) hinfo = seg6_hmac_info_lookup(net, hmackeyid); if (!slen) { - if (!hinfo) - err = -ENOENT; - err = seg6_hmac_info_del(net, hmackeyid); goto out_unlock; From 2867298dd49ee84214b8721521dc7a5a6382520c Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Thu, 29 Apr 2021 16:34:50 +0800 Subject: [PATCH 0790/1379] net: hns3: fix incorrect configuration for igu_egu_hw_err According to the UM, the type and enable status of igu_egu_hw_err should be configured separately. Currently, the type field is incorrect when disable this error. So fix it by configuring these two fields separately. Fixes: bf1faf9415dd ("net: hns3: Add enable and process hw errors from IGU, EGU and NCSI") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c | 3 ++- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c index d25291916b31..8223d699cd94 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c @@ -753,8 +753,9 @@ static int hclge_config_igu_egu_hw_err_int(struct hclge_dev *hdev, bool en) /* configure IGU,EGU error interrupts */ hclge_cmd_setup_basic_desc(&desc, HCLGE_IGU_COMMON_INT_EN, false); + desc.data[0] = cpu_to_le32(HCLGE_IGU_ERR_INT_TYPE); if (en) - desc.data[0] = cpu_to_le32(HCLGE_IGU_ERR_INT_EN); + desc.data[0] |= cpu_to_le32(HCLGE_IGU_ERR_INT_EN); desc.data[1] = cpu_to_le32(HCLGE_IGU_ERR_INT_EN_MASK); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h index 608fe26fc3fe..d647f3c84134 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h @@ -32,7 +32,8 @@ #define HCLGE_TQP_ECC_ERR_INT_EN_MASK 0x0FFF #define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN_MASK 0x0F000000 #define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN 0x0F000000 -#define HCLGE_IGU_ERR_INT_EN 0x0000066F +#define HCLGE_IGU_ERR_INT_EN 0x0000000F +#define HCLGE_IGU_ERR_INT_TYPE 0x00000660 #define HCLGE_IGU_ERR_INT_EN_MASK 0x000F #define HCLGE_IGU_TNL_ERR_INT_EN 0x0002AABF #define HCLGE_IGU_TNL_ERR_INT_EN_MASK 0x003F From 568a54bdf70b143f3e0befa298e22ad469ffc732 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Thu, 29 Apr 2021 16:34:51 +0800 Subject: [PATCH 0791/1379] net: hns3: initialize the message content in hclge_get_link_mode() The message sent to VF should be initialized, otherwise random value of some contents may cause improper processing by the target. So add a initialization to message in hclge_get_link_mode(). Fixes: 9194d18b0577 ("net: hns3: fix the problem that the supported port is empty") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c index 5512ffe0a149..8e5f9dc8791d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c @@ -533,7 +533,7 @@ static void hclge_get_link_mode(struct hclge_vport *vport, unsigned long advertising; unsigned long supported; unsigned long send_data; - u8 msg_data[10]; + u8 msg_data[10] = {}; u8 dest_vfid; advertising = hdev->hw.mac.advertising[0]; From b4047aac4ec1066bab6c71950623746d7bcf7154 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Thu, 29 Apr 2021 16:34:52 +0800 Subject: [PATCH 0792/1379] net: hns3: add check for HNS3_NIC_STATE_INITED in hns3_reset_notify_up_enet() In some cases, the device is not initialized because reset failed. If another task calls hns3_reset_notify_up_enet() before reset retry, it will cause an error since uninitialized pointer access. So add check for HNS3_NIC_STATE_INITED before calling hns3_nic_net_open() in hns3_reset_notify_up_enet(). Fixes: bb6b94a896d4 ("net: hns3: Add reset interface implementation in client") Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index c21dd11baed9..5dc6de5107fb 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4616,6 +4616,11 @@ static int hns3_reset_notify_up_enet(struct hnae3_handle *handle) struct hns3_nic_priv *priv = netdev_priv(kinfo->netdev); int ret = 0; + if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state)) { + netdev_err(kinfo->netdev, "device is not initialized yet\n"); + return -EFAULT; + } + clear_bit(HNS3_NIC_STATE_RESETTING, &priv->state); if (netif_running(kinfo->netdev)) { From 1c7600b7cfc6154f2fd361a74b1d4f25b8f02e48 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 29 Apr 2021 11:05:20 +0200 Subject: [PATCH 0793/1379] MAINTAINERS: remove Wingman Kwok His email bounces with permanent error "550 Invalid recipient". His last email on the LKML was from 2015-10-22 on the LKML. Signed-off-by: Michael Walle Signed-off-by: David S. Miller --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4796ccf9f871..784e05bc0cb9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18216,7 +18216,6 @@ F: sound/soc/codecs/isabelle* F: sound/soc/codecs/lm49453* TI NETCP ETHERNET DRIVER -M: Wingman Kwok M: Murali Karicheri L: netdev@vger.kernel.org S: Maintained From 57e1d8206e48ef78e1b25823fc131ebe60c76b61 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Thu, 29 Apr 2021 11:05:21 +0200 Subject: [PATCH 0794/1379] MAINTAINERS: move Murali Karicheri to credits His email bounces with permanent error "550 Invalid recipient". His last email was from 2020-09-09 on the LKML and he seems to have left TI. Signed-off-by: Michael Walle Signed-off-by: David S. Miller --- CREDITS | 5 +++++ MAINTAINERS | 13 ------------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/CREDITS b/CREDITS index b06760f09c66..7ef7b136e71d 100644 --- a/CREDITS +++ b/CREDITS @@ -1874,6 +1874,11 @@ S: Krosenska' 543 S: 181 00 Praha 8 S: Czech Republic +N: Murali Karicheri +E: m-karicheri2@ti.com +D: Keystone NetCP driver +D: Keystone PCIe host controller driver + N: Jan "Yenya" Kasprzak E: kas@fi.muni.cz D: Author of the COSA/SRP sync serial board driver. diff --git a/MAINTAINERS b/MAINTAINERS index 784e05bc0cb9..86c11ad3c502 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14010,13 +14010,6 @@ F: Documentation/devicetree/bindings/pci/ti-pci.txt F: drivers/pci/controller/cadence/pci-j721e.c F: drivers/pci/controller/dwc/pci-dra7xx.c -PCI DRIVER FOR TI KEYSTONE -M: Murali Karicheri -L: linux-pci@vger.kernel.org -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -S: Maintained -F: drivers/pci/controller/dwc/pci-keystone.c - PCI DRIVER FOR V3 SEMICONDUCTOR V360EPC M: Linus Walleij L: linux-pci@vger.kernel.org @@ -18215,12 +18208,6 @@ S: Maintained F: sound/soc/codecs/isabelle* F: sound/soc/codecs/lm49453* -TI NETCP ETHERNET DRIVER -M: Murali Karicheri -L: netdev@vger.kernel.org -S: Maintained -F: drivers/net/ethernet/ti/netcp* - TI PCM3060 ASoC CODEC DRIVER M: Kirill Marinushkin L: alsa-devel@alsa-project.org (moderated for non-subscribers) From bbf6acea6ecf7d6a2c5ce9a399b9b16404392b89 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 29 Apr 2021 18:25:46 +0800 Subject: [PATCH 0795/1379] net: macb: Remove redundant assignment to queue Variable queue is set to bp->queues but these values is not used as it is overwritten later on, hence redundant assignment can be removed. Cleans up the following clang-analyzer warning: drivers/net/ethernet/cadence/macb_main.c:4919:21: warning: Value stored to 'queue' during its initialization is never read [clang-analyzer-deadcode.DeadStores]. drivers/net/ethernet/cadence/macb_main.c:4832:21: warning: Value stored to 'queue' during its initialization is never read [clang-analyzer-deadcode.DeadStores]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Nicolas Ferre Signed-off-by: David S. Miller --- drivers/net/ethernet/cadence/macb_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index 0e94db9cd45d..6bc7d41d519b 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -4852,7 +4852,7 @@ static int __maybe_unused macb_suspend(struct device *dev) { struct net_device *netdev = dev_get_drvdata(dev); struct macb *bp = netdev_priv(netdev); - struct macb_queue *queue = bp->queues; + struct macb_queue *queue; unsigned long flags; unsigned int q; int err; @@ -4939,7 +4939,7 @@ static int __maybe_unused macb_resume(struct device *dev) { struct net_device *netdev = dev_get_drvdata(dev); struct macb *bp = netdev_priv(netdev); - struct macb_queue *queue = bp->queues; + struct macb_queue *queue; unsigned long flags; unsigned int q; int err; From 8343b1f8b97ac016150c8303f95b63b20b98edf8 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 29 Apr 2021 18:38:25 +0800 Subject: [PATCH 0796/1379] bnx2x: Remove redundant assignment to err Variable 'err' is set to -EIO but this value is never read as it is overwritten with a new value later on, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c:1195:2: warning: Value stored to 'err' is never read [clang-analyzer-deadcode.DeadStores] Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 9c2f51f23035..d21f085044cd 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -1192,7 +1192,6 @@ int bnx2x_iov_init_one(struct bnx2x *bp, int int_mode_param, return 0; } - err = -EIO; /* verify ari is enabled */ if (!pci_ari_enabled(bp->pdev->bus)) { BNX2X_ERR("ARI not supported (check pci bridge ARI forwarding), SRIOV can not be enabled\n"); From d27f0201b93cb1016c232c46e0b8e1bf4c02a7ea Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 29 Apr 2021 13:08:31 +0200 Subject: [PATCH 0797/1379] net: dsa: ksz: ksz8863_smi_probe: fix possible NULL pointer dereference Fix possible NULL pointer dereference in case devm_kzalloc() failed to allocate memory. Fixes: 60a364760002 ("net: dsa: microchip: Add Microchip KSZ8863 SMI based driver support") Reported-by: Colin Ian King Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz8863_smi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz8863_smi.c b/drivers/net/dsa/microchip/ksz8863_smi.c index 30d97ea7a949..9fb38e99001a 100644 --- a/drivers/net/dsa/microchip/ksz8863_smi.c +++ b/drivers/net/dsa/microchip/ksz8863_smi.c @@ -147,6 +147,9 @@ static int ksz8863_smi_probe(struct mdio_device *mdiodev) int i; ksz8 = devm_kzalloc(&mdiodev->dev, sizeof(struct ksz8), GFP_KERNEL); + if (!ksz8) + return -ENOMEM; + ksz8->priv = mdiodev; dev = ksz_switch_alloc(&mdiodev->dev, ksz8); From ba46b576a7954fa54ff4c1ef976624794b6668f6 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 29 Apr 2021 13:08:32 +0200 Subject: [PATCH 0798/1379] net: dsa: ksz: ksz8795_spi_probe: fix possible NULL pointer dereference Fix possible NULL pointer dereference in case devm_kzalloc() failed to allocate memory Fixes: cc13e52c3a89 ("net: dsa: microchip: Add Microchip KSZ8863 SPI based driver support") Reported-by: Colin Ian King Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz8795_spi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz8795_spi.c b/drivers/net/dsa/microchip/ksz8795_spi.c index 85ba12aa82d8..ea7550d1b634 100644 --- a/drivers/net/dsa/microchip/ksz8795_spi.c +++ b/drivers/net/dsa/microchip/ksz8795_spi.c @@ -41,6 +41,9 @@ static int ksz8795_spi_probe(struct spi_device *spi) int i, ret = 0; ksz8 = devm_kzalloc(&spi->dev, sizeof(struct ksz8), GFP_KERNEL); + if (!ksz8) + return -ENOMEM; + ksz8->priv = spi; dev = ksz_switch_alloc(&spi->dev, ksz8); From d4eecfb28b963493a8701f271789ff04e92ae205 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 29 Apr 2021 13:08:33 +0200 Subject: [PATCH 0799/1379] net: dsa: ksz: ksz8863_smi_probe: set proper return value for ksz_switch_alloc() ksz_switch_alloc() will return NULL only if allocation is failed. So, the proper return value is -ENOMEM. Fixes: 60a364760002 ("net: dsa: microchip: Add Microchip KSZ8863 SMI based driver support") Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz8863_smi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz8863_smi.c b/drivers/net/dsa/microchip/ksz8863_smi.c index 9fb38e99001a..11293485138c 100644 --- a/drivers/net/dsa/microchip/ksz8863_smi.c +++ b/drivers/net/dsa/microchip/ksz8863_smi.c @@ -154,7 +154,7 @@ static int ksz8863_smi_probe(struct mdio_device *mdiodev) dev = ksz_switch_alloc(&mdiodev->dev, ksz8); if (!dev) - return -EINVAL; + return -ENOMEM; for (i = 0; i < ARRAY_SIZE(ksz8863_regmap_config); i++) { rc = ksz8863_regmap_config[i]; From 7b1ae248279bea33af9e797a93c35f49601cb8a0 Mon Sep 17 00:00:00 2001 From: Shuo Chen Date: Wed, 14 Apr 2021 14:24:00 -0700 Subject: [PATCH 0800/1379] dyndbg: fix parsing file query without a line-range suffix Query like 'file tcp_input.c line 1234 +p' was broken by commit aaebe329bff0 ("dyndbg: accept 'file foo.c:func1' and 'file foo.c:10-100'") because a file name without a ':' now makes the loop in ddebug_parse_query() exits early before parsing the 'line 1234' part. As a result, all pr_debug() in tcp_input.c will be enabled, instead of only the one on line 1234. Changing 'break' to 'continue' fixes this. Fixes: aaebe329bff0 ("dyndbg: accept 'file foo.c:func1' and 'file foo.c:10-100'") Cc: stable Reviewed-by: Eric Dumazet Signed-off-by: Shuo Chen Acked-by: Jason Baron Link: https://lore.kernel.org/r/20210414212400.2927281-1-giantchen@gmail.com Signed-off-by: Greg Kroah-Hartman --- lib/dynamic_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index c70d6347afa2..921d0a654243 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -396,7 +396,7 @@ static int ddebug_parse_query(char *words[], int nwords, /* tail :$info is function or line-range */ fline = strchr(query->filename, ':'); if (!fline) - break; + continue; *fline++ = '\0'; if (isalpha(*fline) || *fline == '*' || *fline == '?') { /* take as function name */ From bb6659cc0ad3c2afc3801b708b19c4c67e55ddf2 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 30 Apr 2021 16:25:15 +0800 Subject: [PATCH 0801/1379] io_uring: Fix memory leak in io_sqe_buffers_register() unreferenced object 0xffff8881123bf0a0 (size 32): comm "syz-executor557", pid 8384, jiffies 4294946143 (age 12.360s) backtrace: [] kmalloc_node include/linux/slab.h:579 [inline] [] kvmalloc_node+0x61/0xf0 mm/util.c:587 [] kvmalloc include/linux/mm.h:795 [inline] [] kvmalloc_array include/linux/mm.h:813 [inline] [] kvcalloc include/linux/mm.h:818 [inline] [] io_rsrc_data_alloc+0x4f/0xc0 fs/io_uring.c:7164 [] io_sqe_buffers_register+0x98/0x3d0 fs/io_uring.c:8383 [] __io_uring_register+0xf67/0x18c0 fs/io_uring.c:9986 [] __do_sys_io_uring_register fs/io_uring.c:10091 [inline] [] __se_sys_io_uring_register fs/io_uring.c:10071 [inline] [] __x64_sys_io_uring_register+0x112/0x230 fs/io_uring.c:10071 [] do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 [] entry_SYSCALL_64_after_hwframe+0x44/0xae Fix data->tags memory leak, through io_rsrc_data_free() to release data memory space. Reported-by: syzbot+0f32d05d8b6cd8d7ea3e@syzkaller.appspotmail.com Signed-off-by: Zqiang Link: https://lore.kernel.org/r/20210430082515.13886-1-qiang.zhang@windriver.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a880edb90d0c..7a2e83bc005d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8140,7 +8140,7 @@ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) for (i = 0; i < ctx->nr_user_bufs; i++) io_buffer_unmap(ctx, &ctx->user_bufs[i]); kfree(ctx->user_bufs); - kfree(ctx->buf_data); + io_rsrc_data_free(ctx->buf_data); ctx->user_bufs = NULL; ctx->buf_data = NULL; ctx->nr_user_bufs = 0; @@ -8400,7 +8400,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return -ENOMEM; ret = io_buffers_map_alloc(ctx, nr_args); if (ret) { - kfree(data); + io_rsrc_data_free(data); return ret; } From 5db91e9cb5b3f645a9540d2ab67a19e464d89754 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Apr 2021 15:32:22 +0200 Subject: [PATCH 0802/1379] Revert "ACPI: scan: Turn off unused power resources during initialization" Revert commit 4b9ee772eaa8 ("ACPI: scan: Turn off unused power resources during initialization") that is reported to cause initialization issues to occur. Reported-by: Shujun Wang Signed-off-by: Rafael J. Wysocki --- drivers/acpi/internal.h | 1 - drivers/acpi/power.c | 2 +- drivers/acpi/scan.c | 2 -- drivers/acpi/sleep.h | 1 + 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 9fcefcdc1dbe..e6a5d997241c 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -139,7 +139,6 @@ int acpi_device_sleep_wake(struct acpi_device *dev, int acpi_power_get_inferred_state(struct acpi_device *device, int *state); int acpi_power_on_resources(struct acpi_device *device, int state); int acpi_power_transition(struct acpi_device *device, int state); -void acpi_turn_off_unused_power_resources(void); /* -------------------------------------------------------------------------- Device Power Management diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index bacae6d178ff..7e69931be828 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -996,7 +996,6 @@ void acpi_resume_power_resources(void) mutex_unlock(&power_resource_list_lock); } -#endif void acpi_turn_off_unused_power_resources(void) { @@ -1017,3 +1016,4 @@ void acpi_turn_off_unused_power_resources(void) mutex_unlock(&power_resource_list_lock); } +#endif diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 1584c9e463bd..a184529d8fa4 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2360,8 +2360,6 @@ int __init acpi_scan_init(void) } } - acpi_turn_off_unused_power_resources(); - acpi_scan_initialized = true; out: diff --git a/drivers/acpi/sleep.h b/drivers/acpi/sleep.h index 7fe41ee489d6..1856f76ac83f 100644 --- a/drivers/acpi/sleep.h +++ b/drivers/acpi/sleep.h @@ -8,6 +8,7 @@ extern struct list_head acpi_wakeup_device_list; extern struct mutex acpi_device_lock; extern void acpi_resume_power_resources(void); +extern void acpi_turn_off_unused_power_resources(void); static inline acpi_status acpi_set_waking_vector(u32 wakeup_address) { From a7f82c3641245055412b2b4f859ae55fd29fdffe Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 9 Apr 2021 14:08:50 +0200 Subject: [PATCH 0803/1379] s390/pci: rename zpci_configure_device() With zpci_configure_device() now always called on a device that has already been configured on the platform level its name has become misleading. Rename it to zpci_scan_configured_device() to signify that the function now only handles the correct scanning of a newly configured PCI function taking care of the special handling necessary for function 0 and functions parked waiting for a PCI bus that can't be created without first seeing function 0. Reviewed-by: Matthew Rosato Reviewed-by: Pierre Morel Signed-off-by: Niklas Schnelle Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pci.h | 2 +- arch/s390/pci/pci.c | 10 ++++++---- arch/s390/pci/pci_event.c | 2 +- drivers/pci/hotplug/s390_pci_hpc.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 35c2af9371a9..10b67f8aab99 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -204,7 +204,7 @@ extern unsigned int s390_pci_no_rid; struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state); int zpci_enable_device(struct zpci_dev *); int zpci_disable_device(struct zpci_dev *); -int zpci_configure_device(struct zpci_dev *zdev, u32 fh); +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh); int zpci_deconfigure_device(struct zpci_dev *zdev); int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index c01b6dbac7cf..b0993e05affe 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -738,17 +738,19 @@ error: } /** - * zpci_configure_device() - Configure a zpci_dev + * zpci_scan_configured_device() - Scan a freshly configured zpci_dev * @zdev: The zpci_dev to be configured * @fh: The general function handle supplied by the platform * * Given a device in the configuration state Configured, enables, scans and - * adds it to the common code PCI subsystem. If any failure occurs, the - * zpci_dev is left disabled. + * adds it to the common code PCI subsystem if possible. If the PCI device is + * parked because we can not yet create a PCI bus because we have not seen + * function 0, it is ignored but will be scanned once function 0 appears. + * If any failure occurs, the zpci_dev is left disabled. * * Return: 0 on success, or an error code otherwise */ -int zpci_configure_device(struct zpci_dev *zdev, u32 fh) +int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh) { int rc; diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 1178b48a66df..8ecc256d27a5 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -113,7 +113,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; zdev->state = ZPCI_FN_STATE_CONFIGURED; } - zpci_configure_device(zdev, ccdf->fh); + zpci_scan_configured_device(zdev, ccdf->fh); break; case 0x0302: /* Reserved -> Standby */ if (!zdev) diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c index f8f056be71b7..014868752cd4 100644 --- a/drivers/pci/hotplug/s390_pci_hpc.c +++ b/drivers/pci/hotplug/s390_pci_hpc.c @@ -35,7 +35,7 @@ static int enable_slot(struct hotplug_slot *hotplug_slot) return rc; zdev->state = ZPCI_FN_STATE_CONFIGURED; - return zpci_configure_device(zdev, zdev->fh); + return zpci_scan_configured_device(zdev, zdev->fh); } static int disable_slot(struct hotplug_slot *hotplug_slot) From 0d9cf5d8c5d0bfa144236b5f2aeff02124940c56 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Fri, 9 Apr 2021 13:51:46 +0200 Subject: [PATCH 0804/1379] s390/pci: handle stale deconfiguration events The PCIs event with PEC 0x0303 or 0x0304 are a request to deconfigure a PCI function, respectively an indication that it was already deconfigured by the platform. If such an event is queued during boot it may happen that the platform has already adjusted the configuration flag of the relevant function in the CLP List PCI Functions result. In this case we might not have configured the PCI function at all and should thus ignore the event. Note that no locking is necessary as event handling only starts after we have fully initialized the zPCI subsystem and scanned all PCI devices listed in the CLP result. Reviewed-by: Matthew Rosato Acked-by: Pierre Morel Signed-off-by: Niklas Schnelle Signed-off-by: Heiko Carstens --- arch/s390/pci/pci_event.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 8ecc256d27a5..cd447b96b4b1 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -76,8 +76,6 @@ void zpci_event_error(void *data) static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) { - enum zpci_state state; - zdev->fh = fh; /* Give the driver a hint that the function is * already unusable. @@ -88,15 +86,12 @@ static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) */ zpci_disable_device(zdev); zdev->state = ZPCI_FN_STATE_STANDBY; - if (!clp_get_state(zdev->fid, &state) && - state == ZPCI_FN_STATE_RESERVED) { - zpci_zdev_put(zdev); - } } static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); + enum zpci_state state; zpci_err("avail CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); @@ -123,13 +118,28 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; case 0x0303: /* Deconfiguration requested */ if (zdev) { + /* The event may have been queued before we confirgured + * the device. + */ + if (zdev->state != ZPCI_FN_STATE_CONFIGURED) + break; zdev->fh = ccdf->fh; zpci_deconfigure_device(zdev); } break; case 0x0304: /* Configured -> Standby|Reserved */ - if (zdev) - zpci_event_hard_deconfigured(zdev, ccdf->fh); + if (zdev) { + /* The event may have been queued before we confirgured + * the device.: + */ + if (zdev->state == ZPCI_FN_STATE_CONFIGURED) + zpci_event_hard_deconfigured(zdev, ccdf->fh); + /* The 0x0304 event may immediately reserve the device */ + if (!clp_get_state(zdev->fid, &state) && + state == ZPCI_FN_STATE_RESERVED) { + zpci_zdev_put(zdev); + } + } break; case 0x0306: /* 0x308 or 0x302 for multiple devices */ zpci_remove_reserved_devices(); From f53a63667b0b30944462ca0ecb9f9dd5b02a56f1 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Mon, 26 Apr 2021 15:56:47 +0200 Subject: [PATCH 0805/1379] s390/configs: enable CONFIG_PCI_IOV All major distributions ship with CONFIG_PCI_IOV=y so let us enable it for our defconfigs as well. Note also that since commit e5794cf1a270 ("s390/pci: create links between PFs and VFs") we enabled proper linking between PFs and their associated VFs so with this commit and its fixes applied we can fully support handling SR-IOV enabled PFs. Signed-off-by: Niklas Schnelle Signed-off-by: Heiko Carstens --- arch/s390/configs/debug_defconfig | 1 + arch/s390/configs/defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6422618a4f75..4eff0a3810ca 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -387,6 +387,7 @@ CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_PCI=y +CONFIG_PCI_IOV=y # CONFIG_PCIEASPM is not set CONFIG_PCI_DEBUG=y CONFIG_HOTPLUG_PCI=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 371a529546aa..8e8bc93fe047 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -377,6 +377,7 @@ CONFIG_CGROUP_NET_PRIO=y CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_PCI=y +CONFIG_PCI_IOV=y # CONFIG_PCIEASPM is not set CONFIG_HOTPLUG_PCI=y CONFIG_HOTPLUG_PCI_S390=y From 0cceeab5a38d70fae3c2944e77e1d262c74d159b Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 26 Apr 2021 14:33:53 +0200 Subject: [PATCH 0806/1379] s390/cpumf: beautify if-then-else indentation Beautify if-then-else indentation to match coding guideline. Also use shorter pointer notation hwc instead of event->hw. Signed-off-by: Thomas Richter Acked-by : Sumanth Korikkar Signed-off-by: Heiko Carstens --- arch/s390/kernel/perf_cpum_cf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index b3beef64d3d4..931e9d9b6524 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -230,9 +230,7 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) /* No support for kernel space counters only */ } else if (!attr->exclude_kernel && attr->exclude_user) { return -EOPNOTSUPP; - - /* Count user and kernel space */ - } else { + } else { /* Count user and kernel space */ if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) return -EOPNOTSUPP; ev = cpumf_generic_events_basic[ev]; @@ -402,12 +400,12 @@ static void cpumf_pmu_stop(struct perf_event *event, int flags) */ if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base])) ctr_set_stop(&cpuhw->state, hwc->config_base); - event->hw.state |= PERF_HES_STOPPED; + hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { hw_perf_event_update(event); - event->hw.state |= PERF_HES_UPTODATE; + hwc->state |= PERF_HES_UPTODATE; } } From 1eefa4f4399b74dc7671c4e34c1b1c6244acff22 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 26 Apr 2021 14:38:25 +0200 Subject: [PATCH 0807/1379] s390/cpumf: move counter set size calculation to common place The function to calculate the size of counter sets is renamed from cf_diag_ctrset_size() to cpum_cf_ctrset_size() and moved to the file containing common functions for the CPU Measurement Counter Facility. No functional change. Signed-off-by: Thomas Richter Acked-by : Sumanth Korikkar Signed-off-by: Heiko Carstens --- arch/s390/include/asm/cpu_mcf.h | 2 + arch/s390/kernel/perf_cpum_cf_common.c | 46 ++++++++++++++++++++++ arch/s390/kernel/perf_cpum_cf_diag.c | 54 ++------------------------ 3 files changed, 52 insertions(+), 50 deletions(-) diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h index 649b9fc60685..3e4cbcb7c4cc 100644 --- a/arch/s390/include/asm/cpu_mcf.h +++ b/arch/s390/include/asm/cpu_mcf.h @@ -123,4 +123,6 @@ static inline int stccm_avail(void) return test_facility(142); } +size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, + struct cpumf_ctr_info *info); #endif /* _ASM_S390_CPU_MCF_H */ diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c index 3bced89caffb..6d53215c8484 100644 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ b/arch/s390/kernel/perf_cpum_cf_common.c @@ -170,6 +170,52 @@ static int cpum_cf_offline_cpu(unsigned int cpu) return cpum_cf_setup(cpu, PMC_RELEASE); } +/* Return the maximum possible counter set size (in number of 8 byte counters) + * depending on type and model number. + */ +size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, + struct cpumf_ctr_info *info) +{ + size_t ctrset_size = 0; + + switch (ctrset) { + case CPUMF_CTR_SET_BASIC: + if (info->cfvn >= 1) + ctrset_size = 6; + break; + case CPUMF_CTR_SET_USER: + if (info->cfvn == 1) + ctrset_size = 6; + else if (info->cfvn >= 3) + ctrset_size = 2; + break; + case CPUMF_CTR_SET_CRYPTO: + if (info->csvn >= 1 && info->csvn <= 5) + ctrset_size = 16; + else if (info->csvn == 6) + ctrset_size = 20; + break; + case CPUMF_CTR_SET_EXT: + if (info->csvn == 1) + ctrset_size = 32; + else if (info->csvn == 2) + ctrset_size = 48; + else if (info->csvn >= 3 && info->csvn <= 5) + ctrset_size = 128; + else if (info->csvn == 6) + ctrset_size = 160; + break; + case CPUMF_CTR_SET_MT_DIAG: + if (info->csvn > 3) + ctrset_size = 48; + break; + case CPUMF_CTR_SET_MAX: + break; + } + + return ctrset_size; +} + static int __init cpum_cf_init(void) { int rc; diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index 2e3e7edbe3a0..08c985c1097c 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -316,52 +316,6 @@ static void cf_diag_read(struct perf_event *event) debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event); } -/* Return the maximum possible counter set size (in number of 8 byte counters) - * depending on type and model number. - */ -static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset, - struct cpumf_ctr_info *info) -{ - size_t ctrset_size = 0; - - switch (ctrset) { - case CPUMF_CTR_SET_BASIC: - if (info->cfvn >= 1) - ctrset_size = 6; - break; - case CPUMF_CTR_SET_USER: - if (info->cfvn == 1) - ctrset_size = 6; - else if (info->cfvn >= 3) - ctrset_size = 2; - break; - case CPUMF_CTR_SET_CRYPTO: - if (info->csvn >= 1 && info->csvn <= 5) - ctrset_size = 16; - else if (info->csvn == 6) - ctrset_size = 20; - break; - case CPUMF_CTR_SET_EXT: - if (info->csvn == 1) - ctrset_size = 32; - else if (info->csvn == 2) - ctrset_size = 48; - else if (info->csvn >= 3 && info->csvn <= 5) - ctrset_size = 128; - else if (info->csvn == 6) - ctrset_size = 160; - break; - case CPUMF_CTR_SET_MT_DIAG: - if (info->csvn > 3) - ctrset_size = 48; - break; - case CPUMF_CTR_SET_MAX: - break; - } - - return ctrset_size; -} - /* Calculate memory needed to store all counter sets together with header and * trailer data. This is independend of the counter set authorization which * can vary depending on the configuration. @@ -372,7 +326,7 @@ static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info) enum cpumf_ctr_set i; for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - size_t size = cf_diag_ctrset_size(i, info); + size_t size = cpum_cf_ctrset_size(i, info); if (size) max_size += size * sizeof(u64) + @@ -405,7 +359,7 @@ static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, ctrdata->def = CF_DIAG_CTRSET_DEF; ctrdata->set = ctrset; ctrdata->res1 = 0; - ctrset_size = cf_diag_ctrset_size(ctrset, &cpuhw->info); + ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); if (ctrset_size) { /* Save data */ need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); @@ -845,7 +799,7 @@ static void cf_diag_cpu_read(void *parm) if (!(p->sets & cpumf_ctr_ctl[set])) continue; /* Counter set not in list */ - set_size = cf_diag_ctrset_size(set, &cpuhw->info); + set_size = cpum_cf_ctrset_size(set, &cpuhw->info); space = sizeof(csd->data) - csd->used; space = cf_diag_cpuset_read(sp, set, set_size, space); if (space) { @@ -975,7 +929,7 @@ static size_t cf_diag_needspace(unsigned int sets) for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { if (!(sets & cpumf_ctr_ctl[i])) continue; - bytes += cf_diag_ctrset_size(i, &cpuhw->info) * sizeof(u64) + + bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + sizeof(((struct s390_ctrset_setdata *)0)->set) + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); } From b0583ab47788617c6af484a844f13c57d7567c2a Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 26 Apr 2021 14:57:16 +0200 Subject: [PATCH 0808/1379] s390/cpumf: remove call to perf_event_update_userpage The function cpumf_pmu_add and cpumf_pmu_del call function perf_event_update_userpage(). This calls is obsolete, the calls add and delete a counter event. Counter events do not sample data and the event->rb member to access the sampling ring buffer is always NULL. The function perf_event_update_userpage() simply returns in this case. Signed-off-by: Thomas Richter Acked-by : Sumanth Korikkar Signed-off-by: Heiko Carstens --- arch/s390/kernel/perf_cpum_cf.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 931e9d9b6524..31a605bcbc6e 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -428,8 +428,6 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) if (flags & PERF_EF_START) cpumf_pmu_start(event, PERF_EF_RELOAD); - perf_event_update_userpage(event); - return 0; } @@ -449,8 +447,6 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) */ if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base])) ctr_set_disable(&cpuhw->state, event->hw.config_base); - - perf_event_update_userpage(event); } /* From 2f7484fd73729f89085fe08d683f5a8d9e17fe99 Mon Sep 17 00:00:00 2001 From: Vineeth Vijayan Date: Fri, 23 Apr 2021 12:08:43 +0200 Subject: [PATCH 0809/1379] s390/cio: remove invalid condition on IO_SCH_UNREG The condition to check the cdev pointer validity on css_sch_device_unregister() is a leftover from the 'commit 8cc0dcfdc1c0 ("s390/cio: remove pm support from ccw bus driver")'. This could lead to a situation, where detaching the device is not happening completely. Remove this invalid condition in the IO_SCH_UNREG case. Link: https://lore.kernel.org/r/20210423100843.2230969-1-vneethv@linux.ibm.com Fixes: 8cc0dcfdc1c0 ("s390/cio: remove pm support from ccw bus driver") Reported-by: Christian Ehrhardt Suggested-by: Christian Ehrhardt Cc: Signed-off-by: Vineeth Vijayan Tested-by: Julian Wiedmann Reviewed-by: Peter Oberparleiter Tested-by: Christian Ehrhardt Signed-off-by: Heiko Carstens --- drivers/s390/cio/device.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c index 3f026021e95e..84f659cafe76 100644 --- a/drivers/s390/cio/device.c +++ b/drivers/s390/cio/device.c @@ -1532,8 +1532,7 @@ static int io_subchannel_sch_event(struct subchannel *sch, int process) switch (action) { case IO_SCH_ORPH_UNREG: case IO_SCH_UNREG: - if (!cdev) - css_sch_device_unregister(sch); + css_sch_device_unregister(sch); break; case IO_SCH_ORPH_ATTACH: case IO_SCH_UNREG_ATTACH: From f5b474decad90719e2a4234f83d97aad19307584 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Wed, 28 Apr 2021 10:24:42 +0200 Subject: [PATCH 0810/1379] s390/configs: change CONFIG_VIRTIO_CONSOLE to "m" In former times, the virtio-console code had to be compiled into the kernel since the old guest virtio transport had some hard de- pendencies. But since the old virtio transport has been removed in commit 7fb2b2d51244 ("s390/virtio: remove the old KVM virtio transport"), we do not have this limitation anymore. Commit bb533ec8bacd ("s390/config: do not select VIRTIO_CONSOLE via Kconfig") then also lifted the hard setting in the Kconfig system, so we can finally switch the CONFIG_VIRTIO_CONSOLE knob to compile this driver as a module now, making it more flexible for the user to only load it if it is really required. Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20210428082442.321327-1-thuth@redhat.com Signed-off-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- arch/s390/configs/debug_defconfig | 2 +- arch/s390/configs/defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 4eff0a3810ca..86afcc6b56bf 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -549,7 +549,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=m CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8e8bc93fe047..71b49ea5b058 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -541,7 +541,7 @@ CONFIG_INPUT_EVDEV=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=0 -CONFIG_VIRTIO_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=m CONFIG_HW_RANDOM_VIRTIO=m CONFIG_RAW_DRIVER=m CONFIG_HANGCHECK_TIMER=m From bae1cd368c45d1127e054e90305d585dbc8b3b46 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 29 Apr 2021 11:14:51 +0200 Subject: [PATCH 0811/1379] s390/entry: add support for syscall stack randomization This adds support for adding a random offset to the stack while handling syscalls. The patch uses get_tod_clock_fast() as this is considered good enough and has much less performance penalty compared to using get_random_int(). The patch also adds randomization in pgm_check_handler() as the sigreturn/rt_sigreturn system calls might be called from there. Signed-off-by: Sven Schnelle Link: https://lore.kernel.org/r/20210429091451.1062594-1-svens@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + arch/s390/include/asm/entry-common.h | 10 ++++++++++ arch/s390/kernel/syscall.c | 1 + arch/s390/kernel/traps.c | 2 ++ 4 files changed, 14 insertions(+) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c1ff874e6c2e..1900428ce557 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -137,6 +137,7 @@ config S390 select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_VMALLOC + select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY select HAVE_ARCH_TRACEHOOK diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h index 9cceb26ed63f..baa8005090c3 100644 --- a/arch/s390/include/asm/entry-common.h +++ b/arch/s390/include/asm/entry-common.h @@ -4,9 +4,11 @@ #include #include +#include #include #include #include +#include #include #define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP) @@ -48,6 +50,14 @@ static __always_inline void arch_exit_to_user_mode(void) #define arch_exit_to_user_mode arch_exit_to_user_mode +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + choose_random_kstack_offset(get_tod_clock_fast() & 0xff); +} + +#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare + static inline bool on_thread_stack(void) { return !(((unsigned long)(current->stack) ^ current_stack_pointer()) & ~(THREAD_SIZE - 1)); diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index bc8e650e377d..4e5cc7d2364e 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -142,6 +142,7 @@ void do_syscall(struct pt_regs *regs) void noinstr __do_syscall(struct pt_regs *regs, int per_trap) { + add_random_kstack_offset(); enter_from_user_mode(regs); memcpy(®s->gprs[8], S390_lowcore.save_area_sync, 8 * sizeof(unsigned long)); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 63021d484626..8dd23c703718 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -17,6 +17,7 @@ #include "asm/ptrace.h" #include #include +#include #include #include #include @@ -301,6 +302,7 @@ void noinstr __do_pgm_check(struct pt_regs *regs) unsigned int trapnr, syscall_redirect = 0; irqentry_state_t state; + add_random_kstack_offset(); regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc; regs->int_parm_long = S390_lowcore.trans_exc_code; From 388708028e6937f3fc5fc19aeeb847f8970f489c Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Fri, 23 Apr 2021 13:51:59 -0700 Subject: [PATCH 0812/1379] arm64/vdso: Discard .note.gnu.property sections in vDSO The arm64 assembler in binutils 2.32 and above generates a program property note in a note section, .note.gnu.property, to encode used x86 ISAs and features. But the kernel linker script only contains a single NOTE segment: PHDRS { text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ note PT_NOTE FLAGS(4); /* PF_R */ } The NOTE segment generated by the vDSO linker script is aligned to 4 bytes. But the .note.gnu.property section must be aligned to 8 bytes on arm64. $ readelf -n vdso64.so Displaying notes found in: .note Owner Data size Description Linux 0x00000004 Unknown note type: (0x00000000) description data: 06 00 00 00 readelf: Warning: note with invalid namesz and/or descsz found at offset 0x20 readelf: Warning: type: 0x78, namesize: 0x00000100, descsize: 0x756e694c, alignment: 8 Since the note.gnu.property section in the vDSO is not checked by the dynamic linker, discard the .note.gnu.property sections in the vDSO. Similar to commit 4caffe6a28d31 ("x86/vdso: Discard .note.gnu.property sections in vDSO"), but for arm64. Signed-off-by: Bill Wendling Reviewed-by: Kees Cook Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20210423205159.830854-1-morbo@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso/vdso.lds.S | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 61dbb4c838ef..a5e61e09ea92 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -31,6 +31,13 @@ SECTIONS .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } + /* + * Discard .note.gnu.property sections which are unused and have + * different alignment requirement from vDSO note sections. + */ + /DISCARD/ : { + *(.note.GNU-stack .note.gnu.property) + } .note : { *(.note.*) } :text :note . = ALIGN(16); @@ -48,7 +55,6 @@ SECTIONS PROVIDE(end = .); /DISCARD/ : { - *(.note.GNU-stack) *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) *(.eh_frame .eh_frame_hdr) From 8533d5bfad41e74b7dd80d292fd484913cdfb374 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 29 Apr 2021 11:20:04 +0100 Subject: [PATCH 0813/1379] arm64: stacktrace: restore terminal records We removed the terminal frame records in commit: 6106e1112cc69a36 ("arm64: remove EL0 exception frame record") ... on the assumption that as we no longer used them to find the pt_regs at exception boundaries, they were no longer necessary. However, Leo reports that as an unintended side-effect, this causes traces which cross secondary_start_kernel to terminate one entry too late, with a spurious "0" entry. There are a few ways we could sovle this, but as we're planning to use terminal records for RELIABLE_STACKTRACE, let's revert the logic change for now, keeping the update comments and accounting for the changes in commit: 3c02600144bdb0a1 ("arm64: stacktrace: Report when we reach the end of the stack") This is effectively a partial revert of commit: 6106e1112cc69a36 ("arm64: remove EL0 exception frame record") Signed-off-by: Mark Rutland Fixes: 6106e1112cc6 ("arm64: remove EL0 exception frame record") Reported-by: Leo Yan Tested-by: Leo Yan Cc: Will Deacon Cc: Mark Brown Cc: "Madhavan T. Venkataraman" Link: https://lore.kernel.org/r/20210429104813.GA33550@C02TD0UTHF1T.local Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 6 +++--- arch/arm64/kernel/stacktrace.c | 10 ++++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2df32a516ffe..806a39635482 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -283,16 +283,16 @@ alternative_else_nop_endif stp lr, x21, [sp, #S_LR] /* - * For exceptions from EL0, terminate the callchain here. + * For exceptions from EL0, create a terminal frame record. * For exceptions from EL1, create a synthetic frame record so the * interrupted code shows up in the backtrace. */ .if \el == 0 - mov x29, xzr + stp xzr, xzr, [sp, #S_STACKFRAME] .else stp x29, x22, [sp, #S_STACKFRAME] - add x29, sp, #S_STACKFRAME .endif + add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN alternative_if_not ARM64_HAS_PAN diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 2a1a7a281d34..c22706aa32a1 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -68,10 +68,6 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) unsigned long fp = frame->fp; struct stack_info info; - /* Terminal record; nothing to unwind */ - if (!fp) - return -ENOENT; - if (fp & 0xf) return -EINVAL; @@ -132,6 +128,12 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->pc = ptrauth_strip_insn_pac(frame->pc); + /* + * This is a terminal record, so we have finished unwinding. + */ + if (!frame->fp && !frame->pc) + return -ENOENT; + return 0; } NOKPROBE_SYMBOL(unwind_frame); From f80f88f0e2f2ef9cd805fad1bbf676b0ecd4b55c Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 28 Apr 2021 17:25:01 +0200 Subject: [PATCH 0814/1379] selftests/bpf: Fix the snprintf test The BPF program for the snprintf selftest runs on all syscall entries. On busy multicore systems this can cause concurrency issues. For example it was observed that sometimes the userspace part of the test reads " 4 0000" instead of " 4 000" (extra '0' at the end) which seems to happen just before snprintf on another core sets end[-1] = '\0'. This patch adds a pid filter to the test to ensure that no bpf_snprintf() will write over the test's output buffers while the userspace reads the values. Fixes: c2e39c6bdc7e ("selftests/bpf: Add a series of tests for bpf_snprintf") Reported-by: Andrii Nakryiko Signed-off-by: Florent Revest Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210428152501.1024509-1-revest@chromium.org --- tools/testing/selftests/bpf/prog_tests/snprintf.c | 2 ++ tools/testing/selftests/bpf/progs/test_snprintf.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf.c b/tools/testing/selftests/bpf/prog_tests/snprintf.c index a958c22aec75..dffbcaa1ec98 100644 --- a/tools/testing/selftests/bpf/prog_tests/snprintf.c +++ b/tools/testing/selftests/bpf/prog_tests/snprintf.c @@ -43,6 +43,8 @@ void test_snprintf_positive(void) if (!ASSERT_OK_PTR(skel, "skel_open")) return; + skel->bss->pid = getpid(); + if (!ASSERT_OK(test_snprintf__attach(skel), "skel_attach")) goto cleanup; diff --git a/tools/testing/selftests/bpf/progs/test_snprintf.c b/tools/testing/selftests/bpf/progs/test_snprintf.c index 951a0301c553..e35129bea0a0 100644 --- a/tools/testing/selftests/bpf/progs/test_snprintf.c +++ b/tools/testing/selftests/bpf/progs/test_snprintf.c @@ -5,6 +5,8 @@ #include #include +__u32 pid = 0; + char num_out[64] = {}; long num_ret = 0; @@ -42,6 +44,9 @@ int handler(const void *ctx) static const char str1[] = "str1"; static const char longstr[] = "longstr"; + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + /* Integer types */ num_ret = BPF_SNPRINTF(num_out, sizeof(num_out), "%d %u %x %li %llu %lX", From f6334b1798c1f96ee02356c4b12bb9587bdf44f5 Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Thu, 29 Apr 2021 22:50:46 +0200 Subject: [PATCH 0815/1379] arm64: cpufeatures: use min and max Use min and max to make the effect more clear. Generated by: scripts/coccinelle/misc/minmax.cocci CC: Denis Efremov Reported-by: kernel test robot Signed-off-by: kernel test robot Signed-off-by: Julia Lawall Acked-by: Mark Rutland Link: https://lore.kernel.org/r/alpine.DEB.2.22.394.2104292246300.16899@hadrien [catalin.marinas@arm.com: include explicitly] Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 3bc30656880d..ca66a61bb396 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -695,14 +696,14 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, ret = ftrp->safe_val; break; case FTR_LOWER_SAFE: - ret = new < cur ? new : cur; + ret = min(new, cur); break; case FTR_HIGHER_OR_ZERO_SAFE: if (!cur || !new) break; fallthrough; case FTR_HIGHER_SAFE: - ret = new > cur ? new : cur; + ret = max(new, cur); break; default: BUG(); From aafe104aa9096827a429bc1358f8260ee565b7cc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 30 Apr 2021 12:17:58 -0400 Subject: [PATCH 0816/1379] tracing: Restructure trace_clock_global() to never block It was reported that a fix to the ring buffer recursion detection would cause a hung machine when performing suspend / resume testing. The following backtrace was extracted from debugging that case: Call Trace: trace_clock_global+0x91/0xa0 __rb_reserve_next+0x237/0x460 ring_buffer_lock_reserve+0x12a/0x3f0 trace_buffer_lock_reserve+0x10/0x50 __trace_graph_return+0x1f/0x80 trace_graph_return+0xb7/0xf0 ? trace_clock_global+0x91/0xa0 ftrace_return_to_handler+0x8b/0xf0 ? pv_hash+0xa0/0xa0 return_to_handler+0x15/0x30 ? ftrace_graph_caller+0xa0/0xa0 ? trace_clock_global+0x91/0xa0 ? __rb_reserve_next+0x237/0x460 ? ring_buffer_lock_reserve+0x12a/0x3f0 ? trace_event_buffer_lock_reserve+0x3c/0x120 ? trace_event_buffer_reserve+0x6b/0xc0 ? trace_event_raw_event_device_pm_callback_start+0x125/0x2d0 ? dpm_run_callback+0x3b/0xc0 ? pm_ops_is_empty+0x50/0x50 ? platform_get_irq_byname_optional+0x90/0x90 ? trace_device_pm_callback_start+0x82/0xd0 ? dpm_run_callback+0x49/0xc0 With the following RIP: RIP: 0010:native_queued_spin_lock_slowpath+0x69/0x200 Since the fix to the recursion detection would allow a single recursion to happen while tracing, this lead to the trace_clock_global() taking a spin lock and then trying to take it again: ring_buffer_lock_reserve() { trace_clock_global() { arch_spin_lock() { queued_spin_lock_slowpath() { /* lock taken */ (something else gets traced by function graph tracer) ring_buffer_lock_reserve() { trace_clock_global() { arch_spin_lock() { queued_spin_lock_slowpath() { /* DEAD LOCK! */ Tracing should *never* block, as it can lead to strange lockups like the above. Restructure the trace_clock_global() code to instead of simply taking a lock to update the recorded "prev_time" simply use it, as two events happening on two different CPUs that calls this at the same time, really doesn't matter which one goes first. Use a trylock to grab the lock for updating the prev_time, and if it fails, simply try again the next time. If it failed to be taken, that means something else is already updating it. Link: https://lkml.kernel.org/r/20210430121758.650b6e8a@gandalf.local.home Cc: stable@vger.kernel.org Tested-by: Konstantin Kharlamov Tested-by: Todd Brandt Fixes: b02414c8f045 ("ring-buffer: Fix recursion protection transitions between interrupt context") # started showing the problem Fixes: 14131f2f98ac3 ("tracing: implement trace_clock_*() APIs") # where the bug happened Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212761 Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_clock.c | 48 +++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aaf6793ededa..c1637f90c8a3 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -95,33 +95,49 @@ u64 notrace trace_clock_global(void) { unsigned long flags; int this_cpu; - u64 now; + u64 now, prev_time; raw_local_irq_save(flags); this_cpu = raw_smp_processor_id(); - now = sched_clock_cpu(this_cpu); + /* - * If in an NMI context then dont risk lockups and return the - * cpu_clock() time: + * The global clock "guarantees" that the events are ordered + * between CPUs. But if two events on two different CPUS call + * trace_clock_global at roughly the same time, it really does + * not matter which one gets the earlier time. Just make sure + * that the same CPU will always show a monotonic clock. + * + * Use a read memory barrier to get the latest written + * time that was recorded. + */ + smp_rmb(); + prev_time = READ_ONCE(trace_clock_struct.prev_time); + now = sched_clock_cpu(this_cpu); + + /* Make sure that now is always greater than prev_time */ + if ((s64)(now - prev_time) < 0) + now = prev_time + 1; + + /* + * If in an NMI context then dont risk lockups and simply return + * the current time. */ if (unlikely(in_nmi())) goto out; - arch_spin_lock(&trace_clock_struct.lock); + /* Tracing can cause strange recursion, always use a try lock */ + if (arch_spin_trylock(&trace_clock_struct.lock)) { + /* Reread prev_time in case it was already updated */ + prev_time = READ_ONCE(trace_clock_struct.prev_time); + if ((s64)(now - prev_time) < 0) + now = prev_time + 1; - /* - * TODO: if this happens often then maybe we should reset - * my_scd->clock to prev_time+1, to make sure - * we start ticking with the local clock from now on? - */ - if ((s64)(now - trace_clock_struct.prev_time) < 0) - now = trace_clock_struct.prev_time + 1; - - trace_clock_struct.prev_time = now; - - arch_spin_unlock(&trace_clock_struct.lock); + trace_clock_struct.prev_time = now; + /* The unlock acts as the wmb for the above rmb */ + arch_spin_unlock(&trace_clock_struct.lock); + } out: raw_local_irq_restore(flags); From ee61f36d3e46bdb1c8910d1bd5c0863130c7b951 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 12 Apr 2021 16:19:53 +0100 Subject: [PATCH 0817/1379] arm64: Relax booting requirements for configuration of traps Currently we require that a number of system registers be configured to disable traps when starting the kernel. Add an explicit note that the requirement is that the system behave as if the traps are disabled so transparent handling of the traps is fine, this should be implicit for people familiar with working with standards documents but it doesn't hurt to be explicit. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210412151955.16078-2-broonie@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arm64/booting.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst index 4fcc00add117..b21049ab6c69 100644 --- a/Documentation/arm64/booting.rst +++ b/Documentation/arm64/booting.rst @@ -279,7 +279,10 @@ Before jumping into the kernel, the following conditions must be met: The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. All CPUs must -enter the kernel in the same exception level. +enter the kernel in the same exception level. Where the values documented +disable traps it is permissible for these traps to be enabled so long as +those traps are handled transparently by higher exception levels as though +the values documented were set. The boot loader is expected to enter the kernel on each CPU in the following manner: From b30dbf4d936224f83a98bea2328ff09e644a25b2 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 12 Apr 2021 16:19:54 +0100 Subject: [PATCH 0818/1379] arm64: Explicitly require that FPSIMD instructions do not trap We do not explicitly require that systems with FPSIMD support and EL3 have disabled EL3 traps when the kernel is started, while it is unlikely that systems will get this wrong for the sake of completeness let's spell it out. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210412151955.16078-3-broonie@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arm64/booting.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst index b21049ab6c69..4d0e323c0a35 100644 --- a/Documentation/arm64/booting.rst +++ b/Documentation/arm64/booting.rst @@ -277,6 +277,16 @@ Before jumping into the kernel, the following conditions must be met: - SCR_EL3.FGTEn (bit 27) must be initialised to 0b1. + For CPUs with Advanced SIMD and floating point support: + + - If EL3 is present: + + - CPTR_EL3.TFP (bit 10) must be initialised to 0b0. + + - If EL2 is present and the kernel is entered at EL1: + + - CPTR_EL2.TFP (bit 10) must be initialised to 0b0. + The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. All CPUs must enter the kernel in the same exception level. Where the values documented From ff1c42cdfbcfba4cc75f3e21ed819ded2dad5f3e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 12 Apr 2021 16:19:55 +0100 Subject: [PATCH 0819/1379] arm64: Explicitly document boot requirements for SVE We do not currently document the requirements for configuration of the SVE system registers when booting the kernel, let's do so for completeness. We don't have a hard requirement that the vector lengths configured on different CPUs on initial boot be consistent since we have logic to constrain to the minimum supported value but we will reject any late CPUs which can't support the current maximum and introducing the concept of late CPUs seemed more complex than was useful so we require that all CPUs use the same value. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210412151955.16078-4-broonie@kernel.org Signed-off-by: Catalin Marinas --- Documentation/arm64/booting.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst index 4d0e323c0a35..18b8cc1bf32c 100644 --- a/Documentation/arm64/booting.rst +++ b/Documentation/arm64/booting.rst @@ -287,6 +287,24 @@ Before jumping into the kernel, the following conditions must be met: - CPTR_EL2.TFP (bit 10) must be initialised to 0b0. + For CPUs with the Scalable Vector Extension (FEAT_SVE) present: + + - if EL3 is present: + + - CPTR_EL3.EZ (bit 8) must be initialised to 0b1. + + - ZCR_EL3.LEN must be initialised to the same value for all CPUs the + kernel is executed on. + + - If the kernel is entered at EL1 and EL2 is present: + + - CPTR_EL2.TZ (bit 8) must be initialised to 0b0. + + - CPTR_EL2.ZEN (bits 17:16) must be initialised to 0b11. + + - ZCR_EL2.LEN must be initialised to the same value for all CPUs the + kernel will execute on. + The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. All CPUs must enter the kernel in the same exception level. Where the values documented From 840d7f01d4b335ece36f656fbc1cfcf127dee310 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Wed, 28 Apr 2021 00:42:00 +0200 Subject: [PATCH 0820/1379] dt-bindings: interrupt-controller: idt,32434-pic: Add missing interrupts property Interrupts property is required, so add it. Signed-off-by: Thomas Bogendoerfer Link: https://lore.kernel.org/r/20210427224201.32285-1-tsbogend@alpha.franken.de Signed-off-by: Rob Herring --- .../bindings/interrupt-controller/idt,32434-pic.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/interrupt-controller/idt,32434-pic.yaml b/Documentation/devicetree/bindings/interrupt-controller/idt,32434-pic.yaml index df5d8d1ead70..160ff4b07cac 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/idt,32434-pic.yaml +++ b/Documentation/devicetree/bindings/interrupt-controller/idt,32434-pic.yaml @@ -22,6 +22,9 @@ properties: reg: maxItems: 1 + interrupts: + maxItems: 1 + interrupt-controller: true required: @@ -29,6 +32,7 @@ required: - compatible - reg - interrupt-controller + - interrupts additionalProperties: false From ae7ce982fc7da240d86bfe01ca165250ad053802 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Thu, 18 Feb 2021 16:28:37 +0100 Subject: [PATCH 0821/1379] dt-bindings: bcm2711-hdmi: Fix broken schema For some reason, unevaluatedProperties doesn't work and additionalProperties is required. Fix it by switching to additionalProperties. Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20210218152837.1080819-1-maxime@cerno.tech [robh: Also committed as a3cb15cda1b8, but lost due to how drm-misc-fixes and drm-misc-next got merged.] Signed-off-by: Rob Herring --- .../devicetree/bindings/display/brcm,bcm2711-hdmi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml b/Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml index 57324a5f0271..a1d5a32660e0 100644 --- a/Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml +++ b/Documentation/devicetree/bindings/display/brcm,bcm2711-hdmi.yaml @@ -109,7 +109,7 @@ required: - resets - ddc -unevaluatedProperties: false +additionalProperties: false examples: - | From ccd61f07d28912dcd6a61ea73f5d69af7ad88efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Wilczy=C5=84ski?= Date: Tue, 20 Apr 2021 21:09:13 +0000 Subject: [PATCH 0822/1379] x86/PCI: Remove unused alloc_pci_root_info() return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "info" value returned from alloc_pci_root_info() is never used, so drop it. [bhelgaas: commit log] Addresses-Coverity-ID: 1222153 ("Unused value") Link: https://lore.kernel.org/r/20210420210913.1137116-1-kw@linux.com Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas --- arch/x86/pci/amd_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index bfa50e65ef6c..ae744b6a0785 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -126,7 +126,7 @@ static int __init early_root_info_init(void) node = (reg >> 4) & 0x07; link = (reg >> 8) & 0x03; - info = alloc_pci_root_info(min_bus, max_bus, node, link); + alloc_pci_root_info(min_bus, max_bus, node, link); } /* From 384d0c68204a4a657f4bbc096c50d729ae7d9ef0 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 12 Feb 2021 11:02:47 +0100 Subject: [PATCH 0823/1379] PCI/VPD: Remove pci_set_vpd_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 24a1720a0841 ("cxgb4: collect serial config version from register") removed the only usage of pci_set_vpd_size(). If a device needs to override the auto-detected VPD size, then this can be done with a PCI quirk, as is done for Chelsio devices. There's no need to allow drivers to change the VPD size. Remove pci_set_vpd_size(). [bhelgaas: squash in Arnd's fix for "'pci_vpd_set_size' defined but not used" from https://lore.kernel.org/r/20210421140334.3847155-1-arnd@kernel.org] Link: https://lore.kernel.org/r/47d86e52-9bcf-7da7-1edb-0d988a7a82ab@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas Reviewed-by: Krzysztof Wilczyński --- drivers/pci/vpd.c | 58 ++++++++++----------------------------------- include/linux/pci.h | 1 - 2 files changed, 13 insertions(+), 46 deletions(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index ab81c7a5185f..a1d31c5d1864 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -16,7 +16,6 @@ struct pci_vpd_ops { ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); - int (*set_size)(struct pci_dev *dev, size_t len); }; struct pci_vpd { @@ -60,19 +59,6 @@ ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void } EXPORT_SYMBOL(pci_write_vpd); -/** - * pci_set_vpd_size - Set size of Vital Product Data space - * @dev: pci device struct - * @len: size of vpd space - */ -int pci_set_vpd_size(struct pci_dev *dev, size_t len) -{ - if (!dev->vpd || !dev->vpd->ops) - return -ENODEV; - return dev->vpd->ops->set_size(dev, len); -} -EXPORT_SYMBOL(pci_set_vpd_size); - #define PCI_VPD_MAX_SIZE (PCI_VPD_ADDR_MASK + 1) /** @@ -297,23 +283,9 @@ out: return ret ? ret : count; } -static int pci_vpd_set_size(struct pci_dev *dev, size_t len) -{ - struct pci_vpd *vpd = dev->vpd; - - if (len == 0 || len > PCI_VPD_MAX_SIZE) - return -EIO; - - vpd->valid = 1; - vpd->len = len; - - return 0; -} - static const struct pci_vpd_ops pci_vpd_ops = { .read = pci_vpd_read, .write = pci_vpd_write, - .set_size = pci_vpd_set_size, }; static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count, @@ -346,24 +318,9 @@ static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count, return ret; } -static int pci_vpd_f0_set_size(struct pci_dev *dev, size_t len) -{ - struct pci_dev *tdev = pci_get_slot(dev->bus, - PCI_DEVFN(PCI_SLOT(dev->devfn), 0)); - int ret; - - if (!tdev) - return -ENODEV; - - ret = pci_set_vpd_size(tdev, len); - pci_dev_put(tdev); - return ret; -} - static const struct pci_vpd_ops pci_vpd_f0_ops = { .read = pci_vpd_f0_read, .write = pci_vpd_f0_write, - .set_size = pci_vpd_f0_set_size, }; int pci_vpd_init(struct pci_dev *dev) @@ -564,6 +521,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_AMAZON_ANNAPURNA_LABS, 0x0031, PCI_CLASS_BRIDGE_PCI, 8, quirk_blacklist_vpd); +static void pci_vpd_set_size(struct pci_dev *dev, size_t len) +{ + struct pci_vpd *vpd = dev->vpd; + + if (!vpd || len == 0 || len > PCI_VPD_MAX_SIZE) + return; + + vpd->valid = 1; + vpd->len = len; +} + static void quirk_chelsio_extend_vpd(struct pci_dev *dev) { int chip = (dev->device & 0xf000) >> 12; @@ -582,9 +550,9 @@ static void quirk_chelsio_extend_vpd(struct pci_dev *dev) * limits. */ if (chip == 0x0 && prod >= 0x20) - pci_set_vpd_size(dev, 8192); + pci_vpd_set_size(dev, 8192); else if (chip >= 0x4 && func < 0x8) - pci_set_vpd_size(dev, 2048); + pci_vpd_set_size(dev, 2048); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_CHELSIO, PCI_ANY_ID, diff --git a/include/linux/pci.h b/include/linux/pci.h index 86c799c97b77..edadc62ae058 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1302,7 +1302,6 @@ void pci_unlock_rescan_remove(void); /* Vital Product Data routines */ ssize_t pci_read_vpd(struct pci_dev *dev, loff_t pos, size_t count, void *buf); ssize_t pci_write_vpd(struct pci_dev *dev, loff_t pos, size_t count, const void *buf); -int pci_set_vpd_size(struct pci_dev *dev, size_t len); /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); From d1df5f3f4cfff88c989cbeec6ca0e02340494818 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 1 Apr 2021 14:03:49 +0200 Subject: [PATCH 0824/1379] PCI/VPD: Make missing VPD message less alarming Realtek RTL8169/8168/8125 NIC families indicate VPD capability and an optional VPD EEPROM can be connected via I2C/SPI. However I haven't seen any card or system with such a VPD EEPROM yet. The missing EEPROM causes the following warning whenever e.g. lscpi -vv is executed. invalid short VPD tag 00 at offset 01 The warning confuses users, and I think we should handle the situation more gently. Therefore, if first VPD byte is read as 0x00, assume a missing optional VPD PROM and replace the warning with a more descriptive message at info level. [bhelgaas: fix pre-existing whitespace] Link: https://lore.kernel.org/r/ccbc11f1-4dbb-e2c8-d0ea-559e06d4c340@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index a1d31c5d1864..cbf784ed5216 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -71,10 +71,14 @@ static size_t pci_vpd_size(struct pci_dev *dev, size_t old_size) size_t off = 0; unsigned char header[1+2]; /* 1 byte tag, 2 bytes length */ - while (off < old_size && - pci_read_vpd(dev, off, 1, header) == 1) { + while (off < old_size && pci_read_vpd(dev, off, 1, header) == 1) { unsigned char tag; + if (!header[0] && !off) { + pci_info(dev, "Invalid VPD tag 00, assume missing optional VPD EPROM\n"); + return 0; + } + if (header[0] & PCI_VPD_LRDT) { /* Large Resource Data Type Tag */ tag = pci_vpd_lrdt_tag(header); From e947e7b1163d5a4375dc1ca6134ebda67ee7d33a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 1 Apr 2021 18:37:47 +0200 Subject: [PATCH 0825/1379] PCI/VPD: Change pci_vpd_init() return type to void pci_init_capabilities() is the only caller and doesn't use the return value. So let's change the return type to void. Link: https://lore.kernel.org/r/663ec440-8375-1459-ddb4-98ea76e75917@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.h | 2 +- drivers/pci/vpd.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index ef7c4661314f..37d21aa0b0d5 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -141,7 +141,7 @@ static inline bool pcie_downstream_port(const struct pci_dev *dev) type == PCI_EXP_TYPE_PCIE_BRIDGE; } -int pci_vpd_init(struct pci_dev *dev); +void pci_vpd_init(struct pci_dev *dev); void pci_vpd_release(struct pci_dev *dev); void pcie_vpd_create_sysfs_dev_files(struct pci_dev *dev); void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev); diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index cbf784ed5216..c1c4f7d80a04 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -327,18 +327,18 @@ static const struct pci_vpd_ops pci_vpd_f0_ops = { .write = pci_vpd_f0_write, }; -int pci_vpd_init(struct pci_dev *dev) +void pci_vpd_init(struct pci_dev *dev) { struct pci_vpd *vpd; u8 cap; cap = pci_find_capability(dev, PCI_CAP_ID_VPD); if (!cap) - return -ENODEV; + return; vpd = kzalloc(sizeof(*vpd), GFP_ATOMIC); if (!vpd) - return -ENOMEM; + return; vpd->len = PCI_VPD_MAX_SIZE; if (dev->dev_flags & PCI_DEV_FLAGS_VPD_REF_F0) @@ -350,7 +350,6 @@ int pci_vpd_init(struct pci_dev *dev) vpd->busy = 0; vpd->valid = 0; dev->vpd = vpd; - return 0; } void pci_vpd_release(struct pci_dev *dev) From 4cf0abbce69bde3d07757dfa9be6420407fdbc45 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 1 Apr 2021 18:43:15 +0200 Subject: [PATCH 0826/1379] PCI/VPD: Remove pci_vpd_find_tag() 'offset' argument All callers pass 0 as offset. Therefore remove the parameter and use a fixed offset 0 in pci_vpd_find_tag(). Link: https://lore.kernel.org/r/f62e6e19-5423-2ead-b2bd-62844b23ef8f@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/net/ethernet/broadcom/bnx2.c | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 3 +-- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/tg3.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 2 +- drivers/net/ethernet/sfc/efx.c | 2 +- drivers/net/ethernet/sfc/falcon/efx.c | 2 +- drivers/pci/vpd.c | 4 ++-- drivers/scsi/cxlflash/main.c | 3 +-- include/linux/pci.h | 3 +-- 10 files changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index 3e8a179f39db..c0986096c701 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -8057,7 +8057,7 @@ bnx2_read_vpd_fw_ver(struct bnx2 *bp) data[i + 3] = data[i + BNX2_VPD_LEN]; } - i = pci_vpd_find_tag(data, 0, BNX2_VPD_LEN, PCI_VPD_LRDT_RO_DATA); + i = pci_vpd_find_tag(data, BNX2_VPD_LEN, PCI_VPD_LRDT_RO_DATA); if (i < 0) goto vpd_done; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index b652ed72a621..d267e45a0518 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -12207,8 +12207,7 @@ static void bnx2x_read_fwinfo(struct bnx2x *bp) /* VPD RO tag should be first tag after identifier string, hence * we should be able to find it in first BNX2X_VPD_LEN chars */ - i = pci_vpd_find_tag(vpd_start, 0, BNX2X_VPD_LEN, - PCI_VPD_LRDT_RO_DATA); + i = pci_vpd_find_tag(vpd_start, BNX2X_VPD_LEN, PCI_VPD_LRDT_RO_DATA); if (i < 0) goto out_not_found; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a680fd9c68ea..2bccdac28a24 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12668,7 +12668,7 @@ static void bnxt_vpd_read_info(struct bnxt *bp) goto exit; } - i = pci_vpd_find_tag(vpd_data, 0, vpd_size, PCI_VPD_LRDT_RO_DATA); + i = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA); if (i < 0) { netdev_err(bp->dev, "VPD READ-Only not found\n"); goto exit; diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index d2381929931b..b0e49643f483 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -13016,7 +13016,7 @@ static int tg3_test_nvram(struct tg3 *tp) if (!buf) return -ENOMEM; - i = pci_vpd_find_tag((u8 *)buf, 0, len, PCI_VPD_LRDT_RO_DATA); + i = pci_vpd_find_tag((u8 *)buf, len, PCI_VPD_LRDT_RO_DATA); if (i > 0) { j = pci_vpd_lrdt_size(&((u8 *)buf)[i]); if (j < 0) @@ -15629,7 +15629,7 @@ static void tg3_read_vpd(struct tg3 *tp) if (!vpd_data) goto out_no_vpd; - i = pci_vpd_find_tag(vpd_data, 0, vpdlen, PCI_VPD_LRDT_RO_DATA); + i = pci_vpd_find_tag(vpd_data, vpdlen, PCI_VPD_LRDT_RO_DATA); if (i < 0) goto out_not_found; diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 98829e482bfa..ef5d10e1cce6 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -2774,7 +2774,7 @@ int t4_get_raw_vpd_params(struct adapter *adapter, struct vpd_params *p) if (id_len > ID_LEN) id_len = ID_LEN; - i = pci_vpd_find_tag(vpd, 0, VPD_LEN, PCI_VPD_LRDT_RO_DATA); + i = pci_vpd_find_tag(vpd, VPD_LEN, PCI_VPD_LRDT_RO_DATA); if (i < 0) { dev_err(adapter->pdev_dev, "missing VPD-R section\n"); ret = -EINVAL; diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 36c8625a6fd7..c746ca7235f1 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -920,7 +920,7 @@ static void efx_probe_vpd_strings(struct efx_nic *efx) } /* Get the Read only section */ - ro_start = pci_vpd_find_tag(vpd_data, 0, vpd_size, PCI_VPD_LRDT_RO_DATA); + ro_start = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA); if (ro_start < 0) { netif_err(efx, drv, efx->net_dev, "VPD Read-only not found\n"); return; diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c index f8979991970e..5e7a57b680ca 100644 --- a/drivers/net/ethernet/sfc/falcon/efx.c +++ b/drivers/net/ethernet/sfc/falcon/efx.c @@ -2800,7 +2800,7 @@ static void ef4_probe_vpd_strings(struct ef4_nic *efx) } /* Get the Read only section */ - ro_start = pci_vpd_find_tag(vpd_data, 0, vpd_size, PCI_VPD_LRDT_RO_DATA); + ro_start = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA); if (ro_start < 0) { netif_err(efx, drv, efx->net_dev, "VPD Read-only not found\n"); return; diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index c1c4f7d80a04..79d4313c91a3 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -410,11 +410,11 @@ void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev) } } -int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt) +int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt) { int i; - for (i = off; i < len; ) { + for (i = 0; i < len; ) { u8 val = buf[i]; if (val & PCI_VPD_LRDT) { diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c index e72440d919d2..ae0963cec222 100644 --- a/drivers/scsi/cxlflash/main.c +++ b/drivers/scsi/cxlflash/main.c @@ -1649,8 +1649,7 @@ static int read_vpd(struct cxlflash_cfg *cfg, u64 wwpn[]) } /* Get the read only section offset */ - ro_start = pci_vpd_find_tag(vpd_data, 0, vpd_size, - PCI_VPD_LRDT_RO_DATA); + ro_start = pci_vpd_find_tag(vpd_data, vpd_size, PCI_VPD_LRDT_RO_DATA); if (unlikely(ro_start < 0)) { dev_err(dev, "%s: VPD Read-only data not found\n", __func__); rc = -ENODEV; diff --git a/include/linux/pci.h b/include/linux/pci.h index edadc62ae058..1eb35c09674e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2310,14 +2310,13 @@ static inline u8 pci_vpd_info_field_size(const u8 *info_field) /** * pci_vpd_find_tag - Locates the Resource Data Type tag provided * @buf: Pointer to buffered vpd data - * @off: The offset into the buffer at which to begin the search * @len: The length of the vpd buffer * @rdt: The Resource Data Type to search for * * Returns the index where the Resource Data Type was found or * -ENOENT otherwise. */ -int pci_vpd_find_tag(const u8 *buf, unsigned int off, unsigned int len, u8 rdt); +int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt); /** * pci_vpd_find_info_keyword - Locates an information field keyword in the VPD From 0a08bc07610e172972985d6322fd671cff76c928 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 1 Apr 2021 18:44:15 +0200 Subject: [PATCH 0827/1379] PCI/VPD: Remove pci_vpd_find_tag() SRDT handling Only SRDT tag is the end tag, and no caller is interested in it. This allows to remove all SRDT tag handling. Link: https://lore.kernel.org/r/3f63f06f-734f-8fff-9518-27fe1faf903d@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 79d4313c91a3..562d79b597b3 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -412,33 +412,14 @@ void pcie_vpd_remove_sysfs_dev_files(struct pci_dev *dev) int pci_vpd_find_tag(const u8 *buf, unsigned int len, u8 rdt) { - int i; + int i = 0; - for (i = 0; i < len; ) { - u8 val = buf[i]; + /* look for LRDT tags only, end tag is the only SRDT tag */ + while (i + PCI_VPD_LRDT_TAG_SIZE <= len && buf[i] & PCI_VPD_LRDT) { + if (buf[i] == rdt) + return i; - if (val & PCI_VPD_LRDT) { - /* Don't return success of the tag isn't complete */ - if (i + PCI_VPD_LRDT_TAG_SIZE > len) - break; - - if (val == rdt) - return i; - - i += PCI_VPD_LRDT_TAG_SIZE + - pci_vpd_lrdt_size(&buf[i]); - } else { - u8 tag = val & ~PCI_VPD_SRDT_LEN_MASK; - - if (tag == rdt) - return i; - - if (tag == PCI_VPD_SRDT_END) - break; - - i += PCI_VPD_SRDT_TAG_SIZE + - pci_vpd_srdt_size(&buf[i]); - } + i += PCI_VPD_LRDT_TAG_SIZE + pci_vpd_lrdt_size(buf + i); } return -ENOENT; From 5881b38912f3f48a4bd74a4eed58be12df012063 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 16 Apr 2021 21:52:07 +0200 Subject: [PATCH 0828/1379] PCI/VPD: Add helper pci_get_func0_dev() Factor out the "get function 0" logic into pci_get_func0_dev(). [bhelgaas: keep PCI_DEVFN(PCI_SLOT()) instead of exposing implementation details, commit log] Link: https://lore.kernel.org/r/75d1f619-8a35-690d-8fc8-e851264a4bbb@gmail.com Signed-off-by: Heiner Kallweit Signed-off-by: Bjorn Helgaas --- drivers/pci/vpd.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index 562d79b597b3..c6aad87dd0f9 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -29,6 +29,11 @@ struct pci_vpd { unsigned int valid:1; }; +static struct pci_dev *pci_get_func0_dev(struct pci_dev *dev) +{ + return pci_get_slot(dev->bus, PCI_DEVFN(PCI_SLOT(dev->devfn), 0)); +} + /** * pci_read_vpd - Read one entry from Vital Product Data * @dev: pci device struct @@ -295,8 +300,7 @@ static const struct pci_vpd_ops pci_vpd_ops = { static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count, void *arg) { - struct pci_dev *tdev = pci_get_slot(dev->bus, - PCI_DEVFN(PCI_SLOT(dev->devfn), 0)); + struct pci_dev *tdev = pci_get_func0_dev(dev); ssize_t ret; if (!tdev) @@ -310,8 +314,7 @@ static ssize_t pci_vpd_f0_read(struct pci_dev *dev, loff_t pos, size_t count, static ssize_t pci_vpd_f0_write(struct pci_dev *dev, loff_t pos, size_t count, const void *arg) { - struct pci_dev *tdev = pci_get_slot(dev->bus, - PCI_DEVFN(PCI_SLOT(dev->devfn), 0)); + struct pci_dev *tdev = pci_get_func0_dev(dev); ssize_t ret; if (!tdev) @@ -457,7 +460,7 @@ static void quirk_f0_vpd_link(struct pci_dev *dev) if (!PCI_FUNC(dev->devfn)) return; - f0 = pci_get_slot(dev->bus, PCI_DEVFN(PCI_SLOT(dev->devfn), 0)); + f0 = pci_get_func0_dev(dev); if (!f0) return; From e00dc69b5f17c444a38cd9745a0f76bc989b3af4 Mon Sep 17 00:00:00 2001 From: Arun Easi Date: Fri, 9 Apr 2021 14:51:53 -0700 Subject: [PATCH 0829/1379] PCI: Allow VPD access for QLogic ISP2722 0d5370d1d852 ("PCI: Prevent VPD access for QLogic ISP2722") disabled access to VPD of the ISP2722-based 16/32Gb Fibre Channel to PCIe Adapter because reading past the end of the VPD caused NMIs. 104daa71b396 ("PCI: Determine actual VPD size on first access") limits reads to the actual size of VPD, which should prevent these NMIs. 104daa71b396 was merged *before* 0d5370d1d852, but we think the testing that prompted 0d5370d1d852 ("PCI: Prevent VPD access for QLogic ISP2722") was done with a kernel that lacked 104daa71b396. See [1, 2]. Remove the quirk added by 0d5370d1d852 ("PCI: Prevent VPD access for QLogic ISP2722") so customers can read the HBA VPD. [1] https://lore.kernel.org/linux-pci/alpine.LRH.2.21.9999.2012161641230.28924@irv1user01.caveonetworks.com/ [2] https://lore.kernel.org/linux-pci/alpine.LRH.2.21.9999.2104071535110.13940@irv1user01.caveonetworks.com/ [bhelgaas: commit log] Link: https://lore.kernel.org/r/20210409215153.16569-2-aeasi@marvell.com Signed-off-by: Arun Easi Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v4.6+ --- drivers/pci/vpd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/pci/vpd.c b/drivers/pci/vpd.c index c6aad87dd0f9..8af31c5eec2d 100644 --- a/drivers/pci/vpd.c +++ b/drivers/pci/vpd.c @@ -500,7 +500,6 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LSI_LOGIC, 0x005d, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LSI_LOGIC, 0x005f, quirk_blacklist_vpd); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATTANSIC, PCI_ANY_ID, quirk_blacklist_vpd); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_QLOGIC, 0x2261, quirk_blacklist_vpd); /* * The Amazon Annapurna Labs 0x0031 device id is reused for other non Root Port * device types, so the quirk is registered for the PCI_CLASS_BRIDGE_PCI class. From db7c691d7f4da6af40a6ce63331a5a9fb9511c2a Mon Sep 17 00:00:00 2001 From: Mohammad Athari Bin Ismail Date: Fri, 30 Apr 2021 07:01:04 +0800 Subject: [PATCH 0830/1379] net: stmmac: cleared __FPE_REMOVING bit in stmmac_fpe_start_wq() An issue found when network interface is down and up again, FPE handshake fails to trigger. This is due to __FPE_REMOVING bit remains being set in stmmac_fpe_stop_wq() but not cleared in stmmac_fpe_start_wq(). This cause FPE workqueue task, stmmac_fpe_lp_task() not able to be executed. To fix this, add clearing __FPE_REMOVING bit in stmmac_fpe_start_wq(). Fixes: 5a5586112b92 ("net: stmmac: support FPE link partner hand-shaking procedure") Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index a9a984c57d78..e0b7eebcb512 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3180,6 +3180,7 @@ static int stmmac_fpe_start_wq(struct stmmac_priv *priv) char *name; clear_bit(__FPE_TASK_SCHED, &priv->fpe_task_state); + clear_bit(__FPE_REMOVING, &priv->fpe_task_state); name = priv->wq_name; sprintf(name, "%s-fpe", priv->dev->name); From 905416f18fe74bdd4de91bf94ef5a790a36e4b99 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 30 Apr 2021 17:06:19 +0800 Subject: [PATCH 0831/1379] net: hns3: fix for vxlan gpe tx checksum bug When skb->ip_summed is CHECKSUM_PARTIAL, for non-tunnel udp packet, which has a dest port as the IANA assigned, the hardware is expected to do the checksum offload, but the hardware whose version is below V3 will not do the checksum offload when udp dest port is 4790. So fixes it by doing the checksum in software for this case. Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Hao Chen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 5dc6de5107fb..a2b9a8af0413 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -824,7 +824,7 @@ static int hns3_get_l4_protocol(struct sk_buff *skb, u8 *ol4_proto, * and it is udp packet, which has a dest port as the IANA assigned. * the hardware is expected to do the checksum offload, but the * hardware will not do the checksum offload when udp dest port is - * 4789 or 6081. + * 4789, 4790 or 6081. */ static bool hns3_tunnel_csum_bug(struct sk_buff *skb) { @@ -842,7 +842,8 @@ static bool hns3_tunnel_csum_bug(struct sk_buff *skb) if (!(!skb->encapsulation && (l4.udp->dest == htons(IANA_VXLAN_UDP_PORT) || - l4.udp->dest == htons(GENEVE_UDP_PORT)))) + l4.udp->dest == htons(GENEVE_UDP_PORT) || + l4.udp->dest == htons(4790)))) return false; skb_checksum_help(skb); From b416e872be06fdace3c36cf5210130509d0f0e72 Mon Sep 17 00:00:00 2001 From: Peng Li Date: Fri, 30 Apr 2021 17:06:20 +0800 Subject: [PATCH 0832/1379] net: hns3: use netif_tx_disable to stop the transmit queue Currently, netif_tx_stop_all_queues() is used to ensure that the xmit is not running, but for the concurrent case it will not take effect, since netif_tx_stop_all_queues() just sets a flag without locking to indicate that the xmit queue(s) should not be run. So use netif_tx_disable() to replace netif_tx_stop_all_queues(), it takes the xmit queue lock while marking the queue stopped. Fixes: 76ad4f0ee747 ("net: hns3: Add support of HNS3 Ethernet Driver for hip08 SoC") Signed-off-by: Peng Li Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index a2b9a8af0413..783fdaf8f8d6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -575,8 +575,8 @@ static int hns3_nic_net_stop(struct net_device *netdev) if (h->ae_algo->ops->set_timer_task) h->ae_algo->ops->set_timer_task(priv->ae_handle, false); - netif_tx_stop_all_queues(netdev); netif_carrier_off(netdev); + netif_tx_disable(netdev); hns3_nic_net_down(netdev); From 8c9200e387721c597baabb319b4bd1cdf1155e35 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Fri, 30 Apr 2021 17:06:21 +0800 Subject: [PATCH 0833/1379] net: hns3: clear unnecessary reset request in hclge_reset_rebuild HW error and global reset are reported through MSIX interrupts. The same error may be reported to different functions at the same time. When global reset begins, the pending reset request set by this error is unnecessary. So clear the pending reset request after the reset is complete to avoid the repeated reset. Fixes: f6162d44126c ("net: hns3: add handling of hw errors reported through MSIX") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index c296ab64fb0a..6304aed49f22 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3978,6 +3978,12 @@ static void hclge_update_reset_level(struct hclge_dev *hdev) struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); enum hnae3_reset_type reset_level; + /* reset request will not be set during reset, so clear + * pending reset request to avoid unnecessary reset + * caused by the same reason. + */ + hclge_get_reset_level(ae_dev, &hdev->reset_request); + /* if default_reset_request has a higher level reset request, * it should be handled as soon as possible. since some errors * need this kind of reset to fix. From 472497d0bdae890a896013332a0b673f9acdf2bf Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Fri, 30 Apr 2021 17:06:22 +0800 Subject: [PATCH 0834/1379] net: hns3: disable phy loopback setting in hclge_mac_start_phy If selftest and reset are performed at the same time, the phy loopback setting may be still in enable state after the reset, and device cannot link up. So fix this issue by disabling phy loopback before phy_start(). Fixes: 256727da7395 ("net: hns3: Add MDIO support to HNS3 Ethernet driver for hip08 SoC") Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c index 08e88d9422cd..1231c34f0949 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c @@ -255,6 +255,8 @@ void hclge_mac_start_phy(struct hclge_dev *hdev) if (!phydev) return; + phy_loopback(phydev, false); + phy_start(phydev); } From f0a5818b472c574a985cfeb6518a5ba395f26b3c Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 30 Apr 2021 17:27:34 +0800 Subject: [PATCH 0835/1379] vsock/vmci: Remove redundant assignment to err Variable 'err' is set to zero but this value is never read as it is overwritten with a new value later on, hence it is a redundant assignment and can be removed. Clean up the following clang-analyzer warning: net/vmw_vsock/vmci_transport.c:948:2: warning: Value stored to 'err' is never read [clang-analyzer-deadcode.DeadStores] Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 1c9ecb18b8e6..c99bc4ce78e2 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -944,8 +944,6 @@ static int vmci_transport_recv_listen(struct sock *sk, bool old_request = false; bool old_pkt_proto = false; - err = 0; - /* Because we are in the listen state, we could be receiving a packet * for ourself or any previous connection requests that we received. * If it's the latter, we try to find a socket in our list of pending From a57d3d48366b9068195d01f9ef97844d5ee14f73 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 30 Apr 2021 14:11:42 +0200 Subject: [PATCH 0836/1379] net: atheros: nic-devel@qualcomm.com is dead Remove it from the MODULE_AUTHOR statements referencing it. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/alx/main.c | 2 +- drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c index 9e02f8864593..b3d74332ed33 100644 --- a/drivers/net/ethernet/atheros/alx/main.c +++ b/drivers/net/ethernet/atheros/alx/main.c @@ -2016,7 +2016,7 @@ static struct pci_driver alx_driver = { module_pci_driver(alx_driver); MODULE_DEVICE_TABLE(pci, alx_pci_tbl); MODULE_AUTHOR("Johannes Berg "); -MODULE_AUTHOR("Qualcomm Corporation, "); +MODULE_AUTHOR("Qualcomm Corporation"); MODULE_DESCRIPTION( "Qualcomm Atheros(R) AR816x/AR817x PCI-E Ethernet Network Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c index 1d17c24e6d75..c6263cf8d3c0 100644 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c @@ -32,7 +32,7 @@ static const struct pci_device_id atl1c_pci_tbl[] = { MODULE_DEVICE_TABLE(pci, atl1c_pci_tbl); MODULE_AUTHOR("Jie Yang"); -MODULE_AUTHOR("Qualcomm Atheros Inc., "); +MODULE_AUTHOR("Qualcomm Atheros Inc."); MODULE_DESCRIPTION("Qualcomm Atheros 100/1000M Ethernet Network Driver"); MODULE_LICENSE("GPL"); From c5197b4ec932f34934944859ca78086bd910edc9 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Fri, 30 Apr 2021 14:50:09 -0300 Subject: [PATCH 0837/1379] afs, rxrpc: Add Marc Dionne as co-maintainer Add Marc Dionne as a co-maintainer for kafs and rxrpc. Signed-off-by: Marc Dionne Signed-off-by: David S. Miller --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 86c11ad3c502..4a8ddad3c652 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -624,6 +624,7 @@ F: fs/affs/ AFS FILESYSTEM M: David Howells +M: Marc Dionne L: linux-afs@lists.infradead.org S: Supported W: https://www.infradead.org/~dhowells/kafs/ @@ -15792,6 +15793,7 @@ F: drivers/infiniband/ulp/rtrs/ RXRPC SOCKETS (AF_RXRPC) M: David Howells +M: Marc Dionne L: linux-afs@lists.infradead.org S: Supported W: https://www.infradead.org/~dhowells/kafs/ From 35b4f24415c854cd718ccdf38dbea6297f010aae Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 1 May 2021 04:02:58 +0800 Subject: [PATCH 0838/1379] sctp: do asoc update earlier in sctp_sf_do_dupcook_a There's a panic that occurs in a few of envs, the call trace is as below: [] general protection fault, ... 0x29acd70f1000a: 0000 [#1] SMP PTI [] RIP: 0010:sctp_ulpevent_notify_peer_addr_change+0x4b/0x1fa [sctp] [] sctp_assoc_control_transport+0x1b9/0x210 [sctp] [] sctp_do_8_2_transport_strike.isra.16+0x15c/0x220 [sctp] [] sctp_cmd_interpreter.isra.21+0x1231/0x1a10 [sctp] [] sctp_do_sm+0xc3/0x2a0 [sctp] [] sctp_generate_timeout_event+0x81/0xf0 [sctp] This is caused by a transport use-after-free issue. When processing a duplicate COOKIE-ECHO chunk in sctp_sf_do_dupcook_a(), both COOKIE-ACK and SHUTDOWN chunks are allocated with the transort from the new asoc. However, later in the sideeffect machine, the old asoc is used to send them out and old asoc's shutdown_last_sent_to is set to the transport that SHUTDOWN chunk attached to in sctp_cmd_setup_t2(), which actually belongs to the new asoc. After the new_asoc is freed and the old asoc T2 timeout, the old asoc's shutdown_last_sent_to that is already freed would be accessed in sctp_sf_t2_timer_expire(). Thanks Alexander and Jere for helping dig into this issue. To fix it, this patch is to do the asoc update first, then allocate the COOKIE-ACK and SHUTDOWN chunks with the 'updated' old asoc. This would make more sense, as a chunk from an asoc shouldn't be sent out with another asoc. We had fixed quite a few issues caused by this. Fixes: 145cb2f7177d ("sctp: Fix bundling of SHUTDOWN with COOKIE-ACK") Reported-by: Alexander Sverdlin Reported-by: syzbot+bbe538efd1046586f587@syzkaller.appspotmail.com Reported-by: Michal Tesar Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 7632714c1e5b..30cb94696843 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1852,20 +1852,35 @@ static enum sctp_disposition sctp_sf_do_dupcook_a( SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL()); - repl = sctp_make_cookie_ack(new_asoc, chunk); + /* Update the content of current association. */ + if (sctp_assoc_update((struct sctp_association *)asoc, new_asoc)) { + struct sctp_chunk *abort; + + abort = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr)); + if (abort) { + sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + } + sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_RSRC_LOW)); + SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB); + goto nomem; + } + + repl = sctp_make_cookie_ack(asoc, chunk); if (!repl) goto nomem; /* Report association restart to upper layer. */ ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_RESTART, 0, - new_asoc->c.sinit_num_ostreams, - new_asoc->c.sinit_max_instreams, + asoc->c.sinit_num_ostreams, + asoc->c.sinit_max_instreams, NULL, GFP_ATOMIC); if (!ev) goto nomem_ev; - /* Update the content of current association. */ - sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); if ((sctp_state(asoc, SHUTDOWN_PENDING) || sctp_state(asoc, SHUTDOWN_SENT)) && From 7e9269a5acec6d841d22e12770a0b02db4f5d8f2 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 1 May 2021 04:02:59 +0800 Subject: [PATCH 0839/1379] Revert "sctp: Fix bundling of SHUTDOWN with COOKIE-ACK" This can be reverted as shutdown and cookie_ack chunk are using the same asoc since the last patch. This reverts commit 145cb2f7177d94bc54563ed26027e952ee0ae03c. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 30cb94696843..e8ccc4e8b086 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1892,7 +1892,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_a( */ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, - SCTP_ST_CHUNK(0), repl, + SCTP_ST_CHUNK(0), NULL, commands); } else { sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, @@ -5536,7 +5536,7 @@ enum sctp_disposition sctp_sf_do_9_2_start_shutdown( * in the Cumulative TSN Ack field the last sequential TSN it * has received from the peer. */ - reply = sctp_make_shutdown(asoc, arg); + reply = sctp_make_shutdown(asoc, NULL); if (!reply) goto nomem; @@ -6134,7 +6134,7 @@ enum sctp_disposition sctp_sf_autoclose_timer_expire( disposition = SCTP_DISPOSITION_CONSUME; if (sctp_outq_is_empty(&asoc->outqueue)) { disposition = sctp_sf_do_9_2_start_shutdown(net, ep, asoc, type, - NULL, commands); + arg, commands); } return disposition; From 51eac7f2f06b5f60d22dfb06c48d98a227507b8e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 1 May 2021 04:03:00 +0800 Subject: [PATCH 0840/1379] sctp: do asoc update earlier in sctp_sf_do_dupcook_b The same thing should be done for sctp_sf_do_dupcook_b(). Meanwhile, SCTP_CMD_UPDATE_ASSOC cmd can be removed. v1->v2: - Fix the return value in sctp_sf_do_assoc_update(). Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/command.h | 1 - net/sctp/sm_sideeffect.c | 26 --------------------- net/sctp/sm_statefuns.c | 47 ++++++++++++++++++++++++-------------- 3 files changed, 30 insertions(+), 44 deletions(-) diff --git a/include/net/sctp/command.h b/include/net/sctp/command.h index e8df72e1627a..5e848884ff61 100644 --- a/include/net/sctp/command.h +++ b/include/net/sctp/command.h @@ -68,7 +68,6 @@ enum sctp_verb { SCTP_CMD_ASSOC_FAILED, /* Handle association failure. */ SCTP_CMD_DISCARD_PACKET, /* Discard the whole packet. */ SCTP_CMD_GEN_SHUTDOWN, /* Generate a SHUTDOWN chunk. */ - SCTP_CMD_UPDATE_ASSOC, /* Update association information. */ SCTP_CMD_PURGE_OUTQUEUE, /* Purge all data waiting to be sent. */ SCTP_CMD_SETUP_T2, /* Hi-level, setup T2-shutdown parms. */ SCTP_CMD_RTO_PENDING, /* Set transport's rto_pending. */ diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 0948f14ce221..ce15d590a615 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -826,28 +826,6 @@ static void sctp_cmd_setup_t2(struct sctp_cmd_seq *cmds, asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto; } -static void sctp_cmd_assoc_update(struct sctp_cmd_seq *cmds, - struct sctp_association *asoc, - struct sctp_association *new) -{ - struct net *net = asoc->base.net; - struct sctp_chunk *abort; - - if (!sctp_assoc_update(asoc, new)) - return; - - abort = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr)); - if (abort) { - sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0); - sctp_add_cmd_sf(cmds, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); - } - sctp_add_cmd_sf(cmds, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED)); - sctp_add_cmd_sf(cmds, SCTP_CMD_ASSOC_FAILED, - SCTP_PERR(SCTP_ERROR_RSRC_LOW)); - SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS); - SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB); -} - /* Helper function to change the state of an association. */ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, @@ -1301,10 +1279,6 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, sctp_endpoint_add_asoc(ep, asoc); break; - case SCTP_CMD_UPDATE_ASSOC: - sctp_cmd_assoc_update(commands, asoc, cmd->obj.asoc); - break; - case SCTP_CMD_PURGE_OUTQUEUE: sctp_outq_teardown(&asoc->outqueue); break; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index e8ccc4e8b086..a42844904b81 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1773,6 +1773,30 @@ enum sctp_disposition sctp_sf_do_5_2_3_initack( return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); } +static int sctp_sf_do_assoc_update(struct sctp_association *asoc, + struct sctp_association *new, + struct sctp_cmd_seq *cmds) +{ + struct net *net = asoc->base.net; + struct sctp_chunk *abort; + + if (!sctp_assoc_update(asoc, new)) + return 0; + + abort = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr)); + if (abort) { + sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0); + sctp_add_cmd_sf(cmds, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + } + sctp_add_cmd_sf(cmds, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED)); + sctp_add_cmd_sf(cmds, SCTP_CMD_ASSOC_FAILED, + SCTP_PERR(SCTP_ERROR_RSRC_LOW)); + SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB); + + return -ENOMEM; +} + /* Unexpected COOKIE-ECHO handler for peer restart (Table 2, action 'A') * * Section 5.2.4 @@ -1853,21 +1877,8 @@ static enum sctp_disposition sctp_sf_do_dupcook_a( sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_ASCONF_QUEUE, SCTP_NULL()); /* Update the content of current association. */ - if (sctp_assoc_update((struct sctp_association *)asoc, new_asoc)) { - struct sctp_chunk *abort; - - abort = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr)); - if (abort) { - sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, 0); - sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); - } - sctp_add_cmd_sf(commands, SCTP_CMD_SET_SK_ERR, SCTP_ERROR(ECONNABORTED)); - sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, - SCTP_PERR(SCTP_ERROR_RSRC_LOW)); - SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS); - SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB); + if (sctp_sf_do_assoc_update((struct sctp_association *)asoc, new_asoc, commands)) goto nomem; - } repl = sctp_make_cookie_ack(asoc, chunk); if (!repl) @@ -1940,14 +1951,16 @@ static enum sctp_disposition sctp_sf_do_dupcook_b( if (!sctp_auth_chunk_verify(net, chunk, new_asoc)) return SCTP_DISPOSITION_DISCARD; - /* Update the content of current association. */ - sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_ESTABLISHED)); SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB); sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); - repl = sctp_make_cookie_ack(new_asoc, chunk); + /* Update the content of current association. */ + if (sctp_sf_do_assoc_update((struct sctp_association *)asoc, new_asoc, commands)) + goto nomem; + + repl = sctp_make_cookie_ack(asoc, chunk); if (!repl) goto nomem; From 8385b1f0ad0d86b99476de654623effdcb6ac2a2 Mon Sep 17 00:00:00 2001 From: Maxim Kochetkov Date: Fri, 30 Apr 2021 07:57:33 +0300 Subject: [PATCH 0841/1379] net: phy: marvell: enable downshift by default A number of PHYs support the PHY tunable to set and get downshift. However, only 88E1116R enables downshift by default. Extend this default enabled to all the PHYs that support the downshift tunable. Signed-off-by: Maxim Kochetkov Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 62 +++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 12 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 0b2cccb0d865..e6721c1c26c2 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -1088,6 +1088,38 @@ static int m88e1011_set_tunable(struct phy_device *phydev, } } +static int m88e1112_config_init(struct phy_device *phydev) +{ + int err; + + err = m88e1011_set_downshift(phydev, 3); + if (err < 0) + return err; + + return m88e1111_config_init(phydev); +} + +static int m88e1111gbe_config_init(struct phy_device *phydev) +{ + int err; + + err = m88e1111_set_downshift(phydev, 3); + if (err < 0) + return err; + + return m88e1111_config_init(phydev); +} + +static int marvell_1011gbe_config_init(struct phy_device *phydev) +{ + int err; + + err = m88e1011_set_downshift(phydev, 3); + if (err < 0) + return err; + + return marvell_config_init(phydev); +} static int m88e1116r_config_init(struct phy_device *phydev) { int err; @@ -1168,6 +1200,9 @@ static int m88e1510_config_init(struct phy_device *phydev) if (err < 0) return err; } + err = m88e1011_set_downshift(phydev, 3); + if (err < 0) + return err; return m88e1318_config_init(phydev); } @@ -1320,6 +1355,9 @@ static int m88e1145_config_init(struct phy_device *phydev) if (err < 0) return err; } + err = m88e1111_set_downshift(phydev, 3); + if (err < 0) + return err; err = marvell_of_reg_init(phydev); if (err < 0) @@ -2698,7 +2736,7 @@ static struct phy_driver marvell_drivers[] = { .name = "Marvell 88E1112", /* PHY_GBIT_FEATURES */ .probe = marvell_probe, - .config_init = m88e1111_config_init, + .config_init = m88e1112_config_init, .config_aneg = marvell_config_aneg, .config_intr = marvell_config_intr, .handle_interrupt = marvell_handle_interrupt, @@ -2718,7 +2756,7 @@ static struct phy_driver marvell_drivers[] = { .name = "Marvell 88E1111", /* PHY_GBIT_FEATURES */ .probe = marvell_probe, - .config_init = m88e1111_config_init, + .config_init = m88e1111gbe_config_init, .config_aneg = m88e1111_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, @@ -2739,7 +2777,7 @@ static struct phy_driver marvell_drivers[] = { .name = "Marvell 88E1111 (Finisar)", /* PHY_GBIT_FEATURES */ .probe = marvell_probe, - .config_init = m88e1111_config_init, + .config_init = m88e1111gbe_config_init, .config_aneg = m88e1111_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, @@ -2779,7 +2817,7 @@ static struct phy_driver marvell_drivers[] = { .driver_data = DEF_MARVELL_HWMON_OPS(m88e1121_hwmon_ops), /* PHY_GBIT_FEATURES */ .probe = marvell_probe, - .config_init = marvell_config_init, + .config_init = marvell_1011gbe_config_init, .config_aneg = m88e1121_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, @@ -2859,7 +2897,7 @@ static struct phy_driver marvell_drivers[] = { .name = "Marvell 88E1240", /* PHY_GBIT_FEATURES */ .probe = marvell_probe, - .config_init = m88e1111_config_init, + .config_init = m88e1112_config_init, .config_aneg = marvell_config_aneg, .config_intr = marvell_config_intr, .handle_interrupt = marvell_handle_interrupt, @@ -2929,7 +2967,7 @@ static struct phy_driver marvell_drivers[] = { /* PHY_GBIT_FEATURES */ .flags = PHY_POLL_CABLE_TEST, .probe = marvell_probe, - .config_init = marvell_config_init, + .config_init = marvell_1011gbe_config_init, .config_aneg = m88e1510_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, @@ -2955,7 +2993,7 @@ static struct phy_driver marvell_drivers[] = { .probe = marvell_probe, /* PHY_GBIT_FEATURES */ .flags = PHY_POLL_CABLE_TEST, - .config_init = marvell_config_init, + .config_init = marvell_1011gbe_config_init, .config_aneg = m88e1510_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, @@ -3000,7 +3038,7 @@ static struct phy_driver marvell_drivers[] = { /* PHY_GBIT_FEATURES */ .flags = PHY_POLL_CABLE_TEST, .probe = marvell_probe, - .config_init = marvell_config_init, + .config_init = marvell_1011gbe_config_init, .config_aneg = m88e6390_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, @@ -3026,7 +3064,7 @@ static struct phy_driver marvell_drivers[] = { /* PHY_GBIT_FEATURES */ .flags = PHY_POLL_CABLE_TEST, .probe = marvell_probe, - .config_init = marvell_config_init, + .config_init = marvell_1011gbe_config_init, .config_aneg = m88e6390_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, @@ -3052,7 +3090,7 @@ static struct phy_driver marvell_drivers[] = { /* PHY_GBIT_FEATURES */ .flags = PHY_POLL_CABLE_TEST, .probe = marvell_probe, - .config_init = marvell_config_init, + .config_init = marvell_1011gbe_config_init, .config_aneg = m88e1510_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, @@ -3077,7 +3115,7 @@ static struct phy_driver marvell_drivers[] = { .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops), .probe = marvell_probe, /* PHY_GBIT_FEATURES */ - .config_init = marvell_config_init, + .config_init = marvell_1011gbe_config_init, .config_aneg = m88e1510_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, @@ -3099,7 +3137,7 @@ static struct phy_driver marvell_drivers[] = { .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops), .probe = marvell_probe, .features = PHY_GBIT_FIBRE_FEATURES, - .config_init = marvell_config_init, + .config_init = marvell_1011gbe_config_init, .config_aneg = m88e1510_config_aneg, .read_status = marvell_read_status, .config_intr = marvell_config_intr, From f18c51b6513c6bd39c834855e3ccaec52c150c84 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Fri, 30 Apr 2021 11:10:47 +0800 Subject: [PATCH 0842/1379] net: stmmac: Remove duplicate declaration of stmmac_priv In commit f4da56529da60 ("net: stmmac: Add support for external trigger timestamping"), struct stmmac_priv was declared at line 507 which caused duplicate struct declarations. Remove later duplicate declaration here. Signed-off-by: Wan Jiabing Reviewed-by: Wong Vee Khee Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/hwif.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 2cc91759b91f..6d5e0f2b03ce 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -564,7 +564,6 @@ struct stmmac_mode_ops { #define stmmac_clean_desc3(__priv, __args...) \ stmmac_do_void_callback(__priv, mode, clean_desc3, __args) -struct stmmac_priv; struct tc_cls_u32_offload; struct tc_cbs_qopt_offload; struct flow_cls_offload; From 9c19722c5e1c623f2d7939bdeb74427e9a73c5d5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Mar 2021 23:49:44 +0900 Subject: [PATCH 0843/1379] sh: syscalls: switch to generic syscalltbl.sh Many architectures duplicate similar shell scripts. This commit converts sh to use scripts/syscalltbl.sh. Signed-off-by: Masahiro Yamada --- arch/sh/kernel/syscalls/Makefile | 7 ++---- arch/sh/kernel/syscalls/syscalltbl.sh | 32 --------------------------- 2 files changed, 2 insertions(+), 37 deletions(-) delete mode 100644 arch/sh/kernel/syscalls/syscalltbl.sh diff --git a/arch/sh/kernel/syscalls/Makefile b/arch/sh/kernel/syscalls/Makefile index 285aaba832d9..ad2492cb5568 100644 --- a/arch/sh/kernel/syscalls/Makefile +++ b/arch/sh/kernel/syscalls/Makefile @@ -7,7 +7,7 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ @@ -16,10 +16,7 @@ quiet_cmd_syshdr = SYSHDR $@ '$(syshdr_offset_$(basetarget))' quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ - '$(systbl_abis_$(basetarget))' \ - '$(systbl_abi_$(basetarget))' \ - '$(systbl_offset_$(basetarget))' + cmd_systbl = $(CONFIG_SHELL) $(systbl) $< $@ $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) diff --git a/arch/sh/kernel/syscalls/syscalltbl.sh b/arch/sh/kernel/syscalls/syscalltbl.sh deleted file mode 100644 index 904b8e6e625d..000000000000 --- a/arch/sh/kernel/syscalls/syscalltbl.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -my_abi="$4" -offset="$5" - -emit() { - t_nxt="$1" - t_nr="$2" - t_entry="$3" - - while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" - t_nxt=$((t_nxt+1)) - done - printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" -} - -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - nxt=0 - if [ -z "$offset" ]; then - offset=0 - fi - - while read nr abi name entry ; do - emit $((nxt+offset)) $((nr+offset)) $entry - nxt=$((nr+1)) - done -) > "$out" From eb6111495ca94a8c9fa7ca043bd5d5cff9a661f4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Mar 2021 23:49:45 +0900 Subject: [PATCH 0844/1379] sh: syscalls: switch to generic syscallhdr.sh Many architectures duplicate similar shell scripts. This commit converts sh to use scripts/syscallhdr.sh. Signed-off-by: Masahiro Yamada --- arch/sh/kernel/syscalls/Makefile | 7 ++---- arch/sh/kernel/syscalls/syscallhdr.sh | 36 --------------------------- 2 files changed, 2 insertions(+), 41 deletions(-) delete mode 100644 arch/sh/kernel/syscalls/syscallhdr.sh diff --git a/arch/sh/kernel/syscalls/Makefile b/arch/sh/kernel/syscalls/Makefile index ad2492cb5568..6713c65a25e1 100644 --- a/arch/sh/kernel/syscalls/Makefile +++ b/arch/sh/kernel/syscalls/Makefile @@ -6,14 +6,11 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') syscall := $(src)/syscall.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh +syshdr := $(srctree)/scripts/syscallhdr.sh systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abis_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr $< $@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) $(systbl) $< $@ diff --git a/arch/sh/kernel/syscalls/syscallhdr.sh b/arch/sh/kernel/syscalls/syscallhdr.sh deleted file mode 100644 index 4c0519861e97..000000000000 --- a/arch/sh/kernel/syscalls/syscallhdr.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_UAPI_ASM_SH_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - printf "#ifndef %s\n" "${fileguard}" - printf "#define %s\n" "${fileguard}" - printf "\n" - - nxt=0 - while read nr abi name entry ; do - if [ -z "$offset" ]; then - printf "#define __NR_%s%s\t%s\n" \ - "${prefix}" "${name}" "${nr}" - else - printf "#define __NR_%s%s\t(%s + %s)\n" \ - "${prefix}" "${name}" "${offset}" "${nr}" - fi - nxt=$((nr+1)) - done - - printf "\n" - printf "#ifdef __KERNEL__\n" - printf "#define __NR_syscalls\t%s\n" "${nxt}" - printf "#endif\n" - printf "\n" - printf "#endif /* %s */\n" "${fileguard}" -) > "$out" From 5ad4e94b46a618f333a6b1a34ee391c8a6bb40b2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Mar 2021 23:51:01 +0900 Subject: [PATCH 0845/1379] sparc: syscalls: switch to generic syscalltbl.sh Many architectures duplicate similar shell scripts. This commit converts sparc to use scripts/syscalltbl.sh. This also unifies syscall_table_64.h and syscall_table_c32.h. Signed-off-by: Masahiro Yamada --- arch/sparc/include/asm/Kbuild | 1 - arch/sparc/kernel/syscalls/Makefile | 19 ++++--------- arch/sparc/kernel/syscalls/syscalltbl.sh | 36 ------------------------ arch/sparc/kernel/systbls_32.S | 4 +-- arch/sparc/kernel/systbls_64.S | 8 ++++-- 5 files changed, 12 insertions(+), 56 deletions(-) delete mode 100644 arch/sparc/kernel/syscalls/syscalltbl.sh diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index aec20406145e..0b9d98ced34a 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 generated-y += syscall_table_32.h generated-y += syscall_table_64.h -generated-y += syscall_table_c32.h generic-y += export.h generic-y += kvm_para.h generic-y += mcs_spinlock.h diff --git a/arch/sparc/kernel/syscalls/Makefile b/arch/sparc/kernel/syscalls/Makefile index 283f64407b07..11424f1c8d9e 100644 --- a/arch/sparc/kernel/syscalls/Makefile +++ b/arch/sparc/kernel/syscalls/Makefile @@ -7,7 +7,7 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ syscall := $(src)/syscall.tbl syshdr := $(srctree)/$(src)/syscallhdr.sh -systbl := $(srctree)/$(src)/syscalltbl.sh +systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ @@ -16,10 +16,7 @@ quiet_cmd_syshdr = SYSHDR $@ '$(syshdr_offset_$(basetarget))' quiet_cmd_systbl = SYSTBL $@ - cmd_systbl = $(CONFIG_SHELL) '$(systbl)' '$<' '$@' \ - '$(systbl_abis_$(basetarget))' \ - '$(systbl_abi_$(basetarget))' \ - '$(systbl_offset_$(basetarget))' + cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ syshdr_abis_unistd_32 := common,32 $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE @@ -29,23 +26,17 @@ syshdr_abis_unistd_64 := common,64 $(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -systbl_abis_syscall_table_32 := common,32 +$(kapi)/syscall_table_32.h: abis := common,32 $(kapi)/syscall_table_32.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_64 := common,64 +$(kapi)/syscall_table_64.h: abis := common,64 $(kapi)/syscall_table_64.h: $(syscall) $(systbl) FORCE $(call if_changed,systbl) -systbl_abis_syscall_table_c32 := common,32 -systbl_abi_syscall_table_c32 := c32 -$(kapi)/syscall_table_c32.h: $(syscall) $(systbl) FORCE - $(call if_changed,systbl) - uapisyshdr-y += unistd_32.h unistd_64.h kapisyshdr-y += syscall_table_32.h \ - syscall_table_64.h \ - syscall_table_c32.h + syscall_table_64.h uapisyshdr-y := $(addprefix $(uapi)/, $(uapisyshdr-y)) kapisyshdr-y := $(addprefix $(kapi)/, $(kapisyshdr-y)) diff --git a/arch/sparc/kernel/syscalls/syscalltbl.sh b/arch/sparc/kernel/syscalls/syscalltbl.sh deleted file mode 100644 index 77cf0143ba19..000000000000 --- a/arch/sparc/kernel/syscalls/syscalltbl.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -my_abi="$4" -offset="$5" - -emit() { - t_nxt="$1" - t_nr="$2" - t_entry="$3" - - while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s, sys_nis_syscall, )\n" "${t_nxt}" - t_nxt=$((t_nxt+1)) - done - printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}" -} - -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - nxt=0 - if [ -z "$offset" ]; then - offset=0 - fi - - while read nr abi name entry compat ; do - if [ "$my_abi" = "c32" ] && [ ! -z "$compat" ]; then - emit $((nxt+offset)) $((nr+offset)) $compat - else - emit $((nxt+offset)) $((nr+offset)) $entry - fi - nxt=$((nr+1)) - done -) > "$out" diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index ab9e4d57685a..3aaffa017706 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -9,10 +9,10 @@ * Copyright (C) 1995 Adrian M. Rodriguez (adrian@remus.rutgers.edu) */ -#define __SYSCALL(nr, entry, nargs) .long entry +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) +#define __SYSCALL(nr, entry) .long entry .data .align 4 .globl sys_call_table sys_call_table: #include /* 32-bit native syscalls */ -#undef __SYSCALL diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index a27394bf7d7f..398fe449dd34 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -10,18 +10,20 @@ * Copyright (C) 1995 Adrian M. Rodriguez (adrian@remus.rutgers.edu) */ -#define __SYSCALL(nr, entry, nargs) .word entry +#define __SYSCALL(nr, entry) .word entry .text .align 4 #ifdef CONFIG_COMPAT .globl sys_call_table32 sys_call_table32: -#include /* Compat syscalls */ +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) +#include /* Compat syscalls */ +#undef __SYSCALL_WITH_COMPAT #endif /* CONFIG_COMPAT */ .align 4 .globl sys_call_table64, sys_call_table sys_call_table64: sys_call_table: +#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) #include /* 64-bit native syscalls */ -#undef __SYSCALL From c5849b7c206bf36b8ce7079d4777e0a59305ccce Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 1 Mar 2021 23:51:02 +0900 Subject: [PATCH 0846/1379] sparc: syscalls: switch to generic syscallshdr.sh Many architectures duplicate similar shell scripts. This commit converts sparc to use scripts/syscallhdr.sh. Signed-off-by: Masahiro Yamada --- arch/sparc/kernel/syscalls/Makefile | 11 +++----- arch/sparc/kernel/syscalls/syscallhdr.sh | 36 ------------------------ 2 files changed, 4 insertions(+), 43 deletions(-) delete mode 100644 arch/sparc/kernel/syscalls/syscallhdr.sh diff --git a/arch/sparc/kernel/syscalls/Makefile b/arch/sparc/kernel/syscalls/Makefile index 11424f1c8d9e..0f2ea5bcb0d7 100644 --- a/arch/sparc/kernel/syscalls/Makefile +++ b/arch/sparc/kernel/syscalls/Makefile @@ -6,23 +6,20 @@ _dummy := $(shell [ -d '$(uapi)' ] || mkdir -p '$(uapi)') \ $(shell [ -d '$(kapi)' ] || mkdir -p '$(kapi)') syscall := $(src)/syscall.tbl -syshdr := $(srctree)/$(src)/syscallhdr.sh +syshdr := $(srctree)/scripts/syscallhdr.sh systbl := $(srctree)/scripts/syscalltbl.sh quiet_cmd_syshdr = SYSHDR $@ - cmd_syshdr = $(CONFIG_SHELL) '$(syshdr)' '$<' '$@' \ - '$(syshdr_abis_$(basetarget))' \ - '$(syshdr_pfx_$(basetarget))' \ - '$(syshdr_offset_$(basetarget))' + cmd_syshdr = $(CONFIG_SHELL) $(syshdr) --emit-nr --abis $(abis) $< $@ quiet_cmd_systbl = SYSTBL $@ cmd_systbl = $(CONFIG_SHELL) $(systbl) --abis $(abis) $< $@ -syshdr_abis_unistd_32 := common,32 +$(uapi)/unistd_32.h: abis := common,32 $(uapi)/unistd_32.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) -syshdr_abis_unistd_64 := common,64 +$(uapi)/unistd_64.h: abis := common,64 $(uapi)/unistd_64.h: $(syscall) $(syshdr) FORCE $(call if_changed,syshdr) diff --git a/arch/sparc/kernel/syscalls/syscallhdr.sh b/arch/sparc/kernel/syscalls/syscallhdr.sh deleted file mode 100644 index cf50a75cc0bb..000000000000 --- a/arch/sparc/kernel/syscalls/syscallhdr.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -in="$1" -out="$2" -my_abis=`echo "($3)" | tr ',' '|'` -prefix="$4" -offset="$5" - -fileguard=_UAPI_ASM_SPARC_`basename "$out" | sed \ - -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/' \ - -e 's/[^A-Z0-9_]/_/g' -e 's/__/_/g'` -grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( - printf "#ifndef %s\n" "${fileguard}" - printf "#define %s\n" "${fileguard}" - printf "\n" - - nxt=0 - while read nr abi name entry compat ; do - if [ -z "$offset" ]; then - printf "#define __NR_%s%s\t%s\n" \ - "${prefix}" "${name}" "${nr}" - else - printf "#define __NR_%s%s\t(%s + %s)\n" \ - "${prefix}" "${name}" "${offset}" "${nr}" - fi - nxt=$((nr+1)) - done - - printf "\n" - printf "#ifdef __KERNEL__\n" - printf "#define __NR_syscalls\t%s\n" "${nxt}" - printf "#endif\n" - printf "\n" - printf "#endif /* %s */\n" "${fileguard}" -) > "$out" From 3787b7da5d3e2c849fe8ffed987922a4e6dd6cfd Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 24 Apr 2021 20:55:53 +0900 Subject: [PATCH 0847/1379] kbuild: add comment about why cmd_shipped uses 'cat' cmd_shipped uses 'cat' instead of 'cp' for copying a file. The reason is explained in the commit [1], but it was in the pre-git era. $ touch a $ chmod -w a $ cp a b $ cp a b cp: cannot create regular file 'b': Permission denied Add comments so that you can see the reason without looking into the history. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=a70dba8086160449cc94c5bdaff78419b6b8e3c8 Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 64daf37e874b..c0982b8b2b6d 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -252,6 +252,9 @@ quiet_cmd_copy = COPY $@ # Shipped files # =========================================================================== +# 'cp' preserves permissions. If you use it to copy a file in read-only srctree, +# the copy would be read-only as well, leading to an error when executing the +# rule next time. Use 'cat' instead in order to generate a writable file. quiet_cmd_shipped = SHIPPED $@ cmd_shipped = cat $< > $@ From 382243f346416f5ed14cc2517d8a3947bf25d628 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 24 Apr 2021 21:08:29 +0900 Subject: [PATCH 0848/1379] genksyms: fix stale comment (shipped source) is a stale comment. Since commit 833e62245943 ("genksyms: generate lexer and parser during build instead of shipping"), there is no source file to be shipped in this directory. Signed-off-by: Masahiro Yamada --- scripts/genksyms/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/genksyms/Makefile b/scripts/genksyms/Makefile index ce4f99935de5..d6a422a63b6a 100644 --- a/scripts/genksyms/Makefile +++ b/scripts/genksyms/Makefile @@ -22,7 +22,7 @@ $(obj)/pars%.tab.c $(obj)/pars%.tab.h: $(src)/pars%.y FORCE endif -# -I needed for generated C source (shipped source) +# -I needed for generated C source to include headers in source tree HOSTCFLAGS_parse.tab.o := -I $(srctree)/$(src) HOSTCFLAGS_lex.lex.o := -I $(srctree)/$(src) From 5134e94ac4f5e58d73f39fde8ee6735b47f5c63d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 25 Apr 2021 04:47:19 +0900 Subject: [PATCH 0849/1379] usr/include: refactor .gitignore The current .gitignore intends to ignore everything under usr/include/ except .gitignore and Makefile. A cleaner solution is to use a pattern suffixed with '/', which matches only directories. It works well here because all the exported headers are located in sub-directories, like , . Signed-off-by: Masahiro Yamada --- usr/include/.gitignore | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/usr/include/.gitignore b/usr/include/.gitignore index d2fab782cb7d..17b0ba1bd325 100644 --- a/usr/include/.gitignore +++ b/usr/include/.gitignore @@ -1,4 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -* -!.gitignore -!Makefile +/*/ From 1fca37660326b3c7a310e35768cf554425dd7f64 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 25 Apr 2021 15:24:03 +0900 Subject: [PATCH 0850/1379] kernel/.gitgnore: remove stale timeconst.h and hz.bc timeconst.h and hz.bc used to exist in kernel/. Commit 5cee96459726 ("time/timers: Move all time(r) related files into kernel/time") moved them to kernel/time/. Commit 0a227985d4a9 ("time: Move timeconst.h into include/generated") moved timeconst.h to include/generated/ and removed hz.bc . Signed-off-by: Masahiro Yamada --- kernel/.gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/.gitignore b/kernel/.gitignore index 78701ea37c97..4abc4e033ed8 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,4 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only kheaders.md5 -timeconst.h -hz.bc From 819cb9fc80733e346f3f913293c0a70e00a61d33 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 25 Apr 2021 15:24:04 +0900 Subject: [PATCH 0851/1379] .gitignore: move tags and TAGS close to other tag files For consistency, move tags and TAGS close to the cscope and GNU Global patterns. I removed the '/' prefix in case somebody wants to manually create tag files in sub-directories. Signed-off-by: Masahiro Yamada --- .gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index df8d3146a43f..deb68b7555ad 100644 --- a/.gitignore +++ b/.gitignore @@ -54,8 +54,6 @@ modules.order # # Top-level generic files # -/tags -/TAGS /linux /modules-only.symvers /vmlinux @@ -114,6 +112,10 @@ patches-* patches series +# ctags files +tags +TAGS + # cscope files cscope.* ncscope.* From 40cb020305f40bafc2a13c7e879a33dbbd607507 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 25 Apr 2021 15:24:05 +0900 Subject: [PATCH 0852/1379] .gitignore: ignore only top-level modules.builtin modules.builtin used to be created in every directory. Since commit 8b41fc4454e3 ("kbuild: create modules.builtin without Makefile.modbuiltin or tristate.conf"), modules.builtin is created only in the top directory. Add the '/' prefix so that it matches to only the modules.builtin located in the top directory. It has been more than one year since that change. I hope this will not flood 'Untracked files' of 'git status'. Signed-off-by: Masahiro Yamada --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index deb68b7555ad..7afd412dadd2 100644 --- a/.gitignore +++ b/.gitignore @@ -48,7 +48,6 @@ *.xz *.zst Module.symvers -modules.builtin modules.order # @@ -64,6 +63,7 @@ modules.order /vmlinuz /System.map /Module.markers +/modules.builtin /modules.builtin.modinfo /modules.nsdeps From 46b41d5dd8019b264717978c39c43313a524d033 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 25 Apr 2021 15:24:07 +0900 Subject: [PATCH 0853/1379] kbuild: update config_data.gz only when the content of .config is changed If the timestamp of the .config file is updated, config_data.gz is regenerated, then vmlinux is re-linked. This occurs even if the content of the .config has not changed at all. This issue was mitigated by commit 67424f61f813 ("kconfig: do not write .config if the content is the same"); Kconfig does not update the .config when it ends up with the identical configuration. The issue is remaining when the .config is created by *_defconfig with some config fragment(s) applied on top. This is typical for powerpc and mips, where several *_defconfig targets are constructed by using merge_config.sh. One workaround is to have the copy of the .config. The filechk rule updates the copy, kernel/config_data, by checking the content instead of the timestamp. With this commit, the second run with the same configuration avoids the needless rebuilds. $ make ARCH=mips defconfig all [ snip ] $ make ARCH=mips defconfig all *** Default configuration is based on target '32r2el_defconfig' Using ./arch/mips/configs/generic_defconfig as base Merging arch/mips/configs/generic/32r2.config Merging arch/mips/configs/generic/el.config Merging ./arch/mips/configs/generic/board-boston.config Merging ./arch/mips/configs/generic/board-ni169445.config Merging ./arch/mips/configs/generic/board-ocelot.config Merging ./arch/mips/configs/generic/board-ranchu.config Merging ./arch/mips/configs/generic/board-sead-3.config Merging ./arch/mips/configs/generic/board-xilfpga.config # # configuration written to .config # SYNC include/config/auto.conf CALL scripts/checksyscalls.sh CALL scripts/atomic/check-atomics.sh CHK include/generated/compile.h CHK include/generated/autoksyms.h Reported-by: Elliot Berman Signed-off-by: Masahiro Yamada --- kernel/.gitignore | 1 + kernel/Makefile | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/.gitignore b/kernel/.gitignore index 4abc4e033ed8..dafdce3d18c5 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only +/config_data kheaders.md5 diff --git a/kernel/Makefile b/kernel/Makefile index e8a6715f38dc..4df609be42d0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -142,10 +142,15 @@ obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o $(obj)/configs.o: $(obj)/config_data.gz -targets += config_data.gz -$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE +targets += config_data config_data.gz +$(obj)/config_data.gz: $(obj)/config_data FORCE $(call if_changed,gzip) +filechk_cat = cat $< + +$(obj)/config_data: $(KCONFIG_CONFIG) FORCE + $(call filechk,cat) + $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz From 1476fee5c53e24e06cfc436110cdefbc1868e8c1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 25 Apr 2021 16:07:12 +0900 Subject: [PATCH 0854/1379] kbuild: add a script to remove stale generated files We maintain .gitignore and Makefiles so build artifacts are properly ignored by Git, and cleaned up by 'make clean'. However, the code is always changing; generated files are often moved to another directory, or removed when they become unnecessary. Such garbage files tend to be left over in the source tree because people usually git-pull without cleaning the tree. This is not only the noise for 'git status', but also a build issue in some cases. One solution is to remove a stale file like commit 223c24a7dba9 ("kbuild: Automatically remove stale file") did. Such workaround should be removed after a while, but we forget about that if we scatter the workaround code in random places. So, this commit adds a new script to collect cleanings of stale files. As a start point, move the code in arch/arm/boot/compressed/Makefile into this script. Signed-off-by: Masahiro Yamada --- Makefile | 6 +++++- arch/arm/boot/compressed/Makefile | 7 ------- scripts/remove-stale-files | 31 +++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 8 deletions(-) create mode 100755 scripts/remove-stale-files diff --git a/Makefile b/Makefile index 6eafedd7efc6..9f5a3cac63e7 100644 --- a/Makefile +++ b/Makefile @@ -1225,7 +1225,7 @@ PHONY += prepare archprepare archprepare: outputmakefile archheaders archscripts scripts include/config/kernel.release \ asm-generic $(version_h) $(autoksyms_h) include/generated/utsrelease.h \ - include/generated/autoconf.h + include/generated/autoconf.h remove-stale-files prepare0: archprepare $(Q)$(MAKE) $(build)=scripts/mod @@ -1234,6 +1234,10 @@ prepare0: archprepare # All the preparing.. prepare: prepare0 prepare-objtool prepare-resolve_btfids +PHONY += remove-stale-files +remove-stale-files: + $(Q)$(srctree)/scripts/remove-stale-files + # Support for using generic headers in asm-generic asm-generic := -f $(srctree)/scripts/Makefile.asm-generic obj diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index fd94e27ba4fa..182b300e3f8a 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -96,13 +96,6 @@ endif $(foreach o, $(libfdt_objs) atags_to_fdt.o fdt_check_mem_start.o, \ $(eval CFLAGS_$(o) := -I $(srctree)/scripts/dtc/libfdt -fno-stack-protector)) -# These were previously generated C files. When you are building the kernel -# with O=, make sure to remove the stale files in the output tree. Otherwise, -# the build system wrongly compiles the stale ones. -ifdef building_out_of_srctree -$(shell rm -f $(addprefix $(obj)/, fdt_rw.c fdt_ro.c fdt_wip.c fdt.c)) -endif - targets := vmlinux vmlinux.lds piggy_data piggy.o \ lib1funcs.o ashldi3.o bswapsdi2.o \ head.o $(OBJS) diff --git a/scripts/remove-stale-files b/scripts/remove-stale-files new file mode 100755 index 000000000000..c3eb81c3f7de --- /dev/null +++ b/scripts/remove-stale-files @@ -0,0 +1,31 @@ +#!/bin/sh + +set -e + +# When you move, remove or rename generated files, you probably also update +# .gitignore and cleaning rules in the Makefile. This is the right thing +# to do. However, people usually do 'git pull', 'git bisect', etc. without +# running 'make clean'. Then, the stale generated files are left over, often +# causing build issues. +# +# Also, 'git status' shows such stale build artifacts as untracked files. +# What is worse, some people send a wrong patch to get them back to .gitignore +# without checking the commit history. +# +# So, when you (re)move generated files, please move the cleaning rules from +# the Makefile to this script. This is run before Kbuild starts building +# anything, so people will not be annoyed by such garbage files. +# +# This script is not intended to grow endlessly. Rather, it is a temporary scrap +# yard. Stale files stay in this file for a while (for some release cycles?), +# then will be really dead and removed from the code base entirely. + +# These were previously generated source files. When you are building the kernel +# with O=, make sure to remove the stale files in the output tree. Otherwise, +# the build system wrongly compiles the stale ones. +if [ -n "${building_out_of_srctree}" ]; then + for f in fdt_rw.c fdt_ro.c fdt_wip.c fdt.c + do + rm -f arch/arm/boot/compressed/${f} + done +fi From 885480b084696331bea61a4f7eba10652999a9c1 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 28 Apr 2021 18:23:50 -0700 Subject: [PATCH 0855/1379] Makefile: Move -Wno-unused-but-set-variable out of GCC only block Currently, -Wunused-but-set-variable is only supported by GCC so it is disabled unconditionally in a GCC only block (it is enabled with W=1). clang currently has its implementation for this warning in review so preemptively move this statement out of the GCC only block and wrap it with cc-disable-warning so that both compilers function the same. Cc: stable@vger.kernel.org Link: https://reviews.llvm.org/D100581 Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Signed-off-by: Masahiro Yamada --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 9f5a3cac63e7..3597fd75d5b5 100644 --- a/Makefile +++ b/Makefile @@ -792,16 +792,16 @@ KBUILD_CFLAGS += -Wno-gnu KBUILD_CFLAGS += -mno-global-merge else -# These warnings generated too much noise in a regular build. -# Use make W=1 to enable them (see scripts/Makefile.extrawarn) -KBUILD_CFLAGS += -Wno-unused-but-set-variable - # Warn about unmarked fall-throughs in switch statement. # Disabled for clang while comment to attribute conversion happens and # https://github.com/ClangBuiltLinux/linux/issues/636 is discussed. KBUILD_CFLAGS += $(call cc-option,-Wimplicit-fallthrough,) endif +# These warnings generated too much noise in a regular build. +# Use make W=1 to enable them (see scripts/Makefile.extrawarn) +KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable) + KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) ifdef CONFIG_FRAME_POINTER KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls From 77a88274dc1a2cf3a775161d9a3242bc798ee680 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 30 Apr 2021 10:56:27 +0900 Subject: [PATCH 0856/1379] kbuild: replace LANG=C with LC_ALL=C LANG gives a weak default to each LC_* in case it is not explicitly defined. LC_ALL, if set, overrides all other LC_* variables. LANG < LC_CTYPE, LC_COLLATE, LC_MONETARY, LC_NUMERIC, ... < LC_ALL This is why documentation such as [1] suggests to set LC_ALL in build scripts to get the deterministic result. LANG=C is not strong enough to override LC_* that may be set by end users. [1]: https://reproducible-builds.org/docs/locales/ Signed-off-by: Masahiro Yamada Acked-by: Michael Ellerman (powerpc) Reviewed-by: Matthias Maennich Acked-by: Matthieu Baerts (mptcp) Reviewed-by: Greg Kroah-Hartman --- arch/powerpc/boot/wrapper | 2 +- scripts/nsdeps | 2 +- scripts/recordmcount.pl | 2 +- scripts/setlocalversion | 2 +- scripts/tags.sh | 2 +- tools/testing/selftests/net/mptcp/mptcp_connect.sh | 2 +- usr/gen_initramfs.sh | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 41fa0a8715e3..cdb796b76e2e 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -191,7 +191,7 @@ if [ -z "$kernel" ]; then kernel=vmlinux fi -LANG=C elfformat="`${CROSS}objdump -p "$kernel" | grep 'file format' | awk '{print $4}'`" +LC_ALL=C elfformat="`${CROSS}objdump -p "$kernel" | grep 'file format' | awk '{print $4}'`" case "$elfformat" in elf64-powerpcle) format=elf64lppc ;; elf64-powerpc) format=elf32ppc ;; diff --git a/scripts/nsdeps b/scripts/nsdeps index e8ce2a4d704a..04c4b96e95ec 100644 --- a/scripts/nsdeps +++ b/scripts/nsdeps @@ -44,7 +44,7 @@ generate_deps() { for source_file in $mod_source_files; do sed '/MODULE_IMPORT_NS/Q' $source_file > ${source_file}.tmp offset=$(wc -l ${source_file}.tmp | awk '{print $1;}') - cat $source_file | grep MODULE_IMPORT_NS | LANG=C sort -u >> ${source_file}.tmp + cat $source_file | grep MODULE_IMPORT_NS | LC_ALL=C sort -u >> ${source_file}.tmp tail -n +$((offset +1)) ${source_file} | grep -v MODULE_IMPORT_NS >> ${source_file}.tmp if ! diff -q ${source_file} ${source_file}.tmp; then mv ${source_file}.tmp ${source_file} diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 867860ea57da..0a7fc9507d6f 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -497,7 +497,7 @@ sub update_funcs # # Step 2: find the sections and mcount call sites # -open(IN, "LANG=C $objdump -hdr $inputfile|") || die "error running $objdump"; +open(IN, "LC_ALL=C $objdump -hdr $inputfile|") || die "error running $objdump"; my $text; diff --git a/scripts/setlocalversion b/scripts/setlocalversion index bb709eda96cd..db941f6d9591 100755 --- a/scripts/setlocalversion +++ b/scripts/setlocalversion @@ -126,7 +126,7 @@ scm_version() fi # Check for svn and a svn repo. - if rev=$(LANG= LC_ALL= LC_MESSAGES=C svn info 2>/dev/null | grep '^Last Changed Rev'); then + if rev=$(LC_ALL=C svn info 2>/dev/null | grep '^Last Changed Rev'); then rev=$(echo $rev | awk '{print $NF}') printf -- '-svn%s' "$rev" diff --git a/scripts/tags.sh b/scripts/tags.sh index fd96734deff1..db8ba411860a 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -326,5 +326,5 @@ esac # Remove structure forward declarations. if [ -n "$remove_structs" ]; then - LANG=C sed -i -e '/^\([a-zA-Z_][a-zA-Z0-9_]*\)\t.*\t\/\^struct \1;.*\$\/;"\tx$/d' $1 + LC_ALL=C sed -i -e '/^\([a-zA-Z_][a-zA-Z0-9_]*\)\t.*\t\/\^struct \1;.*\$\/;"\tx$/d' $1 fi diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 9236609731b1..3c4cb72ed8a4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -274,7 +274,7 @@ check_mptcp_disabled() ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0 local err=0 - LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ + LC_ALL=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ grep -q "^socket: Protocol not available$" && err=1 ip netns delete ${disabled_ns} diff --git a/usr/gen_initramfs.sh b/usr/gen_initramfs.sh index 8ae831657e5d..63476bb70b41 100755 --- a/usr/gen_initramfs.sh +++ b/usr/gen_initramfs.sh @@ -147,7 +147,7 @@ dir_filelist() { header "$1" srcdir=$(echo "$1" | sed -e 's://*:/:g') - dirlist=$(find "${srcdir}" -printf "%p %m %U %G\n" | LANG=C sort) + dirlist=$(find "${srcdir}" -printf "%p %m %U %G\n" | LC_ALL=C sort) # If $dirlist is only one line, then the directory is empty if [ "$(echo "${dirlist}" | wc -l)" -gt 1 ]; then From 9009b455811b0fa1f6b0adfa94db136984db5a38 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 30 Apr 2021 11:03:08 +0900 Subject: [PATCH 0857/1379] .gitignore: prefix local generated files with a slash The pattern prefixed with '/' matches files in the same directory, but not ones in sub-directories. Signed-off-by: Masahiro Yamada Acked-by: Miguel Ojeda Acked-by: Rob Herring Acked-by: Andra Paraschiv Acked-by: Greg Kroah-Hartman Acked-by: Gabriel Krisman Bertazi --- Documentation/devicetree/bindings/.gitignore | 4 ++-- arch/.gitignore | 4 ++-- certs/.gitignore | 4 ++-- drivers/memory/.gitignore | 2 +- drivers/tty/vt/.gitignore | 6 +++--- fs/unicode/.gitignore | 4 ++-- kernel/.gitignore | 2 +- lib/.gitignore | 10 +++++----- samples/auxdisplay/.gitignore | 2 +- samples/binderfs/.gitignore | 3 ++- samples/connector/.gitignore | 2 +- samples/hidraw/.gitignore | 2 +- samples/mei/.gitignore | 2 +- samples/nitro_enclaves/.gitignore | 2 +- samples/pidfd/.gitignore | 2 +- samples/seccomp/.gitignore | 8 ++++---- samples/timers/.gitignore | 2 +- samples/vfs/.gitignore | 4 ++-- samples/watch_queue/.gitignore | 3 ++- samples/watchdog/.gitignore | 2 +- scripts/.gitignore | 18 +++++++++--------- scripts/basic/.gitignore | 2 +- scripts/dtc/.gitignore | 4 ++-- scripts/gcc-plugins/.gitignore | 2 +- scripts/genksyms/.gitignore | 2 +- scripts/mod/.gitignore | 8 ++++---- usr/.gitignore | 4 ++-- 27 files changed, 56 insertions(+), 54 deletions(-) diff --git a/Documentation/devicetree/bindings/.gitignore b/Documentation/devicetree/bindings/.gitignore index 3a05b99bfa26..a77719968a7e 100644 --- a/Documentation/devicetree/bindings/.gitignore +++ b/Documentation/devicetree/bindings/.gitignore @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only *.example.dts -processed-schema*.yaml -processed-schema*.json +/processed-schema*.yaml +/processed-schema*.json diff --git a/arch/.gitignore b/arch/.gitignore index 4191da401dbb..756c19c34f99 100644 --- a/arch/.gitignore +++ b/arch/.gitignore @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -i386 -x86_64 +/i386/ +/x86_64/ diff --git a/certs/.gitignore b/certs/.gitignore index 6cbd1f1a5837..8c3763f80be3 100644 --- a/certs/.gitignore +++ b/certs/.gitignore @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -x509_certificate_list -x509_revocation_list +/x509_certificate_list +/x509_revocation_list diff --git a/drivers/memory/.gitignore b/drivers/memory/.gitignore index caedc4c7d2db..5e84bee05ef8 100644 --- a/drivers/memory/.gitignore +++ b/drivers/memory/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -ti-emif-asm-offsets.h +/ti-emif-asm-offsets.h diff --git a/drivers/tty/vt/.gitignore b/drivers/tty/vt/.gitignore index 3ecf42234d89..0221709b177d 100644 --- a/drivers/tty/vt/.gitignore +++ b/drivers/tty/vt/.gitignore @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -conmakehash -consolemap_deftbl.c -defkeymap.c +/conmakehash +/consolemap_deftbl.c +/defkeymap.c diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore index 9b2467e77b2d..361294571ab0 100644 --- a/fs/unicode/.gitignore +++ b/fs/unicode/.gitignore @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -mkutf8data -utf8data.h +/mkutf8data +/utf8data.h diff --git a/kernel/.gitignore b/kernel/.gitignore index dafdce3d18c5..c6b299a6b786 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only /config_data -kheaders.md5 +/kheaders.md5 diff --git a/lib/.gitignore b/lib/.gitignore index 327cb2c7f2c9..5e7fa54c4536 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -gen_crc32table -gen_crc64table -crc32table.h -crc64table.h -oid_registry_data.c +/crc32table.h +/crc64table.h +/gen_crc32table +/gen_crc64table +/oid_registry_data.c diff --git a/samples/auxdisplay/.gitignore b/samples/auxdisplay/.gitignore index 2ed744c0e741..d023816849bd 100644 --- a/samples/auxdisplay/.gitignore +++ b/samples/auxdisplay/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -cfag12864b-example +/cfag12864b-example diff --git a/samples/binderfs/.gitignore b/samples/binderfs/.gitignore index eb60241e8087..8fa415a3640b 100644 --- a/samples/binderfs/.gitignore +++ b/samples/binderfs/.gitignore @@ -1 +1,2 @@ -binderfs_example +# SPDX-License-Identifier: GPL-2.0 +/binderfs_example diff --git a/samples/connector/.gitignore b/samples/connector/.gitignore index d86f2ff9c947..0e26039f39b5 100644 --- a/samples/connector/.gitignore +++ b/samples/connector/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -ucon +/ucon diff --git a/samples/hidraw/.gitignore b/samples/hidraw/.gitignore index d7a6074ebcf9..5233ab63262e 100644 --- a/samples/hidraw/.gitignore +++ b/samples/hidraw/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -hid-example +/hid-example diff --git a/samples/mei/.gitignore b/samples/mei/.gitignore index db5e802f041e..fe894bcb6a62 100644 --- a/samples/mei/.gitignore +++ b/samples/mei/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -mei-amt-version +/mei-amt-version diff --git a/samples/nitro_enclaves/.gitignore b/samples/nitro_enclaves/.gitignore index 827934129c90..6a718eec71f4 100644 --- a/samples/nitro_enclaves/.gitignore +++ b/samples/nitro_enclaves/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0 -ne_ioctl_sample +/ne_ioctl_sample diff --git a/samples/pidfd/.gitignore b/samples/pidfd/.gitignore index eea857fca736..d4cfa3176b1b 100644 --- a/samples/pidfd/.gitignore +++ b/samples/pidfd/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -pidfd-metadata +/pidfd-metadata diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore index 4a5a5b7db30b..a6df0da77c5d 100644 --- a/samples/seccomp/.gitignore +++ b/samples/seccomp/.gitignore @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -bpf-direct -bpf-fancy -dropper -user-trap +/bpf-direct +/bpf-fancy +/dropper +/user-trap diff --git a/samples/timers/.gitignore b/samples/timers/.gitignore index 40510c33cf08..cd9ff7b95383 100644 --- a/samples/timers/.gitignore +++ b/samples/timers/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -hpet_example +/hpet_example diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore index 8fdabf7e5373..79212d91285b 100644 --- a/samples/vfs/.gitignore +++ b/samples/vfs/.gitignore @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -test-fsmount -test-statx +/test-fsmount +/test-statx diff --git a/samples/watch_queue/.gitignore b/samples/watch_queue/.gitignore index 2aa3c7e56a1a..823b351d3db9 100644 --- a/samples/watch_queue/.gitignore +++ b/samples/watch_queue/.gitignore @@ -1 +1,2 @@ -watch_test +# SPDX-License-Identifier: GPL-2.0-only +/watch_test diff --git a/samples/watchdog/.gitignore b/samples/watchdog/.gitignore index 74153b831244..a70a0150ed9f 100644 --- a/samples/watchdog/.gitignore +++ b/samples/watchdog/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -watchdog-simple +/watchdog-simple diff --git a/scripts/.gitignore b/scripts/.gitignore index a6c11316c969..e83c620ef52c 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -1,11 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only -bin2c -kallsyms -unifdef -recordmcount -sorttable -asn1_compiler -extract-cert -sign-file -insert-sys-cert +/asn1_compiler +/bin2c +/extract-cert +/insert-sys-cert +/kallsyms /module.lds +/recordmcount +/sign-file +/sorttable +/unifdef diff --git a/scripts/basic/.gitignore b/scripts/basic/.gitignore index 98ae1f509592..961c91c8a884 100644 --- a/scripts/basic/.gitignore +++ b/scripts/basic/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -fixdep +/fixdep diff --git a/scripts/dtc/.gitignore b/scripts/dtc/.gitignore index 8a8b62bf3d3c..e0b5c1d2464a 100644 --- a/scripts/dtc/.gitignore +++ b/scripts/dtc/.gitignore @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only -dtc -fdtoverlay +/dtc +/fdtoverlay diff --git a/scripts/gcc-plugins/.gitignore b/scripts/gcc-plugins/.gitignore index b04e0f0f033e..5cc385b9eb97 100644 --- a/scripts/gcc-plugins/.gitignore +++ b/scripts/gcc-plugins/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -randomize_layout_seed.h +/randomize_layout_seed.h diff --git a/scripts/genksyms/.gitignore b/scripts/genksyms/.gitignore index 999af710f83d..0b275abf9405 100644 --- a/scripts/genksyms/.gitignore +++ b/scripts/genksyms/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -genksyms +/genksyms diff --git a/scripts/mod/.gitignore b/scripts/mod/.gitignore index 07e4a39f90a6..0465ec33c9bf 100644 --- a/scripts/mod/.gitignore +++ b/scripts/mod/.gitignore @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -elfconfig.h -mk_elfconfig -modpost -devicetable-offsets.h +/devicetable-offsets.h +/elfconfig.h +/mk_elfconfig +/modpost diff --git a/usr/.gitignore b/usr/.gitignore index 935442ed1eb2..8996e7a88902 100644 --- a/usr/.gitignore +++ b/usr/.gitignore @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -gen_init_cpio -initramfs_data.cpio +/gen_init_cpio +/initramfs_data.cpio /initramfs_inc_data From 533b4f3a789d49574e7ae0f6ececed153f651f97 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Apr 2021 14:25:22 +0530 Subject: [PATCH 0858/1379] RISC-V: Fix error code returned by riscv_hartid_to_cpuid() We should return a negative error code upon failure in riscv_hartid_to_cpuid() instead of NR_CPUS. This is also aligned with all uses of riscv_hartid_to_cpuid() which expect negative error code upon failure. Fixes: 6825c7a80f18 ("RISC-V: Add logical CPU indexing for RISC-V") Fixes: f99fb607fb2b ("RISC-V: Use Linux logical CPU number instead of hartid") Signed-off-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/smp.c b/arch/riscv/kernel/smp.c index 366cb87c0e2e..921d9d7df400 100644 --- a/arch/riscv/kernel/smp.c +++ b/arch/riscv/kernel/smp.c @@ -56,7 +56,7 @@ int riscv_hartid_to_cpuid(int hartid) return i; pr_err("Couldn't find cpu id for hartid [%d]\n", hartid); - return i; + return -ENOENT; } void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out) From 883fcb8ecaaffbc46d5ed20f336da61e422021aa Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 28 Apr 2021 06:02:17 -0400 Subject: [PATCH 0859/1379] riscv: Fix 32b kernel build with CONFIG_DEBUG_VIRTUAL=y Declare kernel_virt_addr for 32b kernel since it is used in __phys_addr_symbol defined when CONFIG_DEBUG_VIRTUAL is set. Fixes: 2bfc6cd81bd17 ("riscv: Move kernel mapping outside of linear mapping") Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/page.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index e280ba60cb34..6a7761c86ec2 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -106,9 +106,9 @@ extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ -#ifdef CONFIG_64BIT extern unsigned long kernel_virt_addr; +#ifdef CONFIG_64BIT #define linear_mapping_pa_to_va(x) ((void *)((unsigned long)(x) + va_pa_offset)) #ifdef CONFIG_XIP_KERNEL #define kernel_mapping_pa_to_va(y) ({ \ From 28252e08649f3aa06cb6b5420e29df7a9d5fe67d Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Sun, 18 Apr 2021 07:28:56 -0400 Subject: [PATCH 0860/1379] riscv: Remove 32b kernel mapping from page table dump The 32b kernel mapping lies in the linear mapping, there is no point in printing its address in page table dump, so remove this leftover that comes from moving the kernel mapping outside the linear mapping for 64b kernel. Fixes: e9efb21fe352 ("riscv: Prepare ptdump for vm layout dynamic addresses") Signed-off-by: Alexandre Ghiti Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/ptdump.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 92179598d8f8..0536ac84b730 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -76,8 +76,8 @@ enum address_markers_idx { PAGE_OFFSET_NR, #ifdef CONFIG_64BIT MODULES_MAPPING_NR, -#endif KERNEL_MAPPING_NR, +#endif END_OF_SPACE_NR }; @@ -99,8 +99,8 @@ static struct addr_marker address_markers[] = { {0, "Linear mapping"}, #ifdef CONFIG_64BIT {0, "Modules mapping"}, -#endif {0, "Kernel mapping (kernel, BPF)"}, +#endif {-1, NULL}, }; @@ -379,8 +379,8 @@ static int __init ptdump_init(void) address_markers[PAGE_OFFSET_NR].start_address = PAGE_OFFSET; #ifdef CONFIG_64BIT address_markers[MODULES_MAPPING_NR].start_address = MODULES_VADDR; -#endif address_markers[KERNEL_MAPPING_NR].start_address = kernel_virt_addr; +#endif kernel_ptd_info.base_addr = KERN_VIRT_START; From f54c7b5898d31eda3d6608da13b55c0466ba49fe Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Wed, 28 Apr 2021 14:45:12 -0700 Subject: [PATCH 0861/1379] RISC-V: Always define XIP_FIXUP XIP depends on MMU, but XIP_FIXUP is used throughout the kernel in order to avoid excessive ifdefs. This just makes sure to always define XIP_FIXUP, which will fix MMU=n builds. XIP_OFFSET is used by assembly but XIP_FIXUP is C-only, so they're split. Fixes: 44c922572952 ("RISC-V: enable XIP") Reported-by: Guenter Roeck Signed-off-by: Palmer Dabbelt Tested-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2f1384e14e31..9469f464e71a 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -73,18 +73,10 @@ #endif #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#endif + #ifdef CONFIG_XIP_KERNEL #define XIP_OFFSET SZ_8M -#define XIP_FIXUP(addr) ({ \ - uintptr_t __a = (uintptr_t)(addr); \ - (__a >= CONFIG_XIP_PHYS_ADDR && __a < CONFIG_XIP_PHYS_ADDR + SZ_16M) ? \ - __a - CONFIG_XIP_PHYS_ADDR + CONFIG_PHYS_RAM_BASE - XIP_OFFSET :\ - __a; \ - }) -#else -#define XIP_FIXUP(addr) (addr) -#endif /* CONFIG_XIP_KERNEL */ - #endif #ifndef __ASSEMBLY__ @@ -101,6 +93,17 @@ #include #endif /* CONFIG_64BIT */ +#ifdef CONFIG_XIP_KERNEL +#define XIP_FIXUP(addr) ({ \ + uintptr_t __a = (uintptr_t)(addr); \ + (__a >= CONFIG_XIP_PHYS_ADDR && __a < CONFIG_XIP_PHYS_ADDR + SZ_16M) ? \ + __a - CONFIG_XIP_PHYS_ADDR + CONFIG_PHYS_RAM_BASE - XIP_OFFSET :\ + __a; \ + }) +#else +#define XIP_FIXUP(addr) (addr) +#endif /* CONFIG_XIP_KERNEL */ + #ifdef CONFIG_MMU /* Number of entries in the page global directory */ #define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) From f8f7e0fb22b2e75be55f2f0c13e229e75b0eac07 Mon Sep 17 00:00:00 2001 From: Baptiste Lepers Date: Sat, 1 May 2021 14:10:51 +1000 Subject: [PATCH 0862/1379] sunrpc: Fix misplaced barrier in call_decode Fix a misplaced barrier in call_decode. The struct rpc_rqst is modified as follows by xprt_complete_rqst: req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ smp_wmb(); req->rq_reply_bytes_recvd = copied; And currently read as follows by call_decode: smp_rmb(); // misplaced if (!req->rq_reply_bytes_recvd) goto out; req->rq_rcv_buf.len = req->rq_private_buf.len; This patch places the smp_rmb after the if to ensure that rq_reply_bytes_recvd and rq_private_buf.len are read in order. Fixes: 9ba828861c56a ("SUNRPC: Don't try to parse incomplete RPC messages") Signed-off-by: Baptiste Lepers Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index c2a01125be1a..f555d335e910 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2456,12 +2456,6 @@ call_decode(struct rpc_task *task) task->tk_flags &= ~RPC_CALL_MAJORSEEN; } - /* - * Ensure that we see all writes made by xprt_complete_rqst() - * before it changed req->rq_reply_bytes_recvd. - */ - smp_rmb(); - /* * Did we ever call xprt_complete_rqst()? If not, we should assume * the message is incomplete. @@ -2470,6 +2464,11 @@ call_decode(struct rpc_task *task) if (!req->rq_reply_bytes_recvd) goto out; + /* Ensure that we see all writes made by xprt_complete_rqst() + * before it changed req->rq_reply_bytes_recvd. + */ + smp_rmb(); + req->rq_rcv_buf.len = req->rq_private_buf.len; trace_rpc_xdr_recvfrom(task, &req->rq_rcv_buf); From 9e895cd9649abe4392c59d14e31b0f5667d082d2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 1 May 2021 15:38:02 -0400 Subject: [PATCH 0863/1379] xprtrdma: Fix a NULL dereference in frwr_unmap_sync() The normal mechanism that invalidates and unmaps MRs is frwr_unmap_async(). frwr_unmap_sync() is used only when an RPC Reply bearing Write or Reply chunks has been lost (ie, almost never). Coverity found that after commit 9a301cafc861 ("xprtrdma: Move fr_linv_done field to struct rpcrdma_mr"), the while() loop in frwr_unmap_sync() exits only once @mr is NULL, unconditionally causing subsequent dereferences of @mr to Oops. I've tested this fix by creating a client that skips invoking frwr_unmap_async() when RPC Replies complete. That forces all invalidation tasks to fall upon frwr_unmap_sync(). Simple workloads with this fix applied to the adulterated client work as designed. Reported-by: coverity-bot Addresses-Coverity-ID: 1504556 ("Null pointer dereferences") Fixes: 9a301cafc861 ("xprtrdma: Move fr_linv_done field to struct rpcrdma_mr") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/frwr_ops.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 285d73246fc2..229fcc9a9064 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -530,6 +530,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) *prev = last; prev = &last->next; } + mr = container_of(last, struct rpcrdma_mr, mr_invwr); /* Strong send queue ordering guarantees that when the * last WR in the chain completes, all WRs in the chain From 562d1e207d322e6346e8db91bbd11d94f16427d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Mar 2021 07:13:11 +0100 Subject: [PATCH 0864/1379] powerpc/powernv: remove the nvlink support This code was only used by the vfio-nvlink2 code, which itself had no proper use. Drop this huge chunk of code build into every powernv or generic build. Signed-off-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210326061311.1497642-3-hch@lst.de --- arch/powerpc/include/asm/opal.h | 3 - arch/powerpc/include/asm/pci-bridge.h | 1 - arch/powerpc/include/asm/pci.h | 7 - arch/powerpc/platforms/powernv/Makefile | 2 +- arch/powerpc/platforms/powernv/npu-dma.c | 705 --------------------- arch/powerpc/platforms/powernv/opal-call.c | 2 - arch/powerpc/platforms/powernv/pci-ioda.c | 185 +----- arch/powerpc/platforms/powernv/pci.c | 11 - arch/powerpc/platforms/powernv/pci.h | 17 +- arch/powerpc/platforms/pseries/pci.c | 23 - 10 files changed, 8 insertions(+), 948 deletions(-) delete mode 100644 arch/powerpc/platforms/powernv/npu-dma.c diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index c76157237e22..6ea9001de9a9 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -28,9 +28,6 @@ extern struct device_node *opal_node; /* API functions */ int64_t opal_invalid_call(void); -int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf); -int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr, - uint64_t bdf); int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid, uint64_t lpcr); int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t bdfn, diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index d2a2a14e56f9..74424c14515c 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -126,7 +126,6 @@ struct pci_controller { #endif /* CONFIG_PPC64 */ void *private_data; - struct npu *npu; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 6436f0b41539..d1f53260725c 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -119,11 +119,4 @@ extern void pcibios_scan_phb(struct pci_controller *hose); #endif /* __KERNEL__ */ -extern struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev); -extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index); -extern int pnv_npu2_init(struct pci_controller *hose); -extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, - unsigned long msr); -extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev); - #endif /* __ASM_POWERPC_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 2eb6ae150d1f..be2546b96816 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_FA_DUMP) += opal-fadump.o obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o obj-$(CONFIG_OPAL_CORE) += opal-core.o -obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o +obj-$(CONFIG_PCI) += pci.o pci-ioda.o pci-ioda-tce.o obj-$(CONFIG_PCI_IOV) += pci-sriov.o obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c deleted file mode 100644 index b711dc3262a3..000000000000 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ /dev/null @@ -1,705 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * This file implements the DMA operations for NVLink devices. The NPU - * devices all point to the same iommu table as the parent PCI device. - * - * Copyright Alistair Popple, IBM Corporation 2015. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "pci.h" - -static struct pci_dev *get_pci_dev(struct device_node *dn) -{ - struct pci_dn *pdn = PCI_DN(dn); - struct pci_dev *pdev; - - pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus), - pdn->busno, pdn->devfn); - - /* - * pci_get_domain_bus_and_slot() increased the reference count of - * the PCI device, but callers don't need that actually as the PE - * already holds a reference to the device. Since callers aren't - * aware of the reference count change, call pci_dev_put() now to - * avoid leaks. - */ - if (pdev) - pci_dev_put(pdev); - - return pdev; -} - -/* Given a NPU device get the associated PCI device. */ -struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev) -{ - struct device_node *dn; - struct pci_dev *gpdev; - - if (WARN_ON(!npdev)) - return NULL; - - if (WARN_ON(!npdev->dev.of_node)) - return NULL; - - /* Get assoicated PCI device */ - dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0); - if (!dn) - return NULL; - - gpdev = get_pci_dev(dn); - of_node_put(dn); - - return gpdev; -} -EXPORT_SYMBOL(pnv_pci_get_gpu_dev); - -/* Given the real PCI device get a linked NPU device. */ -struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) -{ - struct device_node *dn; - struct pci_dev *npdev; - - if (WARN_ON(!gpdev)) - return NULL; - - /* Not all PCI devices have device-tree nodes */ - if (!gpdev->dev.of_node) - return NULL; - - /* Get assoicated PCI device */ - dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index); - if (!dn) - return NULL; - - npdev = get_pci_dev(dn); - of_node_put(dn); - - return npdev; -} -EXPORT_SYMBOL(pnv_pci_get_npu_dev); - -#ifdef CONFIG_IOMMU_API -/* - * Returns the PE assoicated with the PCI device of the given - * NPU. Returns the linked pci device if pci_dev != NULL. - */ -static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, - struct pci_dev **gpdev) -{ - struct pnv_phb *phb; - struct pci_controller *hose; - struct pci_dev *pdev; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - - pdev = pnv_pci_get_gpu_dev(npe->pdev); - if (!pdev) - return NULL; - - pdn = pci_get_pdn(pdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return NULL; - - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - pe = &phb->ioda.pe_array[pdn->pe_number]; - - if (gpdev) - *gpdev = pdev; - - return pe; -} - -static long pnv_npu_unset_window(struct iommu_table_group *table_group, - int num); - -static long pnv_npu_set_window(struct iommu_table_group *table_group, int num, - struct iommu_table *tbl) -{ - struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, - table_group); - struct pnv_phb *phb = npe->phb; - int64_t rc; - const unsigned long size = tbl->it_indirect_levels ? - tbl->it_level_size : tbl->it_size; - const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; - const __u64 win_size = tbl->it_size << tbl->it_page_shift; - int num2 = (num == 0) ? 1 : 0; - - /* NPU has just one TVE so if there is another table, remove it first */ - if (npe->table_group.tables[num2]) - pnv_npu_unset_window(&npe->table_group, num2); - - pe_info(npe, "Setting up window %llx..%llx pg=%lx\n", - start_addr, start_addr + win_size - 1, - IOMMU_PAGE_SIZE(tbl)); - - rc = opal_pci_map_pe_dma_window(phb->opal_id, - npe->pe_number, - npe->pe_number, - tbl->it_indirect_levels + 1, - __pa(tbl->it_base), - size << 3, - IOMMU_PAGE_SIZE(tbl)); - if (rc) { - pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); - return rc; - } - pnv_pci_ioda2_tce_invalidate_entire(phb, false); - - /* Add the table to the list so its TCE cache will get invalidated */ - pnv_pci_link_table_and_group(phb->hose->node, num, - tbl, &npe->table_group); - - return 0; -} - -static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num) -{ - struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, - table_group); - struct pnv_phb *phb = npe->phb; - int64_t rc; - - if (!npe->table_group.tables[num]) - return 0; - - pe_info(npe, "Removing DMA window\n"); - - rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, - npe->pe_number, - 0/* levels */, 0/* table address */, - 0/* table size */, 0/* page size */); - if (rc) { - pe_err(npe, "Unmapping failed, ret = %lld\n", rc); - return rc; - } - pnv_pci_ioda2_tce_invalidate_entire(phb, false); - - pnv_pci_unlink_table_and_group(npe->table_group.tables[num], - &npe->table_group); - - return 0; -} - -/* Switch ownership from platform code to external user (e.g. VFIO) */ -static void pnv_npu_take_ownership(struct iommu_table_group *table_group) -{ - struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, - table_group); - struct pnv_phb *phb = npe->phb; - int64_t rc; - struct pci_dev *gpdev = NULL; - - /* - * Note: NPU has just a single TVE in the hardware which means that - * while used by the kernel, it can have either 32bit window or - * DMA bypass but never both. So we deconfigure 32bit window only - * if it was enabled at the moment of ownership change. - */ - if (npe->table_group.tables[0]) { - pnv_npu_unset_window(&npe->table_group, 0); - return; - } - - /* Disable bypass */ - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - 0 /* bypass base */, 0); - if (rc) { - pe_err(npe, "Failed to disable bypass, err %lld\n", rc); - return; - } - pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); - - get_gpu_pci_dev_and_pe(npe, &gpdev); - if (gpdev) - pnv_npu2_unmap_lpar_dev(gpdev); -} - -static void pnv_npu_release_ownership(struct iommu_table_group *table_group) -{ - struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, - table_group); - struct pci_dev *gpdev = NULL; - - get_gpu_pci_dev_and_pe(npe, &gpdev); - if (gpdev) - pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV); -} - -static struct iommu_table_group_ops pnv_pci_npu_ops = { - .set_window = pnv_npu_set_window, - .unset_window = pnv_npu_unset_window, - .take_ownership = pnv_npu_take_ownership, - .release_ownership = pnv_npu_release_ownership, -}; -#endif /* !CONFIG_IOMMU_API */ - -/* - * NPU2 ATS - */ -/* Maximum possible number of ATSD MMIO registers per NPU */ -#define NV_NMMU_ATSD_REGS 8 -#define NV_NPU_MAX_PE_NUM 16 - -/* - * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or - * up to 3 x (GPU + 2xNPUs) (POWER9). - */ -struct npu_comp { - struct iommu_table_group table_group; - int pe_num; - struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM]; -}; - -/* An NPU descriptor, valid for POWER9 only */ -struct npu { - int index; - struct npu_comp npucomp; -}; - -#ifdef CONFIG_IOMMU_API -static long pnv_npu_peers_create_table_userspace( - struct iommu_table_group *table_group, - int num, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table **ptbl) -{ - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - if (!npucomp->pe_num || !npucomp->pe[0] || - !npucomp->pe[0]->table_group.ops || - !npucomp->pe[0]->table_group.ops->create_table) - return -EFAULT; - - return npucomp->pe[0]->table_group.ops->create_table( - &npucomp->pe[0]->table_group, num, page_shift, - window_size, levels, ptbl); -} - -static long pnv_npu_peers_set_window(struct iommu_table_group *table_group, - int num, struct iommu_table *tbl) -{ - int i, j; - long ret = 0; - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - for (i = 0; i < npucomp->pe_num; ++i) { - struct pnv_ioda_pe *pe = npucomp->pe[i]; - - if (!pe->table_group.ops->set_window) - continue; - - ret = pe->table_group.ops->set_window(&pe->table_group, - num, tbl); - if (ret) - break; - } - - if (ret) { - for (j = 0; j < i; ++j) { - struct pnv_ioda_pe *pe = npucomp->pe[j]; - - if (!pe->table_group.ops->unset_window) - continue; - - ret = pe->table_group.ops->unset_window( - &pe->table_group, num); - if (ret) - break; - } - } else { - table_group->tables[num] = iommu_tce_table_get(tbl); - } - - return ret; -} - -static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group, - int num) -{ - int i, j; - long ret = 0; - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - for (i = 0; i < npucomp->pe_num; ++i) { - struct pnv_ioda_pe *pe = npucomp->pe[i]; - - WARN_ON(npucomp->table_group.tables[num] != - table_group->tables[num]); - if (!npucomp->table_group.tables[num]) - continue; - - if (!pe->table_group.ops->unset_window) - continue; - - ret = pe->table_group.ops->unset_window(&pe->table_group, num); - if (ret) - break; - } - - if (ret) { - for (j = 0; j < i; ++j) { - struct pnv_ioda_pe *pe = npucomp->pe[j]; - - if (!npucomp->table_group.tables[num]) - continue; - - if (!pe->table_group.ops->set_window) - continue; - - ret = pe->table_group.ops->set_window(&pe->table_group, - num, table_group->tables[num]); - if (ret) - break; - } - } else if (table_group->tables[num]) { - iommu_tce_table_put(table_group->tables[num]); - table_group->tables[num] = NULL; - } - - return ret; -} - -static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group) -{ - int i; - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - for (i = 0; i < npucomp->pe_num; ++i) { - struct pnv_ioda_pe *pe = npucomp->pe[i]; - - if (!pe->table_group.ops || - !pe->table_group.ops->take_ownership) - continue; - pe->table_group.ops->take_ownership(&pe->table_group); - } -} - -static void pnv_npu_peers_release_ownership( - struct iommu_table_group *table_group) -{ - int i; - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - for (i = 0; i < npucomp->pe_num; ++i) { - struct pnv_ioda_pe *pe = npucomp->pe[i]; - - if (!pe->table_group.ops || - !pe->table_group.ops->release_ownership) - continue; - pe->table_group.ops->release_ownership(&pe->table_group); - } -} - -static struct iommu_table_group_ops pnv_npu_peers_ops = { - .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_npu_peers_create_table_userspace, - .set_window = pnv_npu_peers_set_window, - .unset_window = pnv_npu_peers_unset_window, - .take_ownership = pnv_npu_peers_take_ownership, - .release_ownership = pnv_npu_peers_release_ownership, -}; - -static void pnv_comp_attach_table_group(struct npu_comp *npucomp, - struct pnv_ioda_pe *pe) -{ - if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM)) - return; - - npucomp->pe[npucomp->pe_num] = pe; - ++npucomp->pe_num; -} - -static struct iommu_table_group * - pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) -{ - struct iommu_table_group *compound_group; - struct npu_comp *npucomp; - struct pci_dev *gpdev = NULL; - struct pci_controller *hose; - struct pci_dev *npdev = NULL; - - list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) { - npdev = pnv_pci_get_npu_dev(gpdev, 0); - if (npdev) - break; - } - - if (!npdev) - /* It is not an NPU attached device, skip */ - return NULL; - - hose = pci_bus_to_host(npdev->bus); - - if (hose->npu) { - /* P9 case: compound group is per-NPU (all gpus, all links) */ - npucomp = &hose->npu->npucomp; - } else { - /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */ - npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL); - } - - compound_group = &npucomp->table_group; - if (!compound_group->group) { - compound_group->ops = &pnv_npu_peers_ops; - iommu_register_group(compound_group, hose->global_number, - pe->pe_number); - - /* Steal capabilities from a GPU PE */ - compound_group->max_dynamic_windows_supported = - pe->table_group.max_dynamic_windows_supported; - compound_group->tce32_start = pe->table_group.tce32_start; - compound_group->tce32_size = pe->table_group.tce32_size; - compound_group->max_levels = pe->table_group.max_levels; - if (!compound_group->pgsizes) - compound_group->pgsizes = pe->table_group.pgsizes; - } - - /* - * The gpu would have been added to the iommu group that's created - * for the PE. Pull it out now. - */ - iommu_del_device(&gpdev->dev); - - /* - * I'm not sure this is strictly required, but it's probably a good idea - * since the table_group for the PE is going to be attached to the - * compound table group. If we leave the PE's iommu group active then - * we might have the same table_group being modifiable via two sepeate - * iommu groups. - */ - iommu_group_put(pe->table_group.group); - - /* now put the GPU into the compound group */ - pnv_comp_attach_table_group(npucomp, pe); - iommu_add_device(compound_group, &gpdev->dev); - - return compound_group; -} - -static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) -{ - struct iommu_table_group *table_group; - struct npu_comp *npucomp; - struct pci_dev *gpdev = NULL; - struct pci_dev *npdev; - struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev); - - WARN_ON(!(pe->flags & PNV_IODA_PE_DEV)); - if (!gpe) - return NULL; - - /* - * IODA2 bridges get this set up from pci_controller_ops::setup_bridge - * but NPU bridges do not have this hook defined so we do it here. - * We do not setup other table group parameters as they won't be used - * anyway - NVLink bridges are subordinate PEs. - */ - pe->table_group.ops = &pnv_pci_npu_ops; - - table_group = iommu_group_get_iommudata( - iommu_group_get(&gpdev->dev)); - - /* - * On P9 NPU PHB and PCI PHB support different page sizes, - * keep only matching. We expect here that NVLink bridge PE pgsizes is - * initialized by the caller. - */ - table_group->pgsizes &= pe->table_group.pgsizes; - npucomp = container_of(table_group, struct npu_comp, table_group); - pnv_comp_attach_table_group(npucomp, pe); - - list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) { - struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev); - - if (gpdevtmp != gpdev) - continue; - - iommu_add_device(table_group, &npdev->dev); - } - - return table_group; -} - -void pnv_pci_npu_setup_iommu_groups(void) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - - /* - * For non-nvlink devices the IOMMU group is registered when the PE is - * configured and devices are added to the group when the per-device - * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is - * only initialise for "normal" IODA PHBs. - * - * For NVLink devices we need to ensure the NVLinks and the GPU end up - * in the same IOMMU group, so that's handled here. - */ - list_for_each_entry(hose, &hose_list, list_node) { - phb = hose->private_data; - - if (phb->type == PNV_PHB_IODA2) - list_for_each_entry(pe, &phb->ioda.pe_list, list) - pnv_try_setup_npu_table_group(pe); - } - - /* - * Now we have all PHBs discovered, time to add NPU devices to - * the corresponding IOMMU groups. - */ - list_for_each_entry(hose, &hose_list, list_node) { - unsigned long pgsizes; - - phb = hose->private_data; - - if (phb->type != PNV_PHB_NPU_NVLINK) - continue; - - pgsizes = pnv_ioda_parse_tce_sizes(phb); - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - /* - * IODA2 bridges get this set up from - * pci_controller_ops::setup_bridge but NPU bridges - * do not have this hook defined so we do it here. - */ - pe->table_group.pgsizes = pgsizes; - pnv_npu_compound_attach(pe); - } - } -} -#endif /* CONFIG_IOMMU_API */ - -int pnv_npu2_init(struct pci_controller *hose) -{ - static int npu_index; - struct npu *npu; - int ret; - - npu = kzalloc(sizeof(*npu), GFP_KERNEL); - if (!npu) - return -ENOMEM; - - npu_index++; - if (WARN_ON(npu_index >= NV_MAX_NPUS)) { - ret = -ENOSPC; - goto fail_exit; - } - npu->index = npu_index; - hose->npu = npu; - - return 0; - -fail_exit: - kfree(npu); - return ret; -} - -int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, - unsigned long msr) -{ - int ret; - struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); - struct pci_controller *hose; - struct pnv_phb *nphb; - - if (!npdev) - return -ENODEV; - - hose = pci_bus_to_host(npdev->bus); - if (hose->npu == NULL) { - dev_info_once(&npdev->dev, "Nvlink1 does not support contexts"); - return 0; - } - - nphb = hose->private_data; - - dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n", - nphb->opal_id, lparid); - /* - * Currently we only support radix and non-zero LPCR only makes sense - * for hash tables so skiboot expects the LPCR parameter to be a zero. - */ - ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), lparid, - 0 /* LPCR bits */); - if (ret) { - dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret); - return ret; - } - - dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n", - nphb->opal_id, msr); - ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr, - pci_dev_id(gpdev)); - if (ret < 0) - dev_err(&gpdev->dev, "Failed to init context: %d\n", ret); - else - ret = 0; - - return 0; -} -EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev); - -void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr) -{ - struct pci_dev *gpdev; - - list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list) - pnv_npu2_map_lpar_dev(gpdev, 0, msr); -} - -int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev) -{ - int ret; - struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); - struct pci_controller *hose; - struct pnv_phb *nphb; - - if (!npdev) - return -ENODEV; - - hose = pci_bus_to_host(npdev->bus); - if (hose->npu == NULL) { - dev_info_once(&npdev->dev, "Nvlink1 does not support contexts"); - return 0; - } - - nphb = hose->private_data; - - dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n", - nphb->opal_id); - ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/, - pci_dev_id(gpdev)); - if (ret < 0) { - dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret); - return ret; - } - - /* Set LPID to 0 anyway, just to be safe */ - dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id); - ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), 0 /*LPID*/, - 0 /* LPCR bits */); - if (ret) - dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret); - - return ret; -} -EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev); diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 5cd0f52d258f..01401e3da7ca 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -267,8 +267,6 @@ OPAL_CALL(opal_xive_get_queue_state, OPAL_XIVE_GET_QUEUE_STATE); OPAL_CALL(opal_xive_set_queue_state, OPAL_XIVE_SET_QUEUE_STATE); OPAL_CALL(opal_xive_get_vp_state, OPAL_XIVE_GET_VP_STATE); OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); -OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); -OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 66c3c3337334..7de464679292 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -47,8 +47,7 @@ #define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ #define PNV_IODA1_DMA32_SEGSIZE 0x10000000 -static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK", - "NPU_OCAPI" }; +static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_OCAPI" }; static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); static void pnv_pci_configure_bus(struct pci_bus *bus); @@ -192,8 +191,6 @@ void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) unsigned int pe_num = pe->pe_number; WARN_ON(pe->pdev); - WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */ - kfree(pe->npucomp); memset(pe, 0, sizeof(struct pnv_ioda_pe)); mutex_lock(&phb->ioda.pe_alloc_mutex); @@ -875,7 +872,7 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) * Release from all parents PELT-V. NPUs don't have a PELTV * table */ - if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI) + if (phb->type != PNV_PHB_NPU_OCAPI) pnv_ioda_unset_peltv(phb, pe, parent); rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, @@ -946,7 +943,7 @@ int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) * Configure PELTV. NPUs don't have a PELTV table so skip * configuration on them. */ - if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI) + if (phb->type != PNV_PHB_NPU_OCAPI) pnv_ioda_set_peltv(phb, pe, true); /* Setup reverse map */ @@ -1002,8 +999,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) /* NOTE: We don't get a reference for the pointer in the PE * data structure, both the device and PE structures should be - * destroyed at the same time. However, removing nvlink - * devices will need some work. + * destroyed at the same time. * * At some point we want to remove the PDN completely anyways */ @@ -1099,113 +1095,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) return pe; } -static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) -{ - int pe_num, found_pe = false, rc; - long rid; - struct pnv_ioda_pe *pe; - struct pci_dev *gpu_pdev; - struct pci_dn *npu_pdn; - struct pnv_phb *phb = pci_bus_to_pnvhb(npu_pdev->bus); - - /* - * Intentionally leak a reference on the npu device (for - * nvlink only; this is not an opencapi path) to make sure it - * never goes away, as it's been the case all along and some - * work is needed otherwise. - */ - pci_dev_get(npu_pdev); - - /* - * Due to a hardware errata PE#0 on the NPU is reserved for - * error handling. This means we only have three PEs remaining - * which need to be assigned to four links, implying some - * links must share PEs. - * - * To achieve this we assign PEs such that NPUs linking the - * same GPU get assigned the same PE. - */ - gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev); - for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { - pe = &phb->ioda.pe_array[pe_num]; - if (!pe->pdev) - continue; - - if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) { - /* - * This device has the same peer GPU so should - * be assigned the same PE as the existing - * peer NPU. - */ - dev_info(&npu_pdev->dev, - "Associating to existing PE %x\n", pe_num); - npu_pdn = pci_get_pdn(npu_pdev); - rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; - npu_pdn->pe_number = pe_num; - phb->ioda.pe_rmap[rid] = pe->pe_number; - pe->device_count++; - - /* Map the PE to this link */ - rc = opal_pci_set_pe(phb->opal_id, pe_num, rid, - OpalPciBusAll, - OPAL_COMPARE_RID_DEVICE_NUMBER, - OPAL_COMPARE_RID_FUNCTION_NUMBER, - OPAL_MAP_PE); - WARN_ON(rc != OPAL_SUCCESS); - found_pe = true; - break; - } - } - - if (!found_pe) - /* - * Could not find an existing PE so allocate a new - * one. - */ - return pnv_ioda_setup_dev_PE(npu_pdev); - else - return pe; -} - -static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) -{ - struct pci_dev *pdev; - - list_for_each_entry(pdev, &bus->devices, bus_list) - pnv_ioda_setup_npu_PE(pdev); -} - -static void pnv_pci_ioda_setup_nvlink(void) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - - list_for_each_entry(hose, &hose_list, list_node) { - phb = hose->private_data; - if (phb->type == PNV_PHB_NPU_NVLINK) { - /* PE#0 is needed for error reporting */ - pnv_ioda_reserve_pe(phb, 0); - pnv_ioda_setup_npu_PEs(hose->bus); - if (phb->model == PNV_PHB_MODEL_NPU2) - WARN_ON_ONCE(pnv_npu2_init(hose)); - } - } - list_for_each_entry(hose, &hose_list, list_node) { - phb = hose->private_data; - if (phb->type != PNV_PHB_IODA2) - continue; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) - pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV); - } - -#ifdef CONFIG_IOMMU_API - /* setup iommu groups so we can do nvlink pass-thru */ - pnv_pci_npu_setup_iommu_groups(); -#endif -} - static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); @@ -1468,18 +1357,6 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { #define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1) #define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2) -static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) -{ - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm); - const unsigned long val = PHB3_TCE_KILL_INVAL_ALL; - - mb(); /* Ensure previous TCE table stores are visible */ - if (rm) - __raw_rm_writeq_be(val, invalidate); - else - __raw_writeq_be(val, invalidate); -} - static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ @@ -1539,20 +1416,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, struct pnv_phb *phb = pe->phb; unsigned int shift = tbl->it_page_shift; - /* - * NVLink1 can use the TCE kill register directly as - * it's the same as PHB3. NVLink2 is different and - * should go via the OPAL call. - */ - if (phb->model == PNV_PHB_MODEL_NPU) { - /* - * The NVLink hardware does not support TCE kill - * per TCE entry so we have to invalidate - * the entire cache for it. - */ - pnv_pci_phb3_tce_invalidate_entire(phb, rm); - continue; - } if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs) pnv_pci_phb3_tce_invalidate(pe, rm, shift, index, npages); @@ -1564,14 +1427,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, } } -void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) -{ - if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3) - pnv_pci_phb3_tce_invalidate_entire(phb, rm); - else - opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0); -} - static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, @@ -2451,7 +2306,6 @@ static void pnv_pci_enable_bridges(void) static void pnv_pci_ioda_fixup(void) { - pnv_pci_ioda_setup_nvlink(); pnv_pci_ioda_create_dbgfs(); pnv_pci_enable_bridges(); @@ -2824,15 +2678,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev) pnv_ioda_release_pe(pe); } -static void pnv_npu_disable_device(struct pci_dev *pdev) -{ - struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); - struct eeh_pe *eehpe = edev ? edev->pe : NULL; - - if (eehpe && eeh_ops && eeh_ops->reset) - eeh_ops->reset(eehpe, EEH_RESET_HOT); -} - static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; @@ -2874,16 +2719,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .shutdown = pnv_pci_ioda_shutdown, }; -static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, - .enable_device_hook = pnv_pci_enable_device_hook, - .window_alignment = pnv_pci_window_alignment, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .shutdown = pnv_pci_ioda_shutdown, - .disable_device = pnv_npu_disable_device, -}; - static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = { .enable_device_hook = pnv_ocapi_enable_device_hook, .release_device = pnv_pci_release_device, @@ -2957,10 +2792,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->model = PNV_PHB_MODEL_P7IOC; else if (of_device_is_compatible(np, "ibm,power8-pciex")) phb->model = PNV_PHB_MODEL_PHB3; - else if (of_device_is_compatible(np, "ibm,power8-npu-pciex")) - phb->model = PNV_PHB_MODEL_NPU; - else if (of_device_is_compatible(np, "ibm,power9-npu-pciex")) - phb->model = PNV_PHB_MODEL_NPU2; else phb->model = PNV_PHB_MODEL_UNKNOWN; @@ -3118,9 +2949,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; switch (phb->type) { - case PNV_PHB_NPU_NVLINK: - hose->controller_ops = pnv_npu_ioda_controller_ops; - break; case PNV_PHB_NPU_OCAPI: hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops; break; @@ -3173,11 +3001,6 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np) pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); } -void __init pnv_pci_init_npu_phb(struct device_node *np) -{ - pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK); -} - void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np) { pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 9b9bca169275..b18468dc31ff 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -926,17 +926,6 @@ void __init pnv_pci_init(void) for_each_compatible_node(np, NULL, "ibm,ioda3-phb") pnv_pci_init_ioda2_phb(np); - /* Look for NPU PHBs */ - for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb") - pnv_pci_init_npu_phb(np); - - /* - * Look for NPU2 PHBs which we treat mostly as NPU PHBs with - * the exception of TCE kill which requires an OPAL call. - */ - for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb") - pnv_pci_init_npu_phb(np); - /* Look for NPU2 OpenCAPI PHBs */ for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-opencapi-phb") pnv_pci_init_npu2_opencapi_phb(np); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 36d22920f5a3..c8d4f222a86f 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -10,10 +10,9 @@ struct pci_dn; enum pnv_phb_type { - PNV_PHB_IODA1 = 0, - PNV_PHB_IODA2 = 1, - PNV_PHB_NPU_NVLINK = 2, - PNV_PHB_NPU_OCAPI = 3, + PNV_PHB_IODA1, + PNV_PHB_IODA2, + PNV_PHB_NPU_OCAPI, }; /* Precise PHB model for error management */ @@ -21,8 +20,6 @@ enum pnv_phb_model { PNV_PHB_MODEL_UNKNOWN, PNV_PHB_MODEL_P7IOC, PNV_PHB_MODEL_PHB3, - PNV_PHB_MODEL_NPU, - PNV_PHB_MODEL_NPU2, }; #define PNV_PCI_DIAG_BUF_SIZE 8192 @@ -81,7 +78,6 @@ struct pnv_ioda_pe { /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ struct iommu_table_group table_group; - struct npu_comp *npucomp; /* 64-bit TCE bypass region */ bool tce_bypass_enabled; @@ -289,9 +285,7 @@ extern struct iommu_table *pnv_pci_table_alloc(int nid); extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); -extern void pnv_pci_init_npu_phb(struct device_node *np); extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np); -extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); @@ -314,11 +308,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, #define pe_info(pe, fmt, ...) \ pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) -/* Nvlink functions */ -extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); -extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); -extern void pnv_pci_npu_setup_iommu_groups(void); - /* pci-ioda-tce.c */ #define POWERNV_IOMMU_DEFAULT_LEVELS 2 #define POWERNV_IOMMU_MAX_LEVELS 5 diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 1bffbd1c9a94..3b6800f774c2 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -224,8 +224,6 @@ static void __init pSeries_request_regions(void) void __init pSeries_final_fixup(void) { - struct pci_controller *hose; - pSeries_request_regions(); eeh_show_enabled(); @@ -234,27 +232,6 @@ void __init pSeries_final_fixup(void) ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable; ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable; #endif - list_for_each_entry(hose, &hose_list, list_node) { - struct device_node *dn = hose->dn, *nvdn; - - while (1) { - dn = of_find_all_nodes(dn); - if (!dn) - break; - nvdn = of_parse_phandle(dn, "ibm,nvlink", 0); - if (!nvdn) - continue; - if (!of_device_is_compatible(nvdn, "ibm,npu-link")) - continue; - if (!of_device_is_compatible(nvdn->parent, - "ibm,power9-npu")) - continue; -#ifdef CONFIG_PPC_POWERNV - WARN_ON_ONCE(pnv_npu2_init(hose)); -#endif - break; - } - } } /* From fc09acb7de31badb2ea9e85d21e071be1a5736e4 Mon Sep 17 00:00:00 2001 From: Douglas Gilbert Date: Wed, 14 Apr 2021 21:50:31 -0400 Subject: [PATCH 0865/1379] scsi: scsi_debug: Fix cmd_per_lun, set to max_queue Make sure that the cmd_per_lun value placed in the host template never exceeds the can_queue value. If the max_queue driver parameter is not specified then both cmd_per_lun and can_queue are set to CAN_QUEUE. CAN_QUEUE is a compile time constant and is used to dimension an array to hold queued requests. If the max_queue driver parameter is given it is must be less than or equal to CAN_QUEUE and if so, the host template values are adjusted. Remove undocumented code that allowed queue_depth to exceed CAN_QUEUE and cause stack full type errors. There is a documented way to do that with every_nth and echo 0x8000 > /sys/bus/pseudo/drivers/scsi_debug/opts See: https://sg.danny.cz/sg/scsi_debug.html Tweak some formatting, and add a suggestion to the "trim poll_queues" warning. Link: https://lore.kernel.org/r/20210415015031.607153-1-dgilbert@interlog.com Reported-by: Kashyap Desai Reviewed-by: John Garry Signed-off-by: Douglas Gilbert Signed-off-by: John Garry Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 70165be10f00..a5d1633b5bd8 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -218,7 +218,7 @@ static const char *sdebug_version_date = "20200710"; */ #define SDEBUG_CANQUEUE_WORDS 3 /* a WORD is bits in a long */ #define SDEBUG_CANQUEUE (SDEBUG_CANQUEUE_WORDS * BITS_PER_LONG) -#define DEF_CMD_PER_LUN 255 +#define DEF_CMD_PER_LUN SDEBUG_CANQUEUE /* UA - Unit Attention; SA - Service Action; SSU - Start Stop Unit */ #define F_D_IN 1 /* Data-in command (e.g. READ) */ @@ -5695,8 +5695,8 @@ MODULE_PARM_DESC(lbpu, "enable LBP, support UNMAP command (def=0)"); MODULE_PARM_DESC(lbpws, "enable LBP, support WRITE SAME(16) with UNMAP bit (def=0)"); MODULE_PARM_DESC(lbpws10, "enable LBP, support WRITE SAME(10) with UNMAP bit (def=0)"); MODULE_PARM_DESC(lowest_aligned, "lowest aligned lba (def=0)"); -MODULE_PARM_DESC(max_luns, "number of LUNs per target to simulate(def=1)"); MODULE_PARM_DESC(lun_format, "LUN format: 0->peripheral (def); 1 --> flat address method"); +MODULE_PARM_DESC(max_luns, "number of LUNs per target to simulate(def=1)"); MODULE_PARM_DESC(max_queue, "max number of queued commands (1 to max(def))"); MODULE_PARM_DESC(medium_error_count, "count of sectors to return follow on MEDIUM error"); MODULE_PARM_DESC(medium_error_start, "starting sector number to return MEDIUM error"); @@ -5710,7 +5710,7 @@ MODULE_PARM_DESC(opt_xferlen_exp, "optimal transfer length granularity exponent MODULE_PARM_DESC(opts, "1->noise, 2->medium_err, 4->timeout, 8->recovered_err... (def=0)"); MODULE_PARM_DESC(per_host_store, "If set, next positive add_host will get new store (def=0)"); MODULE_PARM_DESC(physblk_exp, "physical block exponent (def=0)"); -MODULE_PARM_DESC(poll_queues, "support for iouring iopoll queues (1 to max(submit_queues - 1)"); +MODULE_PARM_DESC(poll_queues, "support for iouring iopoll queues (1 to max(submit_queues - 1))"); MODULE_PARM_DESC(ptype, "SCSI peripheral type(def=0[disk])"); MODULE_PARM_DESC(random, "If set, uniformly randomize command duration between 0 and delay_in_ns"); MODULE_PARM_DESC(removable, "claim to have removable media (def=0)"); @@ -7165,12 +7165,15 @@ static int sdebug_change_qdepth(struct scsi_device *sdev, int qdepth) } num_in_q = atomic_read(&devip->num_in_q); + if (qdepth > SDEBUG_CANQUEUE) { + qdepth = SDEBUG_CANQUEUE; + pr_warn("%s: requested qdepth [%d] exceeds canqueue [%d], trim\n", __func__, + qdepth, SDEBUG_CANQUEUE); + } if (qdepth < 1) qdepth = 1; - /* allow to exceed max host qc_arr elements for testing */ - if (qdepth > SDEBUG_CANQUEUE + 10) - qdepth = SDEBUG_CANQUEUE + 10; - scsi_change_queue_depth(sdev, qdepth); + if (qdepth != sdev->queue_depth) + scsi_change_queue_depth(sdev, qdepth); if (SDEBUG_OPT_Q_NOISE & sdebug_opts) { sdev_printk(KERN_INFO, sdev, "%s: qdepth=%d, num_in_q=%d\n", @@ -7558,6 +7561,7 @@ static int sdebug_driver_probe(struct device *dev) sdbg_host = to_sdebug_host(dev); sdebug_driver_template.can_queue = sdebug_max_queue; + sdebug_driver_template.cmd_per_lun = sdebug_max_queue; if (!sdebug_clustering) sdebug_driver_template.dma_boundary = PAGE_SIZE - 1; @@ -7593,7 +7597,11 @@ static int sdebug_driver_probe(struct device *dev) * If condition not met, trim poll_queues to 1 (just for simplicity). */ if (poll_queues >= submit_queues) { - pr_warn("%s: trim poll_queues to 1\n", my_name); + if (submit_queues < 3) + pr_warn("%s: trim poll_queues to 1\n", my_name); + else + pr_warn("%s: trim poll_queues to 1. Perhaps try poll_queues=%d\n", + my_name, submit_queues - 1); poll_queues = 1; } if (poll_queues) From 3ba1eeff00c42ccb31c0089c8c95c3ade546e9b0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 29 Apr 2021 14:25:15 +0200 Subject: [PATCH 0866/1379] scsi: fnic: Kill 'exclude_id' argument to fnic_cleanup_io() 'exclude_id' is always SCSI_NO_TAG which will never be reached when traversing the list of tags. Link: https://lore.kernel.org/r/20210429122517.39659-2-hare@suse.de Reviewed-by: Ming Lei Signed-off-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fnic_scsi.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c index e619a82f921b..f41d1b1c2e39 100644 --- a/drivers/scsi/fnic/fnic_scsi.c +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -102,7 +102,7 @@ static const char *fnic_fcpio_status_to_str(unsigned int status) return fcpio_status_str[status]; } -static void fnic_cleanup_io(struct fnic *fnic, int exclude_id); +static void fnic_cleanup_io(struct fnic *fnic); static inline spinlock_t *fnic_io_lock_hash(struct fnic *fnic, struct scsi_cmnd *sc) @@ -638,7 +638,7 @@ static int fnic_fcpio_fw_reset_cmpl_handler(struct fnic *fnic, atomic64_inc(&reset_stats->fw_reset_completions); /* Clean up all outstanding io requests */ - fnic_cleanup_io(fnic, SCSI_NO_TAG); + fnic_cleanup_io(fnic); atomic64_set(&fnic->fnic_stats.fw_stats.active_fw_reqs, 0); atomic64_set(&fnic->fnic_stats.io_stats.active_ios, 0); @@ -1361,7 +1361,7 @@ int fnic_wq_copy_cmpl_handler(struct fnic *fnic, int copy_work_to_do) return wq_work_done; } -static void fnic_cleanup_io(struct fnic *fnic, int exclude_id) +static void fnic_cleanup_io(struct fnic *fnic) { int i; struct fnic_io_req *io_req; @@ -1372,9 +1372,6 @@ static void fnic_cleanup_io(struct fnic *fnic, int exclude_id) struct fnic_stats *fnic_stats = &fnic->fnic_stats; for (i = 0; i < fnic->fnic_max_tag_id; i++) { - if (i == exclude_id) - continue; - io_lock = fnic_io_lock_tag(fnic, i); spin_lock_irqsave(io_lock, flags); sc = scsi_host_find_tag(fnic->lport->host, i); From 35ffbb60bdad652d461aa8e97fa094faa9eb46ec Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 29 Apr 2021 14:25:16 +0200 Subject: [PATCH 0867/1379] scsi: fnic: Use scsi_host_busy_iter() to traverse commands Use scsi_host_busy_iter() to traverse commands instead of hand-crafted routines walking the command list. Link: https://lore.kernel.org/r/20210429122517.39659-3-hare@suse.de Reviewed-by: Ming Lei Signed-off-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fnic_scsi.c | 879 ++++++++++++++++------------------ 1 file changed, 404 insertions(+), 475 deletions(-) diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c index f41d1b1c2e39..762cc8bd2653 100644 --- a/drivers/scsi/fnic/fnic_scsi.c +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -1361,90 +1361,90 @@ int fnic_wq_copy_cmpl_handler(struct fnic *fnic, int copy_work_to_do) return wq_work_done; } -static void fnic_cleanup_io(struct fnic *fnic) +static bool fnic_cleanup_io_iter(struct scsi_cmnd *sc, void *data, + bool reserved) { - int i; + struct fnic *fnic = data; struct fnic_io_req *io_req; unsigned long flags = 0; - struct scsi_cmnd *sc; spinlock_t *io_lock; unsigned long start_time = 0; struct fnic_stats *fnic_stats = &fnic->fnic_stats; - for (i = 0; i < fnic->fnic_max_tag_id; i++) { - io_lock = fnic_io_lock_tag(fnic, i); - spin_lock_irqsave(io_lock, flags); - sc = scsi_host_find_tag(fnic->lport->host, i); - if (!sc) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - io_req = (struct fnic_io_req *)CMD_SP(sc); - if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) && - !(CMD_FLAGS(sc) & FNIC_DEV_RST_DONE)) { - /* - * We will be here only when FW completes reset - * without sending completions for outstanding ios. - */ - CMD_FLAGS(sc) |= FNIC_DEV_RST_DONE; - if (io_req && io_req->dr_done) - complete(io_req->dr_done); - else if (io_req && io_req->abts_done) - complete(io_req->abts_done); - spin_unlock_irqrestore(io_lock, flags); - continue; - } else if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - if (!io_req) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - CMD_SP(sc) = NULL; - - spin_unlock_irqrestore(io_lock, flags); + io_lock = fnic_io_lock_tag(fnic, sc->request->tag); + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) && + !(CMD_FLAGS(sc) & FNIC_DEV_RST_DONE)) { /* - * If there is a scsi_cmnd associated with this io_req, then - * free the corresponding state + * We will be here only when FW completes reset + * without sending completions for outstanding ios. */ - start_time = io_req->start_time; - fnic_release_ioreq_buf(fnic, io_req, sc); - mempool_free(io_req, fnic->io_req_pool); - - sc->result = DID_TRANSPORT_DISRUPTED << 16; - FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, - "%s: tag:0x%x : sc:0x%p duration = %lu DID_TRANSPORT_DISRUPTED\n", - __func__, sc->request->tag, sc, - (jiffies - start_time)); - - if (atomic64_read(&fnic->io_cmpl_skip)) - atomic64_dec(&fnic->io_cmpl_skip); - else - atomic64_inc(&fnic_stats->io_stats.io_completions); - - /* Complete the command to SCSI */ - if (sc->scsi_done) { - if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED)) - shost_printk(KERN_ERR, fnic->lport->host, - "Calling done for IO not issued to fw: tag:0x%x sc:0x%p\n", - sc->request->tag, sc); - - FNIC_TRACE(fnic_cleanup_io, - sc->device->host->host_no, i, sc, - jiffies_to_msecs(jiffies - start_time), - 0, ((u64)sc->cmnd[0] << 32 | - (u64)sc->cmnd[2] << 24 | - (u64)sc->cmnd[3] << 16 | - (u64)sc->cmnd[4] << 8 | sc->cmnd[5]), - (((u64)CMD_FLAGS(sc) << 32) | CMD_STATE(sc))); - - sc->scsi_done(sc); - } + CMD_FLAGS(sc) |= FNIC_DEV_RST_DONE; + if (io_req && io_req->dr_done) + complete(io_req->dr_done); + else if (io_req && io_req->abts_done) + complete(io_req->abts_done); + spin_unlock_irqrestore(io_lock, flags); + return true; + } else if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) { + spin_unlock_irqrestore(io_lock, flags); + return true; } + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + goto cleanup_scsi_cmd; + } + + CMD_SP(sc) = NULL; + + spin_unlock_irqrestore(io_lock, flags); + + /* + * If there is a scsi_cmnd associated with this io_req, then + * free the corresponding state + */ + start_time = io_req->start_time; + fnic_release_ioreq_buf(fnic, io_req, sc); + mempool_free(io_req, fnic->io_req_pool); + +cleanup_scsi_cmd: + sc->result = DID_TRANSPORT_DISRUPTED << 16; + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "fnic_cleanup_io: tag:0x%x : sc:0x%p duration = %lu DID_TRANSPORT_DISRUPTED\n", + sc->request->tag, sc, (jiffies - start_time)); + + if (atomic64_read(&fnic->io_cmpl_skip)) + atomic64_dec(&fnic->io_cmpl_skip); + else + atomic64_inc(&fnic_stats->io_stats.io_completions); + + /* Complete the command to SCSI */ + if (sc->scsi_done) { + if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED)) + shost_printk(KERN_ERR, fnic->lport->host, + "Calling done for IO not issued to fw: tag:0x%x sc:0x%p\n", + sc->request->tag, sc); + + FNIC_TRACE(fnic_cleanup_io, + sc->device->host->host_no, sc->request->tag, sc, + jiffies_to_msecs(jiffies - start_time), + 0, ((u64)sc->cmnd[0] << 32 | + (u64)sc->cmnd[2] << 24 | + (u64)sc->cmnd[3] << 16 | + (u64)sc->cmnd[4] << 8 | sc->cmnd[5]), + (((u64)CMD_FLAGS(sc) << 32) | CMD_STATE(sc))); + + sc->scsi_done(sc); + } + return true; +} + +static void fnic_cleanup_io(struct fnic *fnic) +{ + scsi_host_busy_iter(fnic->lport->host, + fnic_cleanup_io_iter, fnic); } void fnic_wq_copy_cleanup_handler(struct vnic_wq_copy *wq, @@ -1555,20 +1555,121 @@ static inline int fnic_queue_abort_io_req(struct fnic *fnic, int tag, return 0; } -static void fnic_rport_exch_reset(struct fnic *fnic, u32 port_id) +struct fnic_rport_abort_io_iter_data { + struct fnic *fnic; + u32 port_id; + int term_cnt; +}; + +static bool fnic_rport_abort_io_iter(struct scsi_cmnd *sc, void *data, + bool reserved) { - int tag; - int abt_tag; - int term_cnt = 0; + struct fnic_rport_abort_io_iter_data *iter_data = data; + struct fnic *fnic = iter_data->fnic; + int abt_tag = sc->request->tag; struct fnic_io_req *io_req; spinlock_t *io_lock; unsigned long flags; - struct scsi_cmnd *sc; struct reset_stats *reset_stats = &fnic->fnic_stats.reset_stats; struct terminate_stats *term_stats = &fnic->fnic_stats.term_stats; struct scsi_lun fc_lun; enum fnic_ioreq_state old_ioreq_state; + io_lock = fnic_io_lock_tag(fnic, abt_tag); + spin_lock_irqsave(io_lock, flags); + + io_req = (struct fnic_io_req *)CMD_SP(sc); + + if (!io_req || io_req->port_id != iter_data->port_id) { + spin_unlock_irqrestore(io_lock, flags); + return true; + } + + if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) && + (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) { + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "fnic_rport_exch_reset dev rst not pending sc 0x%p\n", + sc); + spin_unlock_irqrestore(io_lock, flags); + return true; + } + + /* + * Found IO that is still pending with firmware and + * belongs to rport that went away + */ + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { + spin_unlock_irqrestore(io_lock, flags); + return true; + } + if (io_req->abts_done) { + shost_printk(KERN_ERR, fnic->lport->host, + "fnic_rport_exch_reset: io_req->abts_done is set " + "state is %s\n", + fnic_ioreq_state_to_str(CMD_STATE(sc))); + } + + if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED)) { + shost_printk(KERN_ERR, fnic->lport->host, + "rport_exch_reset " + "IO not yet issued %p tag 0x%x flags " + "%x state %d\n", + sc, abt_tag, CMD_FLAGS(sc), CMD_STATE(sc)); + } + old_ioreq_state = CMD_STATE(sc); + CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING; + CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE; + if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) { + atomic64_inc(&reset_stats->device_reset_terminates); + abt_tag |= FNIC_TAG_DEV_RST; + } + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "fnic_rport_exch_reset dev rst sc 0x%p\n", sc); + BUG_ON(io_req->abts_done); + + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "fnic_rport_reset_exch: Issuing abts\n"); + + spin_unlock_irqrestore(io_lock, flags); + + /* Now queue the abort command to firmware */ + int_to_scsilun(sc->device->lun, &fc_lun); + + if (fnic_queue_abort_io_req(fnic, abt_tag, + FCPIO_ITMF_ABT_TASK_TERM, + fc_lun.scsi_lun, io_req)) { + /* + * Revert the cmd state back to old state, if + * it hasn't changed in between. This cmd will get + * aborted later by scsi_eh, or cleaned up during + * lun reset + */ + spin_lock_irqsave(io_lock, flags); + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) + CMD_STATE(sc) = old_ioreq_state; + spin_unlock_irqrestore(io_lock, flags); + } else { + spin_lock_irqsave(io_lock, flags); + if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) + CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED; + else + CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED; + spin_unlock_irqrestore(io_lock, flags); + atomic64_inc(&term_stats->terminates); + iter_data->term_cnt++; + } + return true; +} + +static void fnic_rport_exch_reset(struct fnic *fnic, u32 port_id) +{ + struct terminate_stats *term_stats = &fnic->fnic_stats.term_stats; + struct fnic_rport_abort_io_iter_data iter_data = { + .fnic = fnic, + .port_id = port_id, + .term_cnt = 0, + }; + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, "fnic_rport_exch_reset called portid 0x%06x\n", @@ -1577,121 +1678,18 @@ static void fnic_rport_exch_reset(struct fnic *fnic, u32 port_id) if (fnic->in_remove) return; - for (tag = 0; tag < fnic->fnic_max_tag_id; tag++) { - abt_tag = tag; - io_lock = fnic_io_lock_tag(fnic, tag); - spin_lock_irqsave(io_lock, flags); - sc = scsi_host_find_tag(fnic->lport->host, tag); - if (!sc) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - io_req = (struct fnic_io_req *)CMD_SP(sc); - - if (!io_req || io_req->port_id != port_id) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) && - (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) { - FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, - "fnic_rport_exch_reset dev rst not pending sc 0x%p\n", - sc); - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - /* - * Found IO that is still pending with firmware and - * belongs to rport that went away - */ - if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - if (io_req->abts_done) { - shost_printk(KERN_ERR, fnic->lport->host, - "fnic_rport_exch_reset: io_req->abts_done is set " - "state is %s\n", - fnic_ioreq_state_to_str(CMD_STATE(sc))); - } - - if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED)) { - shost_printk(KERN_ERR, fnic->lport->host, - "rport_exch_reset " - "IO not yet issued %p tag 0x%x flags " - "%x state %d\n", - sc, tag, CMD_FLAGS(sc), CMD_STATE(sc)); - } - old_ioreq_state = CMD_STATE(sc); - CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING; - CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE; - if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) { - atomic64_inc(&reset_stats->device_reset_terminates); - abt_tag = (tag | FNIC_TAG_DEV_RST); - FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, - "fnic_rport_exch_reset dev rst sc 0x%p\n", - sc); - } - - BUG_ON(io_req->abts_done); - - FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, - "fnic_rport_reset_exch: Issuing abts\n"); - - spin_unlock_irqrestore(io_lock, flags); - - /* Now queue the abort command to firmware */ - int_to_scsilun(sc->device->lun, &fc_lun); - - if (fnic_queue_abort_io_req(fnic, abt_tag, - FCPIO_ITMF_ABT_TASK_TERM, - fc_lun.scsi_lun, io_req)) { - /* - * Revert the cmd state back to old state, if - * it hasn't changed in between. This cmd will get - * aborted later by scsi_eh, or cleaned up during - * lun reset - */ - spin_lock_irqsave(io_lock, flags); - if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) - CMD_STATE(sc) = old_ioreq_state; - spin_unlock_irqrestore(io_lock, flags); - } else { - spin_lock_irqsave(io_lock, flags); - if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) - CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED; - else - CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED; - spin_unlock_irqrestore(io_lock, flags); - atomic64_inc(&term_stats->terminates); - term_cnt++; - } - } - if (term_cnt > atomic64_read(&term_stats->max_terminates)) - atomic64_set(&term_stats->max_terminates, term_cnt); + scsi_host_busy_iter(fnic->lport->host, fnic_rport_abort_io_iter, + &iter_data); + if (iter_data.term_cnt > atomic64_read(&term_stats->max_terminates)) + atomic64_set(&term_stats->max_terminates, iter_data.term_cnt); } void fnic_terminate_rport_io(struct fc_rport *rport) { - int tag; - int abt_tag; - int term_cnt = 0; - struct fnic_io_req *io_req; - spinlock_t *io_lock; - unsigned long flags; - struct scsi_cmnd *sc; - struct scsi_lun fc_lun; struct fc_rport_libfc_priv *rdata; struct fc_lport *lport; struct fnic *fnic; - struct fc_rport *cmd_rport; - struct reset_stats *reset_stats; - struct terminate_stats *term_stats; - enum fnic_ioreq_state old_ioreq_state; if (!rport) { printk(KERN_ERR "fnic_terminate_rport_io: rport is NULL\n"); @@ -1719,108 +1717,7 @@ void fnic_terminate_rport_io(struct fc_rport *rport) if (fnic->in_remove) return; - reset_stats = &fnic->fnic_stats.reset_stats; - term_stats = &fnic->fnic_stats.term_stats; - - for (tag = 0; tag < fnic->fnic_max_tag_id; tag++) { - abt_tag = tag; - io_lock = fnic_io_lock_tag(fnic, tag); - spin_lock_irqsave(io_lock, flags); - sc = scsi_host_find_tag(fnic->lport->host, tag); - if (!sc) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - io_req = (struct fnic_io_req *)CMD_SP(sc); - if (!io_req) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - cmd_rport = starget_to_rport(scsi_target(sc->device)); - if (rport != cmd_rport) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) && - (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) { - FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, - "fnic_terminate_rport_io dev rst not pending sc 0x%p\n", - sc); - spin_unlock_irqrestore(io_lock, flags); - continue; - } - /* - * Found IO that is still pending with firmware and - * belongs to rport that went away - */ - if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - if (io_req->abts_done) { - shost_printk(KERN_ERR, fnic->lport->host, - "fnic_terminate_rport_io: io_req->abts_done is set " - "state is %s\n", - fnic_ioreq_state_to_str(CMD_STATE(sc))); - } - if (!(CMD_FLAGS(sc) & FNIC_IO_ISSUED)) { - FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host, - "fnic_terminate_rport_io " - "IO not yet issued %p tag 0x%x flags " - "%x state %d\n", - sc, tag, CMD_FLAGS(sc), CMD_STATE(sc)); - } - old_ioreq_state = CMD_STATE(sc); - CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING; - CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE; - if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) { - atomic64_inc(&reset_stats->device_reset_terminates); - abt_tag = (tag | FNIC_TAG_DEV_RST); - FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, - "fnic_terminate_rport_io dev rst sc 0x%p\n", sc); - } - - BUG_ON(io_req->abts_done); - - FNIC_SCSI_DBG(KERN_DEBUG, - fnic->lport->host, - "fnic_terminate_rport_io: Issuing abts\n"); - - spin_unlock_irqrestore(io_lock, flags); - - /* Now queue the abort command to firmware */ - int_to_scsilun(sc->device->lun, &fc_lun); - - if (fnic_queue_abort_io_req(fnic, abt_tag, - FCPIO_ITMF_ABT_TASK_TERM, - fc_lun.scsi_lun, io_req)) { - /* - * Revert the cmd state back to old state, if - * it hasn't changed in between. This cmd will get - * aborted later by scsi_eh, or cleaned up during - * lun reset - */ - spin_lock_irqsave(io_lock, flags); - if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) - CMD_STATE(sc) = old_ioreq_state; - spin_unlock_irqrestore(io_lock, flags); - } else { - spin_lock_irqsave(io_lock, flags); - if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) - CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED; - else - CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED; - spin_unlock_irqrestore(io_lock, flags); - atomic64_inc(&term_stats->terminates); - term_cnt++; - } - } - if (term_cnt > atomic64_read(&term_stats->max_terminates)) - atomic64_set(&term_stats->max_terminates, term_cnt); - + fnic_rport_exch_reset(fnic, rport->port_id); } /* @@ -2115,6 +2012,156 @@ lr_io_req_end: return ret; } +struct fnic_pending_aborts_iter_data { + struct fnic *fnic; + struct scsi_cmnd *lr_sc; + struct scsi_device *lun_dev; + int ret; +}; + +static bool fnic_pending_aborts_iter(struct scsi_cmnd *sc, + void *data, bool reserved) +{ + struct fnic_pending_aborts_iter_data *iter_data = data; + struct fnic *fnic = iter_data->fnic; + struct scsi_device *lun_dev = iter_data->lun_dev; + int abt_tag = sc->request->tag; + struct fnic_io_req *io_req; + spinlock_t *io_lock; + unsigned long flags; + struct scsi_lun fc_lun; + DECLARE_COMPLETION_ONSTACK(tm_done); + enum fnic_ioreq_state old_ioreq_state; + + if (sc == iter_data->lr_sc || sc->device != lun_dev) + return true; + if (reserved) + return true; + + io_lock = fnic_io_lock_tag(fnic, abt_tag); + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + return true; + } + + /* + * Found IO that is still pending with firmware and + * belongs to the LUN that we are resetting + */ + FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, + "Found IO in %s on lun\n", + fnic_ioreq_state_to_str(CMD_STATE(sc))); + + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { + spin_unlock_irqrestore(io_lock, flags); + return true; + } + if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) && + (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) { + FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host, + "%s dev rst not pending sc 0x%p\n", __func__, + sc); + spin_unlock_irqrestore(io_lock, flags); + return true; + } + + if (io_req->abts_done) + shost_printk(KERN_ERR, fnic->lport->host, + "%s: io_req->abts_done is set state is %s\n", + __func__, fnic_ioreq_state_to_str(CMD_STATE(sc))); + old_ioreq_state = CMD_STATE(sc); + /* + * Any pending IO issued prior to reset is expected to be + * in abts pending state, if not we need to set + * FNIC_IOREQ_ABTS_PENDING to indicate the IO is abort pending. + * When IO is completed, the IO will be handed over and + * handled in this function. + */ + CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING; + + BUG_ON(io_req->abts_done); + + if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) { + abt_tag |= FNIC_TAG_DEV_RST; + FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host, + "%s: dev rst sc 0x%p\n", __func__, sc); + } + + CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE; + io_req->abts_done = &tm_done; + spin_unlock_irqrestore(io_lock, flags); + + /* Now queue the abort command to firmware */ + int_to_scsilun(sc->device->lun, &fc_lun); + + if (fnic_queue_abort_io_req(fnic, abt_tag, + FCPIO_ITMF_ABT_TASK_TERM, + fc_lun.scsi_lun, io_req)) { + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (io_req) + io_req->abts_done = NULL; + if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) + CMD_STATE(sc) = old_ioreq_state; + spin_unlock_irqrestore(io_lock, flags); + iter_data->ret = FAILED; + return false; + } else { + spin_lock_irqsave(io_lock, flags); + if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) + CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED; + spin_unlock_irqrestore(io_lock, flags); + } + CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED; + + wait_for_completion_timeout(&tm_done, msecs_to_jiffies + (fnic->config.ed_tov)); + + /* Recheck cmd state to check if it is now aborted */ + spin_lock_irqsave(io_lock, flags); + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_REQ_NULL; + return true; + } + + io_req->abts_done = NULL; + + /* if abort is still pending with fw, fail */ + if (CMD_ABTS_STATUS(sc) == FCPIO_INVALID_CODE) { + spin_unlock_irqrestore(io_lock, flags); + CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_DONE; + iter_data->ret = FAILED; + return false; + } + CMD_STATE(sc) = FNIC_IOREQ_ABTS_COMPLETE; + + /* original sc used for lr is handled by dev reset code */ + if (sc != iter_data->lr_sc) + CMD_SP(sc) = NULL; + spin_unlock_irqrestore(io_lock, flags); + + /* original sc used for lr is handled by dev reset code */ + if (sc != iter_data->lr_sc) { + fnic_release_ioreq_buf(fnic, io_req, sc); + mempool_free(io_req, fnic->io_req_pool); + } + + /* + * Any IO is returned during reset, it needs to call scsi_done + * to return the scsi_cmnd to upper layer. + */ + if (sc->scsi_done) { + /* Set result to let upper SCSI layer retry */ + sc->result = DID_RESET << 16; + sc->scsi_done(sc); + } + return true; +} + /* * Clean up any pending aborts on the lun * For each outstanding IO on this lun, whose abort is not completed by fw, @@ -2123,157 +2170,25 @@ lr_io_req_end: */ static int fnic_clean_pending_aborts(struct fnic *fnic, struct scsi_cmnd *lr_sc, - bool new_sc) + bool new_sc) { - int tag, abt_tag; - struct fnic_io_req *io_req; - spinlock_t *io_lock; - unsigned long flags; - int ret = 0; - struct scsi_cmnd *sc; - struct scsi_lun fc_lun; - struct scsi_device *lun_dev = lr_sc->device; - DECLARE_COMPLETION_ONSTACK(tm_done); - enum fnic_ioreq_state old_ioreq_state; + int ret = SUCCESS; + struct fnic_pending_aborts_iter_data iter_data = { + .fnic = fnic, + .lun_dev = lr_sc->device, + .ret = SUCCESS, + }; - for (tag = 0; tag < fnic->fnic_max_tag_id; tag++) { - io_lock = fnic_io_lock_tag(fnic, tag); - spin_lock_irqsave(io_lock, flags); - sc = scsi_host_find_tag(fnic->lport->host, tag); - /* - * ignore this lun reset cmd if issued using new SC - * or cmds that do not belong to this lun - */ - if (!sc || ((sc == lr_sc) && new_sc) || sc->device != lun_dev) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } + if (new_sc) + iter_data.lr_sc = lr_sc; - io_req = (struct fnic_io_req *)CMD_SP(sc); - - if (!io_req || sc->device != lun_dev) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - /* - * Found IO that is still pending with firmware and - * belongs to the LUN that we are resetting - */ - FNIC_SCSI_DBG(KERN_DEBUG, fnic->lport->host, - "Found IO in %s on lun\n", - fnic_ioreq_state_to_str(CMD_STATE(sc))); - - if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - if ((CMD_FLAGS(sc) & FNIC_DEVICE_RESET) && - (!(CMD_FLAGS(sc) & FNIC_DEV_RST_ISSUED))) { - FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host, - "%s dev rst not pending sc 0x%p\n", __func__, - sc); - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - if (io_req->abts_done) - shost_printk(KERN_ERR, fnic->lport->host, - "%s: io_req->abts_done is set state is %s\n", - __func__, fnic_ioreq_state_to_str(CMD_STATE(sc))); - old_ioreq_state = CMD_STATE(sc); - /* - * Any pending IO issued prior to reset is expected to be - * in abts pending state, if not we need to set - * FNIC_IOREQ_ABTS_PENDING to indicate the IO is abort pending. - * When IO is completed, the IO will be handed over and - * handled in this function. - */ - CMD_STATE(sc) = FNIC_IOREQ_ABTS_PENDING; - - BUG_ON(io_req->abts_done); - - abt_tag = tag; - if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) { - abt_tag |= FNIC_TAG_DEV_RST; - FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host, - "%s: dev rst sc 0x%p\n", __func__, sc); - } - - CMD_ABTS_STATUS(sc) = FCPIO_INVALID_CODE; - io_req->abts_done = &tm_done; - spin_unlock_irqrestore(io_lock, flags); - - /* Now queue the abort command to firmware */ - int_to_scsilun(sc->device->lun, &fc_lun); - - if (fnic_queue_abort_io_req(fnic, abt_tag, - FCPIO_ITMF_ABT_TASK_TERM, - fc_lun.scsi_lun, io_req)) { - spin_lock_irqsave(io_lock, flags); - io_req = (struct fnic_io_req *)CMD_SP(sc); - if (io_req) - io_req->abts_done = NULL; - if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) - CMD_STATE(sc) = old_ioreq_state; - spin_unlock_irqrestore(io_lock, flags); - ret = 1; - goto clean_pending_aborts_end; - } else { - spin_lock_irqsave(io_lock, flags); - if (CMD_FLAGS(sc) & FNIC_DEVICE_RESET) - CMD_FLAGS(sc) |= FNIC_DEV_RST_TERM_ISSUED; - spin_unlock_irqrestore(io_lock, flags); - } - CMD_FLAGS(sc) |= FNIC_IO_INTERNAL_TERM_ISSUED; - - wait_for_completion_timeout(&tm_done, - msecs_to_jiffies - (fnic->config.ed_tov)); - - /* Recheck cmd state to check if it is now aborted */ - spin_lock_irqsave(io_lock, flags); - io_req = (struct fnic_io_req *)CMD_SP(sc); - if (!io_req) { - spin_unlock_irqrestore(io_lock, flags); - CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_REQ_NULL; - continue; - } - - io_req->abts_done = NULL; - - /* if abort is still pending with fw, fail */ - if (CMD_ABTS_STATUS(sc) == FCPIO_INVALID_CODE) { - spin_unlock_irqrestore(io_lock, flags); - CMD_FLAGS(sc) |= FNIC_IO_ABT_TERM_DONE; - ret = 1; - goto clean_pending_aborts_end; - } - CMD_STATE(sc) = FNIC_IOREQ_ABTS_COMPLETE; - - /* original sc used for lr is handled by dev reset code */ - if (sc != lr_sc) - CMD_SP(sc) = NULL; - spin_unlock_irqrestore(io_lock, flags); - - /* original sc used for lr is handled by dev reset code */ - if (sc != lr_sc) { - fnic_release_ioreq_buf(fnic, io_req, sc); - mempool_free(io_req, fnic->io_req_pool); - } - - /* - * Any IO is returned during reset, it needs to call scsi_done - * to return the scsi_cmnd to upper layer. - */ - if (sc->scsi_done) { - /* Set result to let upper SCSI layer retry */ - sc->result = DID_RESET << 16; - sc->scsi_done(sc); - } + scsi_host_busy_iter(fnic->lport->host, + fnic_pending_aborts_iter, &iter_data); + if (iter_data.ret == FAILED) { + ret = iter_data.ret; + goto clean_pending_aborts_end; } - schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); /* walk again to check, if IOs are still pending in fw */ @@ -2772,6 +2687,49 @@ call_fc_exch_mgr_reset: } +static bool fnic_abts_pending_iter(struct scsi_cmnd *sc, void *data, + bool reserved) +{ + struct fnic_pending_aborts_iter_data *iter_data = data; + struct fnic *fnic = iter_data->fnic; + int cmd_state; + struct fnic_io_req *io_req; + spinlock_t *io_lock; + unsigned long flags; + + /* + * ignore this lun reset cmd or cmds that do not belong to + * this lun + */ + if (iter_data->lr_sc && sc == iter_data->lr_sc) + return true; + if (iter_data->lun_dev && sc->device != iter_data->lun_dev) + return true; + + io_lock = fnic_io_lock_hash(fnic, sc); + spin_lock_irqsave(io_lock, flags); + + io_req = (struct fnic_io_req *)CMD_SP(sc); + if (!io_req) { + spin_unlock_irqrestore(io_lock, flags); + return true; + } + + /* + * Found IO that is still pending with firmware and + * belongs to the LUN that we are resetting + */ + FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host, + "Found IO in %s on lun\n", + fnic_ioreq_state_to_str(CMD_STATE(sc))); + cmd_state = CMD_STATE(sc); + spin_unlock_irqrestore(io_lock, flags); + if (cmd_state == FNIC_IOREQ_ABTS_PENDING) + iter_data->ret = 1; + + return iter_data->ret ? false : true; +} + /* * fnic_is_abts_pending() is a helper function that * walks through tag map to check if there is any IOs pending,if there is one, @@ -2781,49 +2739,20 @@ call_fc_exch_mgr_reset: */ int fnic_is_abts_pending(struct fnic *fnic, struct scsi_cmnd *lr_sc) { - int tag; - struct fnic_io_req *io_req; - spinlock_t *io_lock; - unsigned long flags; - int ret = 0; - struct scsi_cmnd *sc; - struct scsi_device *lun_dev = NULL; + struct fnic_pending_aborts_iter_data iter_data = { + .fnic = fnic, + .lun_dev = NULL, + .ret = 0, + }; - if (lr_sc) - lun_dev = lr_sc->device; - - /* walk again to check, if IOs are still pending in fw */ - for (tag = 0; tag < fnic->fnic_max_tag_id; tag++) { - sc = scsi_host_find_tag(fnic->lport->host, tag); - /* - * ignore this lun reset cmd or cmds that do not belong to - * this lun - */ - if (!sc || (lr_sc && (sc->device != lun_dev || sc == lr_sc))) - continue; - - io_lock = fnic_io_lock_hash(fnic, sc); - spin_lock_irqsave(io_lock, flags); - - io_req = (struct fnic_io_req *)CMD_SP(sc); - - if (!io_req || sc->device != lun_dev) { - spin_unlock_irqrestore(io_lock, flags); - continue; - } - - /* - * Found IO that is still pending with firmware and - * belongs to the LUN that we are resetting - */ - FNIC_SCSI_DBG(KERN_INFO, fnic->lport->host, - "Found IO in %s on lun\n", - fnic_ioreq_state_to_str(CMD_STATE(sc))); - - if (CMD_STATE(sc) == FNIC_IOREQ_ABTS_PENDING) - ret = 1; - spin_unlock_irqrestore(io_lock, flags); + if (lr_sc) { + iter_data.lun_dev = lr_sc->device; + iter_data.lr_sc = lr_sc; } - return ret; + /* walk again to check, if IOs are still pending in fw */ + scsi_host_busy_iter(fnic->lport->host, + fnic_abts_pending_iter, &iter_data); + + return iter_data.ret; } From a712b307cfde6dbe0d4829293afb1566beb30a9a Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 30 Apr 2021 10:17:39 -0700 Subject: [PATCH 0868/1379] drm/msm/dpu: Delete bonkers code dpu_crtc_atomic_flush() was directly poking it's attached planes in a code path that ended up in dpu_plane_atomic_update(), even if the plane was not involved in the current atomic update. While a bit dubious, this worked before because plane->state would always point to something valid. But now using drm_atomic_get_new_plane_state() we could get a NULL state pointer instead, leading to: [ 20.873273] Call trace: [ 20.875740] dpu_plane_atomic_update+0x5c/0xed0 [ 20.880311] dpu_plane_restore+0x40/0x88 [ 20.884266] dpu_crtc_atomic_flush+0xf4/0x208 [ 20.888660] drm_atomic_helper_commit_planes+0x150/0x238 [ 20.894014] msm_atomic_commit_tail+0x1d4/0x7a0 [ 20.898579] commit_tail+0xa4/0x168 [ 20.902102] drm_atomic_helper_commit+0x164/0x178 [ 20.906841] drm_atomic_commit+0x54/0x60 [ 20.910798] drm_atomic_connector_commit_dpms+0x10c/0x118 [ 20.916236] drm_mode_obj_set_property_ioctl+0x1e4/0x440 [ 20.921588] drm_connector_property_set_ioctl+0x60/0x88 [ 20.926852] drm_ioctl_kernel+0xd0/0x120 [ 20.930807] drm_ioctl+0x21c/0x478 [ 20.934235] __arm64_sys_ioctl+0xa8/0xe0 [ 20.938193] invoke_syscall+0x64/0x130 [ 20.941977] el0_svc_common.constprop.3+0x5c/0xe0 [ 20.946716] do_el0_svc+0x80/0xa0 [ 20.950058] el0_svc+0x20/0x30 [ 20.953145] el0_sync_handler+0x88/0xb0 [ 20.957014] el0_sync+0x13c/0x140 The reason for the codepath seems dubious, the atomic suspend/resume heplers should handle the power-collapse case. If not, the CRTC's atomic_check() should be adding the planes to the atomic update. Reported-by: Stephen Boyd Reported-by: John Stultz Fixes: 37418bf14c13 ("drm: Use state helper instead of the plane state pointer") Tested-by: John Stultz Signed-off-by: Rob Clark Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20210430171744.1721408-1-robdclark@gmail.com --- drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c | 10 ---------- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 16 ---------------- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h | 6 ------ 3 files changed, 32 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c index 9607a7644d4b..e65ff8168269 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_crtc.c @@ -566,16 +566,6 @@ static void dpu_crtc_atomic_flush(struct drm_crtc *crtc, if (unlikely(!cstate->num_mixers)) return; - /* - * For planes without commit update, drm framework will not add - * those planes to current state since hardware update is not - * required. However, if those planes were power collapsed since - * last commit cycle, driver has to restore the hardware state - * of those planes explicitly here prior to plane flush. - */ - drm_atomic_crtc_for_each_plane(plane, crtc) - dpu_plane_restore(plane, state); - /* update performance setting before crtc kickoff */ dpu_core_perf_crtc_update(crtc, 1, false); diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index df7f3d3afd8b..7a993547eb75 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -1258,22 +1258,6 @@ static void dpu_plane_atomic_update(struct drm_plane *plane, } } -void dpu_plane_restore(struct drm_plane *plane, struct drm_atomic_state *state) -{ - struct dpu_plane *pdpu; - - if (!plane || !plane->state) { - DPU_ERROR("invalid plane\n"); - return; - } - - pdpu = to_dpu_plane(plane); - - DPU_DEBUG_PLANE(pdpu, "\n"); - - dpu_plane_atomic_update(plane, state); -} - static void dpu_plane_destroy(struct drm_plane *plane) { struct dpu_plane *pdpu = plane ? to_dpu_plane(plane) : NULL; diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h index 03b6365a750c..34e03ac05f4a 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.h @@ -84,12 +84,6 @@ bool is_dpu_plane_virtual(struct drm_plane *plane); void dpu_plane_get_ctl_flush(struct drm_plane *plane, struct dpu_hw_ctl *ctl, u32 *flush_sspp); -/** - * dpu_plane_restore - restore hw state if previously power collapsed - * @plane: Pointer to drm plane structure - */ -void dpu_plane_restore(struct drm_plane *plane, struct drm_atomic_state *state); - /** * dpu_plane_flush - final plane operations before commit flush * @plane: Pointer to drm plane structure From 266fd994b2b0ab7ba3e5541868838ce30775964b Mon Sep 17 00:00:00 2001 From: Sami Loone Date: Sat, 1 May 2021 12:07:53 +0200 Subject: [PATCH 0869/1379] ALSA: hda/realtek: ALC285 Thinkpad jack pin quirk is unreachable In 9bbb94e57df1 ("ALSA: hda/realtek: fix static noise on ALC285 Lenovo laptops") an existing Lenovo quirk was made more generic by removing a 0x12 pin requirement from the entry. This made the second chance table Thinkpad jack entry unreachable as the pin configurations became identical. Revert the 0x12 pin requirement removal and move Thinkpad jack pin quirk back to the primary pin table as they can co-exist when more specific configurations come first. Add a more targeted pin quirk for Lenovo devices that have 0x12 as 0x40000000. Tested on Yoga 6 (AMD) laptop. [ Corrected the commit ID -- tiwai ] Fixes: 9bbb94e57df1 ("ALSA: hda/realtek: fix static noise on ALC285 Lenovo laptops") Signed-off-by: Sami Loone Cc: Link: https://lore.kernel.org/r/YI0oefvTYn8URYDb@yoga Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index bd7bfd7c9ee7..a0fa99a31c23 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8801,6 +8801,16 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x19, 0x03a11020}, {0x21, 0x0321101f}), SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE, + {0x12, 0x90a60130}, + {0x14, 0x90170110}, + {0x19, 0x04a11040}, + {0x21, 0x04211020}), + SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE, + {0x14, 0x90170110}, + {0x19, 0x04a11040}, + {0x1d, 0x40600001}, + {0x21, 0x04211020}), + SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK, {0x14, 0x90170110}, {0x19, 0x04a11040}, {0x21, 0x04211020}), @@ -8971,10 +8981,6 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = { SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB, {0x19, 0x40000000}, {0x1a, 0x40000000}), - SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK, - {0x14, 0x90170110}, - {0x19, 0x04a11040}, - {0x21, 0x04211020}), {} }; From 64b9f64f80a6f4b7ea51bf0510119cb15e801dc6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 23 Feb 2021 14:19:05 +0800 Subject: [PATCH 0870/1379] vdpa: introduce virtio pci driver This patch introduce a vDPA driver for virtio-pci device. It bridges the virtio-pci control command to the vDPA bus. This will be used for features prototyping and testing. Note that get/restore virtqueue state is not supported which needs extension on the virtio specification. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210223061905.422659-4-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/Makefile | 1 + drivers/vdpa/Kconfig | 7 + drivers/vdpa/Makefile | 1 + drivers/vdpa/virtio_pci/Makefile | 2 + drivers/vdpa/virtio_pci/vp_vdpa.c | 458 ++++++++++++++++++++++++++++++ 5 files changed, 469 insertions(+) create mode 100644 drivers/vdpa/virtio_pci/Makefile create mode 100644 drivers/vdpa/virtio_pci/vp_vdpa.c diff --git a/drivers/Makefile b/drivers/Makefile index 6fba7daba591..5ba932437869 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_DMADEVICES) += dma/ obj-y += soc/ obj-$(CONFIG_VIRTIO) += virtio/ +obj-$(CONFIG_VIRTIO_PCI_LIB) += virtio/ obj-$(CONFIG_VDPA) += vdpa/ obj-$(CONFIG_XEN) += xen/ diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index ffd1e098bfd2..a245809c99d0 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -52,4 +52,11 @@ config MLX5_VDPA_NET be executed by the hardware. It also supports a variety of stateless offloads depending on the actual device used and firmware version. +config VP_VDPA + tristate "Virtio PCI bridge vDPA driver" + select VIRTIO_PCI_LIB + depends on PCI_MSI + help + This kernel module bridges virtio PCI device to vDPA bus. + endif # VDPA diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile index d160e9b63a66..67fe7f3d6943 100644 --- a/drivers/vdpa/Makefile +++ b/drivers/vdpa/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_VDPA) += vdpa.o obj-$(CONFIG_VDPA_SIM) += vdpa_sim/ obj-$(CONFIG_IFCVF) += ifcvf/ obj-$(CONFIG_MLX5_VDPA) += mlx5/ +obj-$(CONFIG_VP_VDPA) += virtio_pci/ diff --git a/drivers/vdpa/virtio_pci/Makefile b/drivers/vdpa/virtio_pci/Makefile new file mode 100644 index 000000000000..231088d3af7d --- /dev/null +++ b/drivers/vdpa/virtio_pci/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_VP_VDPA) += vp_vdpa.o diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c new file mode 100644 index 000000000000..1321a2fcd088 --- /dev/null +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -0,0 +1,458 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vDPA bridge driver for modern virtio-pci device + * + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + * Author: Jason Wang + * + * Based on virtio_pci_modern.c. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VP_VDPA_QUEUE_MAX 256 +#define VP_VDPA_DRIVER_NAME "vp_vdpa" +#define VP_VDPA_NAME_SIZE 256 + +struct vp_vring { + void __iomem *notify; + char msix_name[VP_VDPA_NAME_SIZE]; + struct vdpa_callback cb; + int irq; +}; + +struct vp_vdpa { + struct vdpa_device vdpa; + struct virtio_pci_modern_device mdev; + struct vp_vring *vring; + struct vdpa_callback config_cb; + char msix_name[VP_VDPA_NAME_SIZE]; + int config_irq; + int queues; + int vectors; +}; + +static struct vp_vdpa *vdpa_to_vp(struct vdpa_device *vdpa) +{ + return container_of(vdpa, struct vp_vdpa, vdpa); +} + +static struct virtio_pci_modern_device *vdpa_to_mdev(struct vdpa_device *vdpa) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + return &vp_vdpa->mdev; +} + +static u64 vp_vdpa_get_features(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_get_features(mdev); +} + +static int vp_vdpa_set_features(struct vdpa_device *vdpa, u64 features) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + vp_modern_set_features(mdev, features); + + return 0; +} + +static u8 vp_vdpa_get_status(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_get_status(mdev); +} + +static void vp_vdpa_free_irq(struct vp_vdpa *vp_vdpa) +{ + struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev; + struct pci_dev *pdev = mdev->pci_dev; + int i; + + for (i = 0; i < vp_vdpa->queues; i++) { + if (vp_vdpa->vring[i].irq != VIRTIO_MSI_NO_VECTOR) { + vp_modern_queue_vector(mdev, i, VIRTIO_MSI_NO_VECTOR); + devm_free_irq(&pdev->dev, vp_vdpa->vring[i].irq, + &vp_vdpa->vring[i]); + vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; + } + } + + if (vp_vdpa->config_irq != VIRTIO_MSI_NO_VECTOR) { + vp_modern_config_vector(mdev, VIRTIO_MSI_NO_VECTOR); + devm_free_irq(&pdev->dev, vp_vdpa->config_irq, vp_vdpa); + vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR; + } + + if (vp_vdpa->vectors) { + pci_free_irq_vectors(pdev); + vp_vdpa->vectors = 0; + } +} + +static irqreturn_t vp_vdpa_vq_handler(int irq, void *arg) +{ + struct vp_vring *vring = arg; + + if (vring->cb.callback) + return vring->cb.callback(vring->cb.private); + + return IRQ_HANDLED; +} + +static irqreturn_t vp_vdpa_config_handler(int irq, void *arg) +{ + struct vp_vdpa *vp_vdpa = arg; + + if (vp_vdpa->config_cb.callback) + return vp_vdpa->config_cb.callback(vp_vdpa->config_cb.private); + + return IRQ_HANDLED; +} + +static int vp_vdpa_request_irq(struct vp_vdpa *vp_vdpa) +{ + struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev; + struct pci_dev *pdev = mdev->pci_dev; + int i, ret, irq; + int queues = vp_vdpa->queues; + int vectors = queues + 1; + + ret = pci_alloc_irq_vectors(pdev, vectors, vectors, PCI_IRQ_MSIX); + if (ret != vectors) { + dev_err(&pdev->dev, + "vp_vdpa: fail to allocate irq vectors want %d but %d\n", + vectors, ret); + return ret; + } + + vp_vdpa->vectors = vectors; + + for (i = 0; i < queues; i++) { + snprintf(vp_vdpa->vring[i].msix_name, VP_VDPA_NAME_SIZE, + "vp-vdpa[%s]-%d\n", pci_name(pdev), i); + irq = pci_irq_vector(pdev, i); + ret = devm_request_irq(&pdev->dev, irq, + vp_vdpa_vq_handler, + 0, vp_vdpa->vring[i].msix_name, + &vp_vdpa->vring[i]); + if (ret) { + dev_err(&pdev->dev, + "vp_vdpa: fail to request irq for vq %d\n", i); + goto err; + } + vp_modern_queue_vector(mdev, i, i); + vp_vdpa->vring[i].irq = irq; + } + + snprintf(vp_vdpa->msix_name, VP_VDPA_NAME_SIZE, "vp-vdpa[%s]-config\n", + pci_name(pdev)); + irq = pci_irq_vector(pdev, queues); + ret = devm_request_irq(&pdev->dev, irq, vp_vdpa_config_handler, 0, + vp_vdpa->msix_name, vp_vdpa); + if (ret) { + dev_err(&pdev->dev, + "vp_vdpa: fail to request irq for vq %d\n", i); + goto err; + } + vp_modern_config_vector(mdev, queues); + vp_vdpa->config_irq = irq; + + return 0; +err: + vp_vdpa_free_irq(vp_vdpa); + return ret; +} + +static void vp_vdpa_set_status(struct vdpa_device *vdpa, u8 status) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev; + u8 s = vp_vdpa_get_status(vdpa); + + if (status & VIRTIO_CONFIG_S_DRIVER_OK && + !(s & VIRTIO_CONFIG_S_DRIVER_OK)) { + vp_vdpa_request_irq(vp_vdpa); + } + + vp_modern_set_status(mdev, status); + + if (!(status & VIRTIO_CONFIG_S_DRIVER_OK) && + (s & VIRTIO_CONFIG_S_DRIVER_OK)) + vp_vdpa_free_irq(vp_vdpa); +} + +static u16 vp_vdpa_get_vq_num_max(struct vdpa_device *vdpa) +{ + return VP_VDPA_QUEUE_MAX; +} + +static int vp_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 qid, + struct vdpa_vq_state *state) +{ + /* Note that this is not supported by virtio specification, so + * we return -EOPNOTSUPP here. This means we can't support live + * migration, vhost device start/stop. + */ + return -EOPNOTSUPP; +} + +static int vp_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 qid, + const struct vdpa_vq_state *state) +{ + /* Note that this is not supported by virtio specification, so + * we return -ENOPOTSUPP here. This means we can't support live + * migration, vhost device start/stop. + */ + return -EOPNOTSUPP; +} + +static void vp_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 qid, + struct vdpa_callback *cb) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + vp_vdpa->vring[qid].cb = *cb; +} + +static void vp_vdpa_set_vq_ready(struct vdpa_device *vdpa, + u16 qid, bool ready) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + vp_modern_set_queue_enable(mdev, qid, ready); +} + +static bool vp_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 qid) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_get_queue_enable(mdev, qid); +} + +static void vp_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 qid, + u32 num) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + vp_modern_set_queue_size(mdev, qid, num); +} + +static int vp_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 qid, + u64 desc_area, u64 driver_area, + u64 device_area) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + vp_modern_queue_address(mdev, qid, desc_area, + driver_area, device_area); + + return 0; +} + +static void vp_vdpa_kick_vq(struct vdpa_device *vdpa, u16 qid) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + vp_iowrite16(qid, vp_vdpa->vring[qid].notify); +} + +static u32 vp_vdpa_get_generation(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_generation(mdev); +} + +static u32 vp_vdpa_get_device_id(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return mdev->id.device; +} + +static u32 vp_vdpa_get_vendor_id(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return mdev->id.vendor; +} + +static u32 vp_vdpa_get_vq_align(struct vdpa_device *vdpa) +{ + return PAGE_SIZE; +} + +static void vp_vdpa_get_config(struct vdpa_device *vdpa, + unsigned int offset, + void *buf, unsigned int len) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev; + u8 old, new; + u8 *p; + int i; + + do { + old = vp_ioread8(&mdev->common->config_generation); + p = buf; + for (i = 0; i < len; i++) + *p++ = vp_ioread8(mdev->device + offset + i); + + new = vp_ioread8(&mdev->common->config_generation); + } while (old != new); +} + +static void vp_vdpa_set_config(struct vdpa_device *vdpa, + unsigned int offset, const void *buf, + unsigned int len) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev; + const u8 *p = buf; + int i; + + for (i = 0; i < len; i++) + vp_iowrite8(*p++, mdev->device + offset + i); +} + +static void vp_vdpa_set_config_cb(struct vdpa_device *vdpa, + struct vdpa_callback *cb) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + vp_vdpa->config_cb = *cb; +} + +static const struct vdpa_config_ops vp_vdpa_ops = { + .get_features = vp_vdpa_get_features, + .set_features = vp_vdpa_set_features, + .get_status = vp_vdpa_get_status, + .set_status = vp_vdpa_set_status, + .get_vq_num_max = vp_vdpa_get_vq_num_max, + .get_vq_state = vp_vdpa_get_vq_state, + .set_vq_state = vp_vdpa_set_vq_state, + .set_vq_cb = vp_vdpa_set_vq_cb, + .set_vq_ready = vp_vdpa_set_vq_ready, + .get_vq_ready = vp_vdpa_get_vq_ready, + .set_vq_num = vp_vdpa_set_vq_num, + .set_vq_address = vp_vdpa_set_vq_address, + .kick_vq = vp_vdpa_kick_vq, + .get_generation = vp_vdpa_get_generation, + .get_device_id = vp_vdpa_get_device_id, + .get_vendor_id = vp_vdpa_get_vendor_id, + .get_vq_align = vp_vdpa_get_vq_align, + .get_config = vp_vdpa_get_config, + .set_config = vp_vdpa_set_config, + .set_config_cb = vp_vdpa_set_config_cb, +}; + +static void vp_vdpa_free_irq_vectors(void *data) +{ + pci_free_irq_vectors(data); +} + +static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct virtio_pci_modern_device *mdev; + struct device *dev = &pdev->dev; + struct vp_vdpa *vp_vdpa; + u16 notify_off; + int ret, i; + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, + dev, &vp_vdpa_ops, NULL); + if (vp_vdpa == NULL) { + dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); + return -ENOMEM; + } + + mdev = &vp_vdpa->mdev; + mdev->pci_dev = pdev; + + ret = vp_modern_probe(mdev); + if (ret) { + dev_err(&pdev->dev, "Failed to probe modern PCI device\n"); + goto err; + } + + pci_set_master(pdev); + pci_set_drvdata(pdev, vp_vdpa); + + vp_vdpa->vdpa.dma_dev = &pdev->dev; + vp_vdpa->queues = vp_modern_get_num_queues(mdev); + + ret = devm_add_action_or_reset(dev, vp_vdpa_free_irq_vectors, pdev); + if (ret) { + dev_err(&pdev->dev, + "Failed for adding devres for freeing irq vectors\n"); + goto err; + } + + vp_vdpa->vring = devm_kcalloc(&pdev->dev, vp_vdpa->queues, + sizeof(*vp_vdpa->vring), + GFP_KERNEL); + if (!vp_vdpa->vring) { + ret = -ENOMEM; + dev_err(&pdev->dev, "Fail to allocate virtqueues\n"); + goto err; + } + + for (i = 0; i < vp_vdpa->queues; i++) { + notify_off = vp_modern_get_queue_notify_off(mdev, i); + vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; + vp_vdpa->vring[i].notify = mdev->notify_base + + notify_off * mdev->notify_offset_multiplier; + } + vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR; + + ret = vdpa_register_device(&vp_vdpa->vdpa, vp_vdpa->queues); + if (ret) { + dev_err(&pdev->dev, "Failed to register to vdpa bus\n"); + goto err; + } + + return 0; + +err: + put_device(&vp_vdpa->vdpa.dev); + return ret; +} + +static void vp_vdpa_remove(struct pci_dev *pdev) +{ + struct vp_vdpa *vp_vdpa = pci_get_drvdata(pdev); + + vdpa_unregister_device(&vp_vdpa->vdpa); + vp_modern_remove(&vp_vdpa->mdev); +} + +static struct pci_driver vp_vdpa_driver = { + .name = "vp-vdpa", + .id_table = NULL, /* only dynamic ids */ + .probe = vp_vdpa_probe, + .remove = vp_vdpa_remove, +}; + +module_pci_driver(vp_vdpa_driver); + +MODULE_AUTHOR("Jason Wang "); +MODULE_DESCRIPTION("vp-vdpa"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1"); From 58926c8aab104daa49f35b9fcf664d95c22c8ac7 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 8 Apr 2021 12:13:21 +0300 Subject: [PATCH 0871/1379] vdpa/mlx5: Enable user to add/delete vdpa device Allow to control vdpa device creation and destruction using the vdpa management tool. Examples: 1. List the management devices $ vdpa mgmtdev show pci/0000:3b:00.1: supported_classes net 2. Create vdpa instance $ vdpa dev add mgmtdev pci/0000:3b:00.1 name vdpa0 3. Show vdpa devices $ vdpa dev show vdpa0: type network mgmtdev pci/0000:3b:00.1 vendor_id 5555 max_vqs 16 \ max_vq_size 256 Signed-off-by: Eli Cohen Reviewed-by: Parav Pandit Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210408091320.4600-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 79 +++++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 9 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 4d2809c7d4e3..25533db01f5f 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1974,23 +1974,32 @@ static void init_mvqs(struct mlx5_vdpa_net *ndev) } } -static int mlx5v_probe(struct auxiliary_device *adev, - const struct auxiliary_device_id *id) +struct mlx5_vdpa_mgmtdev { + struct vdpa_mgmt_dev mgtdev; + struct mlx5_adev *madev; + struct mlx5_vdpa_net *ndev; +}; + +static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) { - struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev); - struct mlx5_core_dev *mdev = madev->mdev; + struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev); struct virtio_net_config *config; struct mlx5_vdpa_dev *mvdev; struct mlx5_vdpa_net *ndev; + struct mlx5_core_dev *mdev; u32 max_vqs; int err; + if (mgtdev->ndev) + return -ENOSPC; + + mdev = mgtdev->madev->mdev; /* we save one virtqueue for control virtqueue should we require it */ max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues); max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, - NULL); + name); if (IS_ERR(ndev)) return PTR_ERR(ndev); @@ -2017,11 +2026,12 @@ static int mlx5v_probe(struct auxiliary_device *adev, if (err) goto err_res; - err = vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs)); + mvdev->vdev.mdev = &mgtdev->mgtdev; + err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs)); if (err) goto err_reg; - dev_set_drvdata(&adev->dev, ndev); + mgtdev->ndev = ndev; return 0; err_reg: @@ -2034,11 +2044,62 @@ err_mtu: return err; } +static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev) +{ + struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev); + + _vdpa_unregister_device(dev); + mgtdev->ndev = NULL; +} + +static const struct vdpa_mgmtdev_ops mdev_ops = { + .dev_add = mlx5_vdpa_dev_add, + .dev_del = mlx5_vdpa_dev_del, +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static int mlx5v_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) + +{ + struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev); + struct mlx5_core_dev *mdev = madev->mdev; + struct mlx5_vdpa_mgmtdev *mgtdev; + int err; + + mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL); + if (!mgtdev) + return -ENOMEM; + + mgtdev->mgtdev.ops = &mdev_ops; + mgtdev->mgtdev.device = mdev->device; + mgtdev->mgtdev.id_table = id_table; + mgtdev->madev = madev; + + err = vdpa_mgmtdev_register(&mgtdev->mgtdev); + if (err) + goto reg_err; + + dev_set_drvdata(&adev->dev, mgtdev); + + return 0; + +reg_err: + kfree(mgtdev); + return err; +} + static void mlx5v_remove(struct auxiliary_device *adev) { - struct mlx5_vdpa_dev *mvdev = dev_get_drvdata(&adev->dev); + struct mlx5_vdpa_mgmtdev *mgtdev; - vdpa_unregister_device(&mvdev->vdev); + mgtdev = dev_get_drvdata(&adev->dev); + vdpa_mgmtdev_unregister(&mgtdev->mgtdev); + kfree(mgtdev); } static const struct auxiliary_device_id mlx5v_id_table[] = { From d0f9164eb294aeb884cbe36ddbbae34fa0124aa1 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 6 Apr 2021 20:04:44 +0300 Subject: [PATCH 0872/1379] vdpa: Follow kdoc comment style Follow comment style mentioned in the Writing kernel-doc document [1]. Following warnings are fixed. $ scripts/kernel-doc -v -none include/linux/vdpa.h include/linux/vdpa.h:11: warning: missing initial short description on line: * vDPA callback definition. include/linux/vdpa.h:11: info: Scanning doc for vDPA include/linux/vdpa.h:15: warning: cannot understand function prototype: 'struct vdpa_callback ' include/linux/vdpa.h:21: warning: missing initial short description on line: * vDPA notification area include/linux/vdpa.h:21: info: Scanning doc for vDPA include/linux/vdpa.h:25: warning: cannot understand function prototype: 'struct vdpa_notification_area ' include/linux/vdpa.h:31: warning: missing initial short description on line: * vDPA vq_state definition include/linux/vdpa.h:31: info: Scanning doc for vDPA include/linux/vdpa.h:34: warning: cannot understand function prototype: 'struct vdpa_vq_state ' include/linux/vdpa.h:41: info: Scanning doc for vDPA device include/linux/vdpa.h:51: warning: cannot understand function prototype: 'struct vdpa_device ' include/linux/vdpa.h:62: info: Scanning doc for vDPA IOVA range include/linux/vdpa.h:66: warning: cannot understand function prototype: 'struct vdpa_iova_range ' include/linux/vdpa.h:72: info: Scanning doc for vDPA_config_ops include/linux/vdpa.h:203: warning: cannot understand function prototype: 'struct vdpa_config_ops ' include/linux/vdpa.h:270: info: Scanning doc for vdpa_driver include/linux/vdpa.h:275: warning: cannot understand function prototype: 'struct vdpa_driver ' include/linux/vdpa.h:347: info: Scanning doc for vdpa_mgmtdev_ops include/linux/vdpa.h:360: warning: cannot understand function prototype: 'struct vdpa_mgmtdev_ops ' After this fix: scripts/kernel-doc -v -none include/linux/vdpa.h include/linux/vdpa.h:11: info: Scanning doc for struct vdpa_calllback include/linux/vdpa.h:21: info: Scanning doc for struct vdpa_notification_area include/linux/vdpa.h:31: info: Scanning doc for struct vdpa_vq_state include/linux/vdpa.h:41: info: Scanning doc for struct vdpa_device include/linux/vdpa.h:62: info: Scanning doc for struct vdpa_iova_range include/linux/vdpa.h:72: info: Scanning doc for struct vdpa_config_ops include/linux/vdpa.h:270: info: Scanning doc for struct vdpa_driver include/linux/vdpa.h:347: info: Scanning doc for struct vdpa_mgmtdev_ops [1] https://www.kernel.org/doc/html/latest/doc-guide/kernel-doc.html Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Link: https://lore.kernel.org/r/20210406170457.98481-2-parav@nvidia.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Acked-by: Jason Wang --- include/linux/vdpa.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 15fa085fab05..37b65ca940cf 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -8,7 +8,7 @@ #include /** - * vDPA callback definition. + * struct vdpa_calllback - vDPA callback definition. * @callback: interrupt callback function * @private: the data passed to the callback function */ @@ -18,7 +18,7 @@ struct vdpa_callback { }; /** - * vDPA notification area + * struct vdpa_notification_area - vDPA notification area * @addr: base address of the notification area * @size: size of the notification area */ @@ -28,7 +28,7 @@ struct vdpa_notification_area { }; /** - * vDPA vq_state definition + * struct vdpa_vq_state - vDPA vq_state definition * @avail_index: available index */ struct vdpa_vq_state { @@ -38,7 +38,7 @@ struct vdpa_vq_state { struct vdpa_mgmt_dev; /** - * vDPA device - representation of a vDPA device + * struct vdpa_device - representation of a vDPA device * @dev: underlying device * @dma_dev: the actual device that is performing DMA * @config: the configuration ops for this device. @@ -59,7 +59,7 @@ struct vdpa_device { }; /** - * vDPA IOVA range - the IOVA range support by the device + * struct vdpa_iova_range - the IOVA range support by the device * @first: start of the IOVA range * @last: end of the IOVA range */ @@ -69,7 +69,7 @@ struct vdpa_iova_range { }; /** - * vDPA_config_ops - operations for configuring a vDPA device. + * struct vdpa_config_ops - operations for configuring a vDPA device. * Note: vDPA device drivers are required to implement all of the * operations unless it is mentioned to be optional in the following * list. @@ -267,7 +267,7 @@ int _vdpa_register_device(struct vdpa_device *vdev, int nvqs); void _vdpa_unregister_device(struct vdpa_device *vdev); /** - * vdpa_driver - operations for a vDPA driver + * struct vdpa_driver - operations for a vDPA driver * @driver: underlying device driver * @probe: the function to call when a device is found. Returns 0 or -errno. * @remove: the function to call when a device is removed. @@ -344,18 +344,18 @@ static inline void vdpa_get_config(struct vdpa_device *vdev, unsigned offset, } /** - * vdpa_mgmtdev_ops - vdpa device ops - * @dev_add: Add a vdpa device using alloc and register - * @mdev: parent device to use for device addition - * @name: name of the new vdpa device - * Driver need to add a new device using _vdpa_register_device() - * after fully initializing the vdpa device. Driver must return 0 - * on success or appropriate error code. - * @dev_del: Remove a vdpa device using unregister - * @mdev: parent device to use for device removal - * @dev: vdpa device to remove - * Driver need to remove the specified device by calling - * _vdpa_unregister_device(). + * struct vdpa_mgmtdev_ops - vdpa device ops + * @dev_add: Add a vdpa device using alloc and register + * @mdev: parent device to use for device addition + * @name: name of the new vdpa device + * Driver need to add a new device using _vdpa_register_device() + * after fully initializing the vdpa device. Driver must return 0 + * on success or appropriate error code. + * @dev_del: Remove a vdpa device using unregister + * @mdev: parent device to use for device removal + * @dev: vdpa device to remove + * Driver need to remove the specified device by calling + * _vdpa_unregister_device(). */ struct vdpa_mgmtdev_ops { int (*dev_add)(struct vdpa_mgmt_dev *mdev, const char *name); From c0a54b4bcb457232d5dce36ffbcd31d201ba3332 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 6 Apr 2021 20:04:45 +0300 Subject: [PATCH 0873/1379] vdpa: Follow kdoc comment style Follow comment style mentioned in the Writing kernel-doc document [1]. Following warnings are fixed. $ scripts/kernel-doc -v -none drivers/vdpa/vdpa.c drivers/vdpa/vdpa.c:67: info: Scanning doc for __vdpa_alloc_device drivers/vdpa/vdpa.c:84: warning: No description found for return value of '__vdpa_alloc_device' drivers/vdpa/vdpa.c:153: info: Scanning doc for _vdpa_register_device drivers/vdpa/vdpa.c:163: warning: No description found for return value of '_vdpa_register_device' drivers/vdpa/vdpa.c:172: info: Scanning doc for vdpa_register_device drivers/vdpa/vdpa.c:180: warning: No description found for return value of 'vdpa_register_device' drivers/vdpa/vdpa.c:191: info: Scanning doc for _vdpa_unregister_device drivers/vdpa/vdpa.c:205: info: Scanning doc for vdpa_unregister_device drivers/vdpa/vdpa.c:217: info: Scanning doc for __vdpa_register_driver drivers/vdpa/vdpa.c:224: warning: No description found for return value of '__vdpa_register_driver' drivers/vdpa/vdpa.c:233: info: Scanning doc for vdpa_unregister_driver drivers/vdpa/vdpa.c:243: info: Scanning doc for vdpa_mgmtdev_register drivers/vdpa/vdpa.c:250: warning: No description found for return value of 'vdpa_mgmtdev_register' After the fix: scripts/kernel-doc -v -none drivers/vdpa/vdpa.c drivers/vdpa/vdpa.c:67: info: Scanning doc for __vdpa_alloc_device drivers/vdpa/vdpa.c:153: info: Scanning doc for _vdpa_register_device drivers/vdpa/vdpa.c:172: info: Scanning doc for vdpa_register_device drivers/vdpa/vdpa.c:191: info: Scanning doc for _vdpa_unregister_device drivers/vdpa/vdpa.c:205: info: Scanning doc for vdpa_unregister_device drivers/vdpa/vdpa.c:217: info: Scanning doc for __vdpa_register_driver drivers/vdpa/vdpa.c:233: info: Scanning doc for vdpa_unregister_driver drivers/vdpa/vdpa.c:243: info: Scanning doc for vdpa_mgmtdev_register [1] https://www.kernel.org/doc/html/latest/doc-guide/kernel-doc.html Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Link: https://lore.kernel.org/r/20210406170457.98481-3-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 5cffce67cab0..bb3f1d1f0422 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -75,8 +75,8 @@ static void vdpa_release_dev(struct device *d) * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. * - * Returns an error when parent/config/dma_dev is not set or fail to get - * ida. + * Return: Returns an error when parent/config/dma_dev is not set or fail to get + * ida. */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, @@ -157,7 +157,7 @@ static int __vdpa_register_device(struct vdpa_device *vdev, int nvqs) * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * - * Returns an error when fail to add device to vDPA bus + * Return: Returns an error when fail to add device to vDPA bus */ int _vdpa_register_device(struct vdpa_device *vdev, int nvqs) { @@ -174,7 +174,7 @@ EXPORT_SYMBOL_GPL(_vdpa_register_device); * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * - * Returns an error when fail to add to vDPA bus + * Return: Returns an error when fail to add to vDPA bus */ int vdpa_register_device(struct vdpa_device *vdev, int nvqs) { @@ -218,7 +218,7 @@ EXPORT_SYMBOL_GPL(vdpa_unregister_device); * @drv: the vdpa device driver to be registered * @owner: module owner of the driver * - * Returns an err when fail to do the registration + * Return: Returns an err when fail to do the registration */ int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner) { @@ -245,6 +245,8 @@ EXPORT_SYMBOL_GPL(vdpa_unregister_driver); * @mdev: Pointer to vdpa management device * vdpa_mgmtdev_register() register a vdpa management device which supports * vdpa device management. + * Return: Returns 0 on success or failure when required callback ops are not + * initialized. */ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) { From 3fd02fbbfac0dabb624606d1303d309f34ec15d4 Mon Sep 17 00:00:00 2001 From: Liu Xiang Date: Sat, 27 Mar 2021 11:17:10 +0800 Subject: [PATCH 0874/1379] virtio-balloon: fix a typo in comment of virtballoon_migratepage() Typo: compation --> compaction Signed-off-by: Liu Xiang Link: https://lore.kernel.org/r/20210327031710.16151-1-liu.xiang@zlingsmart.com Signed-off-by: Michael S. Tsirkin Reviewed-by: David Hildenbrand --- drivers/virtio/virtio_balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 8985fc2cea86..510e9318854d 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -734,7 +734,7 @@ static void report_free_page_func(struct work_struct *work) #ifdef CONFIG_BALLOON_COMPACTION /* * virtballoon_migratepage - perform the balloon page migration on behalf of - * a compation thread. (called under page lock) + * a compaction thread. (called under page lock) * @vb_dev_info: the balloon device * @newpage: page that will replace the isolated page after migration finishes. * @page : the isolated (old) page that is about to be migrated to newpage. From 122b84a1267aec28ab929edae1ac700a03fb65e0 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 2 May 2021 12:33:19 +0300 Subject: [PATCH 0875/1379] virtio-net: don't allocate control_buf if not supported Not all virtio_net devices support the ctrl queue feature. Thus, there is no need to allocate unused resources. Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20210502093319.61313-1-mgurtovoy@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/net/virtio_net.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 0824e6999e49..ac0c143f97b4 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2801,9 +2801,13 @@ static int virtnet_alloc_queues(struct virtnet_info *vi) { int i; - vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL); - if (!vi->ctrl) - goto err_ctrl; + if (vi->has_cvq) { + vi->ctrl = kzalloc(sizeof(*vi->ctrl), GFP_KERNEL); + if (!vi->ctrl) + goto err_ctrl; + } else { + vi->ctrl = NULL; + } vi->sq = kcalloc(vi->max_queue_pairs, sizeof(*vi->sq), GFP_KERNEL); if (!vi->sq) goto err_sq; From 9e3bb9b79a7131a088cfffbdcc30e747dad9d090 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:41 -0400 Subject: [PATCH 0876/1379] virtio_pci_modern: introduce helper to map vq notify area This patch factors out the logic of vq notify area mapping. Following patches will switch to use this common helpers for both virtio_pci library and virtio-pci vDPA driver. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-2-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- drivers/virtio/virtio_pci_modern_dev.c | 35 ++++++++++++++++++++++++++ include/linux/virtio_pci_modern.h | 2 ++ 2 files changed, 37 insertions(+) diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index cbd667496bb1..28cb5847fafa 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -593,6 +593,41 @@ u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, } EXPORT_SYMBOL_GPL(vp_modern_get_queue_notify_off); +/* + * vp_modern_map_vq_notify - map notification area for a + * specific virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns the address of the notification area + */ +void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, + u16 index) +{ + u16 off = vp_modern_get_queue_notify_off(mdev, index); + + if (mdev->notify_base) { + /* offset should not wrap */ + if ((u64)off * mdev->notify_offset_multiplier + 2 + > mdev->notify_len) { + dev_warn(&mdev->pci_dev->dev, + "bad notification offset %u (x %u) " + "for queue %u > %zd", + off, mdev->notify_offset_multiplier, + index, mdev->notify_len); + return NULL; + } + return (void __force *)mdev->notify_base + + off * mdev->notify_offset_multiplier; + } else { + return (void __force *)vp_modern_map_capability(mdev, + mdev->notify_map_cap, 2, 2, + off * mdev->notify_offset_multiplier, 2, + NULL); + } +} +EXPORT_SYMBOL_GPL(vp_modern_map_vq_notify); + MODULE_VERSION("0.1"); MODULE_DESCRIPTION("Modern Virtio PCI Device"); MODULE_AUTHOR("Jason Wang "); diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index f26acbeec965..1b95d39b00fc 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -106,6 +106,8 @@ void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, in u32 align, u32 start, u32 size, size_t *len); +void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, + u16 index); int vp_modern_probe(struct virtio_pci_modern_device *mdev); void vp_modern_remove(struct virtio_pci_modern_device *mdev); #endif From 7dca6c0ea96b9e583ebcf95fe3c14ba3385f467b Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:42 -0400 Subject: [PATCH 0877/1379] virtio-pci library: switch to use vp_modern_map_vq_notify() This patch switch to use vp_modern_map_notify() for virtio-pci library. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-3-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- drivers/virtio/virtio_pci_modern.c | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index fbd4ebc00eb6..29607d9bd95c 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -192,7 +192,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, struct virtio_pci_modern_device *mdev = &vp_dev->mdev; struct virtqueue *vq; - u16 num, off; + u16 num; int err; if (index >= vp_modern_get_num_queues(mdev)) @@ -208,9 +208,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, return ERR_PTR(-EINVAL); } - /* get offset of notification word for this vq */ - off = vp_modern_get_queue_notify_off(mdev, index); - info->msix_vector = msix_vec; /* create the vring */ @@ -227,27 +224,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, virtqueue_get_avail_addr(vq), virtqueue_get_used_addr(vq)); - if (mdev->notify_base) { - /* offset should not wrap */ - if ((u64)off * mdev->notify_offset_multiplier + 2 - > mdev->notify_len) { - dev_warn(&mdev->pci_dev->dev, - "bad notification offset %u (x %u) " - "for queue %u > %zd", - off, mdev->notify_offset_multiplier, - index, mdev->notify_len); - err = -EINVAL; - goto err_map_notify; - } - vq->priv = (void __force *)mdev->notify_base + - off * mdev->notify_offset_multiplier; - } else { - vq->priv = (void __force *)vp_modern_map_capability(mdev, - mdev->notify_map_cap, 2, 2, - off * mdev->notify_offset_multiplier, 2, - NULL); - } - + vq->priv = vp_modern_map_vq_notify(mdev, index); if (!vq->priv) { err = -ENOMEM; goto err_map_notify; From 11d8ffed00b231356008b35a3b0bc192e42333fa Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:43 -0400 Subject: [PATCH 0878/1379] vp_vdpa: switch to use vp_modern_map_vq_notify() This patch switches to use vp_vdpa to use vp_modern_map_notify(). Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-4-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- drivers/vdpa/virtio_pci/vp_vdpa.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 1321a2fcd088..2afc90645660 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -369,7 +369,6 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) struct virtio_pci_modern_device *mdev; struct device *dev = &pdev->dev; struct vp_vdpa *vp_vdpa; - u16 notify_off; int ret, i; ret = pcim_enable_device(pdev); @@ -415,10 +414,12 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) } for (i = 0; i < vp_vdpa->queues; i++) { - notify_off = vp_modern_get_queue_notify_off(mdev, i); vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; - vp_vdpa->vring[i].notify = mdev->notify_base + - notify_off * mdev->notify_offset_multiplier; + vp_vdpa->vring[i].notify = vp_modern_map_vq_notify(mdev, i); + if (!vp_vdpa->vring[i].notify) { + dev_warn(&pdev->dev, "Fail to map vq notify %d\n", i); + goto err; + } } vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR; From a5f7a24f49d81fab9f59611814a8817cc8a876a2 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:44 -0400 Subject: [PATCH 0879/1379] virtio_pci_modern: hide vp_modern_get_queue_notify_off() All users (both virtio-pci library and vp_vdpa driver) has been switched to use vp_modern_map_vq_notify(). So there's no need to export the low level helper of vp_modern_get_queue_notify_off(). Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-5-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- drivers/virtio/virtio_pci_modern_dev.c | 5 ++--- include/linux/virtio_pci_modern.h | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 28cb5847fafa..5a657e56b46d 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -584,14 +584,13 @@ EXPORT_SYMBOL_GPL(vp_modern_get_num_queues); * * Returns the notification offset for a virtqueue */ -u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, - u16 index) +static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, + u16 index) { vp_iowrite16(index, &mdev->common->queue_select); return vp_ioread16(&mdev->common->queue_notify_off); } -EXPORT_SYMBOL_GPL(vp_modern_get_queue_notify_off); /* * vp_modern_map_vq_notify - map notification area for a diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index 1b95d39b00fc..179a2fb4bf37 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -99,8 +99,6 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 idx); u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev); -u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, - u16 idx); void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, size_t minlen, u32 align, From fd466b36940b22a506265edf12714bd0cf9ed836 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:45 -0400 Subject: [PATCH 0880/1379] virito_pci libray: hide vp_modern_map_capability() No user now and the capability should not be setup externally. Instead, every access to the capability should be done via virtio_pci_modern_device. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-6-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- drivers/virtio/virtio_pci_modern_dev.c | 10 ++++------ include/linux/virtio_pci_modern.h | 5 ----- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 5a657e56b46d..9c241c9bd920 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -16,11 +16,10 @@ * * Returns the io address of for the part of the capability */ -void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, - size_t minlen, - u32 align, - u32 start, u32 size, - size_t *len) +static void __iomem * +vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, + size_t minlen, u32 align, u32 start, u32 size, + size_t *len) { struct pci_dev *dev = mdev->pci_dev; u8 bar; @@ -90,7 +89,6 @@ void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, in length, offset, bar); return p; } -EXPORT_SYMBOL_GPL(vp_modern_map_capability); /** * virtio_pci_find_capability - walk capabilities to find device info. diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index 179a2fb4bf37..e6e7072413c1 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -99,11 +99,6 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 idx); u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev); -void __iomem *vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, - size_t minlen, - u32 align, - u32 start, u32 size, - size_t *len); void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, u16 index); int vp_modern_probe(struct virtio_pci_modern_device *mdev); From 9e311bcad73dc14bd0a736db6ad3d382227e11fe Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:46 -0400 Subject: [PATCH 0881/1379] virtio-pci library: report resource address Sometimes it might be useful to report the capability physical address. One example is to report the physical address of the doorbell in order to be mapped by userspace. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-7-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/virtio_pci/vp_vdpa.c | 3 ++- drivers/virtio/virtio_pci_modern.c | 2 +- drivers/virtio/virtio_pci_modern_dev.c | 24 +++++++++++++++++------- include/linux/virtio_pci_modern.h | 4 +++- 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 2afc90645660..98205e54d089 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -415,7 +415,8 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) for (i = 0; i < vp_vdpa->queues; i++) { vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; - vp_vdpa->vring[i].notify = vp_modern_map_vq_notify(mdev, i); + vp_vdpa->vring[i].notify = + vp_modern_map_vq_notify(mdev, i, NULL); if (!vp_vdpa->vring[i].notify) { dev_warn(&pdev->dev, "Fail to map vq notify %d\n", i); goto err; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 29607d9bd95c..722ea44e7579 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -224,7 +224,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, virtqueue_get_avail_addr(vq), virtqueue_get_used_addr(vq)); - vq->priv = vp_modern_map_vq_notify(mdev, index); + vq->priv = vp_modern_map_vq_notify(mdev, index, NULL); if (!vq->priv) { err = -ENOMEM; goto err_map_notify; diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index 9c241c9bd920..ae87b3fa8858 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -13,13 +13,14 @@ * @start: start from the capability * @size: map size * @len: the length that is actually mapped + * @pa: physical address of the capability * * Returns the io address of for the part of the capability */ static void __iomem * vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, size_t minlen, u32 align, u32 start, u32 size, - size_t *len) + size_t *len, resource_size_t *pa) { struct pci_dev *dev = mdev->pci_dev; u8 bar; @@ -87,6 +88,9 @@ vp_modern_map_capability(struct virtio_pci_modern_device *mdev, int off, dev_err(&dev->dev, "virtio_pci: unable to map virtio %u@%u on bar %i\n", length, offset, bar); + else if (pa) + *pa = pci_resource_start(dev, bar) + offset; + return p; } @@ -273,12 +277,12 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev) mdev->common = vp_modern_map_capability(mdev, common, sizeof(struct virtio_pci_common_cfg), 4, 0, sizeof(struct virtio_pci_common_cfg), - NULL); + NULL, NULL); if (!mdev->common) goto err_map_common; mdev->isr = vp_modern_map_capability(mdev, isr, sizeof(u8), 1, 0, 1, - NULL); + NULL, NULL); if (!mdev->isr) goto err_map_isr; @@ -306,7 +310,8 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev) mdev->notify_base = vp_modern_map_capability(mdev, notify, 2, 2, 0, notify_length, - &mdev->notify_len); + &mdev->notify_len, + &mdev->notify_pa); if (!mdev->notify_base) goto err_map_notify; } else { @@ -319,7 +324,8 @@ int vp_modern_probe(struct virtio_pci_modern_device *mdev) if (device) { mdev->device = vp_modern_map_capability(mdev, device, 0, 4, 0, PAGE_SIZE, - &mdev->device_len); + &mdev->device_len, + NULL); if (!mdev->device) goto err_map_device; } @@ -595,11 +601,12 @@ static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, * specific virtqueue * @mdev: the modern virtio-pci device * @index: the queue index + * @pa: the pointer to the physical address of the nofity area * * Returns the address of the notification area */ void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, - u16 index) + u16 index, resource_size_t *pa) { u16 off = vp_modern_get_queue_notify_off(mdev, index); @@ -614,13 +621,16 @@ void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, index, mdev->notify_len); return NULL; } + if (pa) + *pa = mdev->notify_pa + + off * mdev->notify_offset_multiplier; return (void __force *)mdev->notify_base + off * mdev->notify_offset_multiplier; } else { return (void __force *)vp_modern_map_capability(mdev, mdev->notify_map_cap, 2, 2, off * mdev->notify_offset_multiplier, 2, - NULL); + NULL, pa); } } EXPORT_SYMBOL_GPL(vp_modern_map_vq_notify); diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index e6e7072413c1..cdfabbefacdf 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -13,6 +13,8 @@ struct virtio_pci_modern_device { void __iomem *device; /* Base of vq notifications (non-legacy mode). */ void __iomem *notify_base; + /* Physical base of vq notifications */ + resource_size_t notify_pa; /* Where to read and clear interrupt */ u8 __iomem *isr; @@ -100,7 +102,7 @@ u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 idx); u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev); void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, - u16 index); + u16 index, resource_size_t *pa); int vp_modern_probe(struct virtio_pci_modern_device *mdev); void vp_modern_remove(struct virtio_pci_modern_device *mdev); #endif From 526cb8580bc6b9e5bc14cc5d24ecf4633a84cfa1 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 15 Apr 2021 03:31:47 -0400 Subject: [PATCH 0882/1379] vp_vdpa: report doorbell address This patch reports the per vq doorbell location and size to vDPA bus. Userspace can then map the doorbell via mmap() via vhost-vDPA bus driver. Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210415073147.19331-8-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Eli Cohen --- drivers/vdpa/virtio_pci/vp_vdpa.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 98205e54d089..002b928d0ca1 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -26,6 +26,7 @@ struct vp_vring { void __iomem *notify; char msix_name[VP_VDPA_NAME_SIZE]; struct vdpa_callback cb; + resource_size_t notify_pa; int irq; }; @@ -336,6 +337,19 @@ static void vp_vdpa_set_config_cb(struct vdpa_device *vdpa, vp_vdpa->config_cb = *cb; } +static struct vdpa_notification_area +vp_vdpa_get_vq_notification(struct vdpa_device *vdpa, u16 qid) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev; + struct vdpa_notification_area notify; + + notify.addr = vp_vdpa->vring[qid].notify_pa; + notify.size = mdev->notify_offset_multiplier; + + return notify; +} + static const struct vdpa_config_ops vp_vdpa_ops = { .get_features = vp_vdpa_get_features, .set_features = vp_vdpa_set_features, @@ -343,6 +357,7 @@ static const struct vdpa_config_ops vp_vdpa_ops = { .set_status = vp_vdpa_set_status, .get_vq_num_max = vp_vdpa_get_vq_num_max, .get_vq_state = vp_vdpa_get_vq_state, + .get_vq_notification = vp_vdpa_get_vq_notification, .set_vq_state = vp_vdpa_set_vq_state, .set_vq_cb = vp_vdpa_set_vq_cb, .set_vq_ready = vp_vdpa_set_vq_ready, @@ -416,7 +431,8 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) for (i = 0; i < vp_vdpa->queues; i++) { vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; vp_vdpa->vring[i].notify = - vp_modern_map_vq_notify(mdev, i, NULL); + vp_modern_map_vq_notify(mdev, i, + &vp_vdpa->vring[i].notify_pa); if (!vp_vdpa->vring[i].notify) { dev_warn(&pdev->dev, "Fail to map vq notify %d\n", i); goto err; From 3a3e0fad16d40a2aa68ddf7eea4acdf48b22dd44 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 13 Apr 2021 17:15:57 +0800 Subject: [PATCH 0883/1379] vhost-vdpa: fix vm_flags for virtqueue doorbell mapping The virtqueue doorbell is usually implemented via registeres but we don't provide the necessary vma->flags like VM_PFNMAP. This may cause several issues e.g when userspace tries to map the doorbell via vhost IOTLB, kernel may panic due to the page is not backed by page structure. This patch fixes this by setting the necessary vm_flags. With this patch, try to map doorbell via IOTLB will fail with bad address. Cc: stable@vger.kernel.org Fixes: ddd89d0a059d ("vhost_vdpa: support doorbell mapping via mmap") Signed-off-by: Jason Wang Link: https://lore.kernel.org/r/20210413091557.29008-1-jasowang@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index bfa4c6ef554e..c79d2f2387aa 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -993,6 +993,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start != notify.size) return -ENOTSUPP; + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &vhost_vdpa_vm_ops; return 0; } From 0ecb1960788d0ce627e246303a31843c1f496bed Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Wed, 17 Mar 2021 17:49:27 +0800 Subject: [PATCH 0884/1379] vDPA/ifcvf: get_vendor_id returns a device specific vendor id In this commit, ifcvf_get_vendor_id() will return a device specific vendor id of the probed pci device than a hard code. Signed-off-by: Zhu Lingshan Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210317094933.16417-2-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/ifcvf/ifcvf_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index d555a6a5d1ba..f5633ab78952 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -324,7 +324,10 @@ static u32 ifcvf_vdpa_get_device_id(struct vdpa_device *vdpa_dev) static u32 ifcvf_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev) { - return IFCVF_SUBSYS_VENDOR_ID; + struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); + struct pci_dev *pdev = adapter->pdev; + + return pdev->subsystem_vendor; } static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) From 139c3fd9c9fc52ea5b8a347046993000afe36c06 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Wed, 17 Mar 2021 17:49:28 +0800 Subject: [PATCH 0885/1379] vDPA/ifcvf: enable Intel C5000X-PL virtio-net for vDPA This commit enabled Intel FPGA SmartNIC C5000X-PL virtio-net for vDPA Signed-off-by: Zhu Lingshan Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210317094933.16417-3-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/ifcvf/ifcvf_base.h | 5 +++++ drivers/vdpa/ifcvf/ifcvf_main.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index 64696d63fe07..75d9a8052039 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -23,6 +23,11 @@ #define IFCVF_SUBSYS_VENDOR_ID 0x8086 #define IFCVF_SUBSYS_DEVICE_ID 0x001A +#define C5000X_PL_VENDOR_ID 0x1AF4 +#define C5000X_PL_DEVICE_ID 0x1000 +#define C5000X_PL_SUBSYS_VENDOR_ID 0x8086 +#define C5000X_PL_SUBSYS_DEVICE_ID 0x0001 + #define IFCVF_SUPPORTED_FEATURES \ ((1ULL << VIRTIO_NET_F_MAC) | \ (1ULL << VIRTIO_F_ANY_LAYOUT) | \ diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index f5633ab78952..ded5a0303dbc 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -483,6 +483,11 @@ static struct pci_device_id ifcvf_pci_ids[] = { IFCVF_DEVICE_ID, IFCVF_SUBSYS_VENDOR_ID, IFCVF_SUBSYS_DEVICE_ID) }, + { PCI_DEVICE_SUB(C5000X_PL_VENDOR_ID, + C5000X_PL_DEVICE_ID, + C5000X_PL_SUBSYS_VENDOR_ID, + C5000X_PL_SUBSYS_DEVICE_ID) }, + { 0 }, }; MODULE_DEVICE_TABLE(pci, ifcvf_pci_ids); From 51fc387b67cb25416757f7a889bab328cc0faf37 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Wed, 17 Mar 2021 17:49:29 +0800 Subject: [PATCH 0886/1379] vDPA/ifcvf: rename original IFCVF dev ids to N3000 ids IFCVF driver probes multiple types of devices now, to distinguish the original device driven by IFCVF from others, it is renamed as "N3000". Signed-off-by: Zhu Lingshan Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210317094933.16417-4-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/ifcvf/ifcvf_base.h | 8 ++++---- drivers/vdpa/ifcvf/ifcvf_main.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index 75d9a8052039..794d1505d857 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -18,10 +18,10 @@ #include #include -#define IFCVF_VENDOR_ID 0x1AF4 -#define IFCVF_DEVICE_ID 0x1041 -#define IFCVF_SUBSYS_VENDOR_ID 0x8086 -#define IFCVF_SUBSYS_DEVICE_ID 0x001A +#define N3000_VENDOR_ID 0x1AF4 +#define N3000_DEVICE_ID 0x1041 +#define N3000_SUBSYS_VENDOR_ID 0x8086 +#define N3000_SUBSYS_DEVICE_ID 0x001A #define C5000X_PL_VENDOR_ID 0x1AF4 #define C5000X_PL_DEVICE_ID 0x1000 diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index ded5a0303dbc..6e766d5c4f11 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -479,10 +479,10 @@ static void ifcvf_remove(struct pci_dev *pdev) } static struct pci_device_id ifcvf_pci_ids[] = { - { PCI_DEVICE_SUB(IFCVF_VENDOR_ID, - IFCVF_DEVICE_ID, - IFCVF_SUBSYS_VENDOR_ID, - IFCVF_SUBSYS_DEVICE_ID) }, + { PCI_DEVICE_SUB(N3000_VENDOR_ID, + N3000_DEVICE_ID, + N3000_SUBSYS_VENDOR_ID, + N3000_SUBSYS_DEVICE_ID) }, { PCI_DEVICE_SUB(C5000X_PL_VENDOR_ID, C5000X_PL_DEVICE_ID, C5000X_PL_SUBSYS_VENDOR_ID, From 2f1b305070393151d3997217a4452ef99bdb48cc Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Wed, 17 Mar 2021 17:49:30 +0800 Subject: [PATCH 0887/1379] vDPA/ifcvf: remove the version number string This commit removes the version number string, using kernel version is enough. Signed-off-by: Zhu Lingshan Reviewed-by: Leon Romanovsky Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210317094933.16417-5-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/ifcvf/ifcvf_main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 6e766d5c4f11..d3d42c431e69 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -14,7 +14,6 @@ #include #include "ifcvf_base.h" -#define VERSION_STRING "0.1" #define DRIVER_AUTHOR "Intel Corporation" #define IFCVF_DRIVER_NAME "ifcvf" @@ -502,4 +501,3 @@ static struct pci_driver ifcvf_driver = { module_pci_driver(ifcvf_driver); MODULE_LICENSE("GPL v2"); -MODULE_VERSION(VERSION_STRING); From 69d00d9858c7451a1e3fc556ece211533fb85b64 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Wed, 17 Mar 2021 17:49:31 +0800 Subject: [PATCH 0888/1379] vDPA/ifcvf: fetch device feature bits when probe This commit would read and store device feature bits when probe. rename ifcvf_get_features() to ifcvf_get_hw_features(), it reads and stores features of the probed device. new ifcvf_get_features() simply returns stored feature bits. Signed-off-by: Zhu Lingshan Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210317094933.16417-6-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/ifcvf/ifcvf_base.c | 12 ++++++++++-- drivers/vdpa/ifcvf/ifcvf_base.h | 2 ++ drivers/vdpa/ifcvf/ifcvf_main.c | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c index f2a128e56de5..ea6a78791c9b 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.c +++ b/drivers/vdpa/ifcvf/ifcvf_base.c @@ -202,10 +202,11 @@ static void ifcvf_add_status(struct ifcvf_hw *hw, u8 status) ifcvf_get_status(hw); } -u64 ifcvf_get_features(struct ifcvf_hw *hw) +u64 ifcvf_get_hw_features(struct ifcvf_hw *hw) { struct virtio_pci_common_cfg __iomem *cfg = hw->common_cfg; u32 features_lo, features_hi; + u64 features; ifc_iowrite32(0, &cfg->device_feature_select); features_lo = ifc_ioread32(&cfg->device_feature); @@ -213,7 +214,14 @@ u64 ifcvf_get_features(struct ifcvf_hw *hw) ifc_iowrite32(1, &cfg->device_feature_select); features_hi = ifc_ioread32(&cfg->device_feature); - return ((u64)features_hi << 32) | features_lo; + features = ((u64)features_hi << 32) | features_lo; + + return features; +} + +u64 ifcvf_get_features(struct ifcvf_hw *hw) +{ + return hw->hw_features; } void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset, diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index 794d1505d857..dbb8c10aa3b1 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -83,6 +83,7 @@ struct ifcvf_hw { void __iomem *notify_base; u32 notify_off_multiplier; u64 req_features; + u64 hw_features; struct virtio_pci_common_cfg __iomem *common_cfg; void __iomem *net_cfg; struct vring_info vring[IFCVF_MAX_QUEUE_PAIRS * 2]; @@ -121,6 +122,7 @@ void ifcvf_set_status(struct ifcvf_hw *hw, u8 status); void io_write64_twopart(u64 val, u32 *lo, u32 *hi); void ifcvf_reset(struct ifcvf_hw *hw); u64 ifcvf_get_features(struct ifcvf_hw *hw); +u64 ifcvf_get_hw_features(struct ifcvf_hw *hw); u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid); int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num); struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw); diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index d3d42c431e69..fae664d92aff 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -457,6 +457,8 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) vf->vring[i].irq = -EINVAL; + vf->hw_features = ifcvf_get_hw_features(vf); + ret = vdpa_register_device(&adapter->vdpa, IFCVF_MAX_QUEUE_PAIRS * 2); if (ret) { IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus"); From 1d895a68085b28d098893570b024229aacc9a057 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Wed, 17 Mar 2021 17:49:32 +0800 Subject: [PATCH 0889/1379] vDPA/ifcvf: verify mandatory feature bits for vDPA vDPA requres VIRTIO_F_ACCESS_PLATFORM as a must, this commit examines this when set features. Signed-off-by: Zhu Lingshan Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210317094933.16417-7-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/ifcvf/ifcvf_base.c | 12 ++++++++++++ drivers/vdpa/ifcvf/ifcvf_base.h | 1 + drivers/vdpa/ifcvf/ifcvf_main.c | 5 +++++ 3 files changed, 18 insertions(+) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c index ea6a78791c9b..1a661ab45af5 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.c +++ b/drivers/vdpa/ifcvf/ifcvf_base.c @@ -224,6 +224,18 @@ u64 ifcvf_get_features(struct ifcvf_hw *hw) return hw->hw_features; } +int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features) +{ + struct ifcvf_adapter *ifcvf = vf_to_adapter(hw); + + if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)) && features) { + IFCVF_ERR(ifcvf->pdev, "VIRTIO_F_ACCESS_PLATFORM is not negotiated\n"); + return -EINVAL; + } + + return 0; +} + void ifcvf_read_net_config(struct ifcvf_hw *hw, u64 offset, void *dst, int length) { diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index dbb8c10aa3b1..f77239fc1644 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -123,6 +123,7 @@ void io_write64_twopart(u64 val, u32 *lo, u32 *hi); void ifcvf_reset(struct ifcvf_hw *hw); u64 ifcvf_get_features(struct ifcvf_hw *hw); u64 ifcvf_get_hw_features(struct ifcvf_hw *hw); +int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features); u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid); int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num); struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw); diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index fae664d92aff..8bff4efbe749 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -179,6 +179,11 @@ static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev) static int ifcvf_vdpa_set_features(struct vdpa_device *vdpa_dev, u64 features) { struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + int ret; + + ret = ifcvf_verify_min_features(vf, features); + if (ret) + return ret; vf->req_features = features; From e8ef6124959a7c4004ef95b1b17cfa9b2ca582bd Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Wed, 17 Mar 2021 17:49:33 +0800 Subject: [PATCH 0890/1379] vDPA/ifcvf: deduce VIRTIO device ID from pdev ids This commit deduces the VIRTIO device ID of a probed device from its pdev device ids. Signed-off-by: Zhu Lingshan Link: https://lore.kernel.org/r/20210317094933.16417-8-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/ifcvf/ifcvf_base.h | 1 + drivers/vdpa/ifcvf/ifcvf_main.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index f77239fc1644..b2eeb16b9c2c 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -127,4 +127,5 @@ int ifcvf_verify_min_features(struct ifcvf_hw *hw, u64 features); u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid); int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num); struct ifcvf_adapter *vf_to_adapter(struct ifcvf_hw *hw); +int ifcvf_probed_virtio_net(struct ifcvf_hw *hw); #endif /* _IFCVF_H_ */ diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 8bff4efbe749..a16753f1de5b 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -323,7 +323,19 @@ static u32 ifcvf_vdpa_get_generation(struct vdpa_device *vdpa_dev) static u32 ifcvf_vdpa_get_device_id(struct vdpa_device *vdpa_dev) { - return VIRTIO_ID_NET; + struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); + struct pci_dev *pdev = adapter->pdev; + u32 ret = -ENODEV; + + if (pdev->device < 0x1000 || pdev->device > 0x107f) + return ret; + + if (pdev->device < 0x1040) + ret = pdev->subsystem_device; + else + ret = pdev->device - 0x1040; + + return ret; } static u32 ifcvf_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev) From 4080fc1067501707b9693b8003feae7d50d14e35 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:37 +0100 Subject: [PATCH 0891/1379] vdpa_sim: use iova module to allocate IOVA addresses The identical mapping used until now created issues when mapping different virtual pages with the same physical address. To solve this issue, we can use the iova module, to handle the IOVA allocation. For simplicity we use an IOVA allocator with byte granularity. We add two new functions, vdpasim_map_range() and vdpasim_unmap_range(), to handle the IOVA allocation and the registration into the IOMMU/IOTLB. These functions are used by dma_map_ops callbacks. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-2-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/Kconfig | 1 + drivers/vdpa/vdpa_sim/vdpa_sim.c | 108 +++++++++++++++++++------------ drivers/vdpa/vdpa_sim/vdpa_sim.h | 2 + 3 files changed, 69 insertions(+), 42 deletions(-) diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index a245809c99d0..6e82a0e228c2 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -14,6 +14,7 @@ config VDPA_SIM depends on RUNTIME_TESTING_MENU && HAS_DMA select DMA_OPS select VHOST_RING + select IOMMU_IOVA help Enable this module to support vDPA device simulators. These devices are used for testing, prototyping and development of vDPA. diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 5b6b2f87d40c..fc2ec9599753 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "vdpa_sim.h" @@ -128,30 +129,57 @@ static int dir_to_perm(enum dma_data_direction dir) return perm; } +static dma_addr_t vdpasim_map_range(struct vdpasim *vdpasim, phys_addr_t paddr, + size_t size, unsigned int perm) +{ + struct iova *iova; + dma_addr_t dma_addr; + int ret; + + /* We set the limit_pfn to the maximum (ULONG_MAX - 1) */ + iova = alloc_iova(&vdpasim->iova, size, ULONG_MAX - 1, true); + if (!iova) + return DMA_MAPPING_ERROR; + + dma_addr = iova_dma_addr(&vdpasim->iova, iova); + + spin_lock(&vdpasim->iommu_lock); + ret = vhost_iotlb_add_range(vdpasim->iommu, (u64)dma_addr, + (u64)dma_addr + size - 1, (u64)paddr, perm); + spin_unlock(&vdpasim->iommu_lock); + + if (ret) { + __free_iova(&vdpasim->iova, iova); + return DMA_MAPPING_ERROR; + } + + return dma_addr; +} + +static void vdpasim_unmap_range(struct vdpasim *vdpasim, dma_addr_t dma_addr, + size_t size) +{ + spin_lock(&vdpasim->iommu_lock); + vhost_iotlb_del_range(vdpasim->iommu, (u64)dma_addr, + (u64)dma_addr + size - 1); + spin_unlock(&vdpasim->iommu_lock); + + free_iova(&vdpasim->iova, iova_pfn(&vdpasim->iova, dma_addr)); +} + static dma_addr_t vdpasim_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { struct vdpasim *vdpasim = dev_to_sim(dev); - struct vhost_iotlb *iommu = vdpasim->iommu; - u64 pa = (page_to_pfn(page) << PAGE_SHIFT) + offset; - int ret, perm = dir_to_perm(dir); + phys_addr_t paddr = page_to_phys(page) + offset; + int perm = dir_to_perm(dir); if (perm < 0) return DMA_MAPPING_ERROR; - /* For simplicity, use identical mapping to avoid e.g iova - * allocator. - */ - spin_lock(&vdpasim->iommu_lock); - ret = vhost_iotlb_add_range(iommu, pa, pa + size - 1, - pa, dir_to_perm(dir)); - spin_unlock(&vdpasim->iommu_lock); - if (ret) - return DMA_MAPPING_ERROR; - - return (dma_addr_t)(pa); + return vdpasim_map_range(vdpasim, paddr, size, perm); } static void vdpasim_unmap_page(struct device *dev, dma_addr_t dma_addr, @@ -159,12 +187,8 @@ static void vdpasim_unmap_page(struct device *dev, dma_addr_t dma_addr, unsigned long attrs) { struct vdpasim *vdpasim = dev_to_sim(dev); - struct vhost_iotlb *iommu = vdpasim->iommu; - spin_lock(&vdpasim->iommu_lock); - vhost_iotlb_del_range(iommu, (u64)dma_addr, - (u64)dma_addr + size - 1); - spin_unlock(&vdpasim->iommu_lock); + vdpasim_unmap_range(vdpasim, dma_addr, size); } static void *vdpasim_alloc_coherent(struct device *dev, size_t size, @@ -172,27 +196,22 @@ static void *vdpasim_alloc_coherent(struct device *dev, size_t size, unsigned long attrs) { struct vdpasim *vdpasim = dev_to_sim(dev); - struct vhost_iotlb *iommu = vdpasim->iommu; - void *addr = kmalloc(size, flag); - int ret; + phys_addr_t paddr; + void *addr; - spin_lock(&vdpasim->iommu_lock); + addr = kmalloc(size, flag); if (!addr) { *dma_addr = DMA_MAPPING_ERROR; - } else { - u64 pa = virt_to_phys(addr); - - ret = vhost_iotlb_add_range(iommu, (u64)pa, - (u64)pa + size - 1, - pa, VHOST_MAP_RW); - if (ret) { - *dma_addr = DMA_MAPPING_ERROR; - kfree(addr); - addr = NULL; - } else - *dma_addr = (dma_addr_t)pa; + return NULL; + } + + paddr = virt_to_phys(addr); + + *dma_addr = vdpasim_map_range(vdpasim, paddr, size, VHOST_MAP_RW); + if (*dma_addr == DMA_MAPPING_ERROR) { + kfree(addr); + return NULL; } - spin_unlock(&vdpasim->iommu_lock); return addr; } @@ -202,14 +221,10 @@ static void vdpasim_free_coherent(struct device *dev, size_t size, unsigned long attrs) { struct vdpasim *vdpasim = dev_to_sim(dev); - struct vhost_iotlb *iommu = vdpasim->iommu; - spin_lock(&vdpasim->iommu_lock); - vhost_iotlb_del_range(iommu, (u64)dma_addr, - (u64)dma_addr + size - 1); - spin_unlock(&vdpasim->iommu_lock); + vdpasim_unmap_range(vdpasim, dma_addr, size); - kfree(phys_to_virt((uintptr_t)dma_addr)); + kfree(vaddr); } static const struct dma_map_ops vdpasim_dma_ops = { @@ -271,6 +286,13 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) for (i = 0; i < dev_attr->nvqs; i++) vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu); + ret = iova_cache_get(); + if (ret) + goto err_iommu; + + /* For simplicity we use an IOVA allocator with byte granularity */ + init_iova_domain(&vdpasim->iova, 1, 0); + vdpasim->vdpa.dma_dev = dev; return vdpasim; @@ -541,6 +563,8 @@ static void vdpasim_free(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); cancel_work_sync(&vdpasim->work); + put_iova_domain(&vdpasim->iova); + iova_cache_put(); kvfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 6d75444f9948..cd58e888bcf3 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -6,6 +6,7 @@ #ifndef _VDPA_SIM_H #define _VDPA_SIM_H +#include #include #include #include @@ -57,6 +58,7 @@ struct vdpasim { /* virtio config according to device type */ void *config; struct vhost_iotlb *iommu; + struct iova_domain iova; void *buffer; u32 status; u32 generation; From f53d9910d009bc015b42d88114e2d86a93b0e6b7 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:38 +0100 Subject: [PATCH 0892/1379] vringh: add 'iotlb_lock' to synchronize iotlb accesses Usually iotlb accesses are synchronized with a spinlock. Let's request it as a new parameter in vringh_set_iotlb() and hold it when we navigate the iotlb in iotlb_translate() to avoid race conditions with any new additions/deletions of ranges from the ioltb. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-3-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++- drivers/vhost/vringh.c | 9 ++++++++- include/linux/vringh.h | 6 +++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index fc2ec9599753..a92c08880098 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -284,7 +284,8 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) goto err_iommu; for (i = 0; i < dev_attr->nvqs; i++) - vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu); + vringh_set_iotlb(&vdpasim->vqs[i].vring, vdpasim->iommu, + &vdpasim->iommu_lock); ret = iova_cache_get(); if (ret) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 85d85faba058..f68122705719 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -1074,6 +1074,8 @@ static int iotlb_translate(const struct vringh *vrh, int ret = 0; u64 s = 0; + spin_lock(vrh->iotlb_lock); + while (len > s) { u64 size, pa, pfn; @@ -1103,6 +1105,8 @@ static int iotlb_translate(const struct vringh *vrh, ++ret; } + spin_unlock(vrh->iotlb_lock); + return ret; } @@ -1262,10 +1266,13 @@ EXPORT_SYMBOL(vringh_init_iotlb); * vringh_set_iotlb - initialize a vringh for a ring with IOTLB. * @vrh: the vring * @iotlb: iotlb associated with this vring + * @iotlb_lock: spinlock to synchronize the iotlb accesses */ -void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb) +void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb, + spinlock_t *iotlb_lock) { vrh->iotlb = iotlb; + vrh->iotlb_lock = iotlb_lock; } EXPORT_SYMBOL(vringh_set_iotlb); diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 59bd50f99291..9c077863c8f6 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -46,6 +46,9 @@ struct vringh { /* IOTLB for this vring */ struct vhost_iotlb *iotlb; + /* spinlock to synchronize IOTLB accesses */ + spinlock_t *iotlb_lock; + /* The function to call to notify the guest about added buffers */ void (*notify)(struct vringh *); }; @@ -258,7 +261,8 @@ static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val) #if IS_REACHABLE(CONFIG_VHOST_IOTLB) -void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb); +void vringh_set_iotlb(struct vringh *vrh, struct vhost_iotlb *iotlb, + spinlock_t *iotlb_lock); int vringh_init_iotlb(struct vringh *vrh, u64 features, unsigned int num, bool weak_barriers, From bbc2c372a83d74d5499ad21d0ade2b71f5bde620 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:39 +0100 Subject: [PATCH 0893/1379] vringh: reset kiov 'consumed' field in __vringh_iov() __vringh_iov() overwrites the contents of riov and wiov, in fact it resets the 'i' and 'used' fields, but also the 'consumed' field should be reset to avoid an inconsistent state. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-4-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vringh.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index f68122705719..bee63d68201a 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -290,9 +290,9 @@ __vringh_iov(struct vringh *vrh, u16 i, return -EINVAL; if (riov) - riov->i = riov->used = 0; + riov->i = riov->used = riov->consumed = 0; if (wiov) - wiov->i = wiov->used = 0; + wiov->i = wiov->used = wiov->consumed = 0; for (;;) { void *addr; From 69c13c58bd10f036d6e697e664948952e61acfb1 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:40 +0100 Subject: [PATCH 0894/1379] vringh: explain more about cleaning riov and wiov riov and wiov can be reused with subsequent calls of vringh_getdesc_*(). Let's add a paragraph in the documentation of these functions to better explain when riov and wiov need to be cleaned up. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-5-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vringh.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index bee63d68201a..2a88e087afd8 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -662,7 +662,10 @@ EXPORT_SYMBOL(vringh_init_user); * *head will be vrh->vring.num. You may be able to ignore an invalid * descriptor, but there's not much you can do with an invalid ring. * - * Note that you may need to clean up riov and wiov, even on error! + * Note that you can reuse riov and wiov with subsequent calls. Content is + * overwritten and memory reallocated if more space is needed. + * When you don't have to use riov and wiov anymore, you should clean up them + * calling vringh_iov_cleanup() to release the memory, even on error! */ int vringh_getdesc_user(struct vringh *vrh, struct vringh_iov *riov, @@ -932,7 +935,10 @@ EXPORT_SYMBOL(vringh_init_kern); * *head will be vrh->vring.num. You may be able to ignore an invalid * descriptor, but there's not much you can do with an invalid ring. * - * Note that you may need to clean up riov and wiov, even on error! + * Note that you can reuse riov and wiov with subsequent calls. Content is + * overwritten and memory reallocated if more space is needed. + * When you don't have to use riov and wiov anymore, you should clean up them + * calling vringh_kiov_cleanup() to release the memory, even on error! */ int vringh_getdesc_kern(struct vringh *vrh, struct vringh_kiov *riov, @@ -1292,7 +1298,10 @@ EXPORT_SYMBOL(vringh_set_iotlb); * *head will be vrh->vring.num. You may be able to ignore an invalid * descriptor, but there's not much you can do with an invalid ring. * - * Note that you may need to clean up riov and wiov, even on error! + * Note that you can reuse riov and wiov with subsequent calls. Content is + * overwritten and memory reallocated if more space is needed. + * When you don't have to use riov and wiov anymore, you should clean up them + * calling vringh_kiov_cleanup() to release the memory, even on error! */ int vringh_getdesc_iotlb(struct vringh *vrh, struct vringh_kiov *riov, From b8c06ad4d67db56ed6bdfb685c134da74e92a2c7 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:41 +0100 Subject: [PATCH 0895/1379] vringh: implement vringh_kiov_advance() In some cases, it may be useful to provide a way to skip a number of bytes in a vringh_kiov. Let's implement vringh_kiov_advance() for this purpose, reusing the code from vringh_iov_xfer(). We replace that code calling the new vringh_kiov_advance(). Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-6-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vringh.c | 41 +++++++++++++++++++++++++++++------------ include/linux/vringh.h | 2 ++ 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 2a88e087afd8..4af8fa259d65 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -75,6 +75,34 @@ static inline int __vringh_get_head(const struct vringh *vrh, return head; } +/** + * vringh_kiov_advance - skip bytes from vring_kiov + * @iov: an iov passed to vringh_getdesc_*() (updated as we consume) + * @len: the maximum length to advance + */ +void vringh_kiov_advance(struct vringh_kiov *iov, size_t len) +{ + while (len && iov->i < iov->used) { + size_t partlen = min(iov->iov[iov->i].iov_len, len); + + iov->consumed += partlen; + iov->iov[iov->i].iov_len -= partlen; + iov->iov[iov->i].iov_base += partlen; + + if (!iov->iov[iov->i].iov_len) { + /* Fix up old iov element then increment. */ + iov->iov[iov->i].iov_len = iov->consumed; + iov->iov[iov->i].iov_base -= iov->consumed; + + iov->consumed = 0; + iov->i++; + } + + len -= partlen; + } +} +EXPORT_SYMBOL(vringh_kiov_advance); + /* Copy some bytes to/from the iovec. Returns num copied. */ static inline ssize_t vringh_iov_xfer(struct vringh *vrh, struct vringh_kiov *iov, @@ -95,19 +123,8 @@ static inline ssize_t vringh_iov_xfer(struct vringh *vrh, done += partlen; len -= partlen; ptr += partlen; - iov->consumed += partlen; - iov->iov[iov->i].iov_len -= partlen; - iov->iov[iov->i].iov_base += partlen; - if (!iov->iov[iov->i].iov_len) { - /* Fix up old iov element then increment. */ - iov->iov[iov->i].iov_len = iov->consumed; - iov->iov[iov->i].iov_base -= iov->consumed; - - - iov->consumed = 0; - iov->i++; - } + vringh_kiov_advance(iov, partlen); } return done; } diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 9c077863c8f6..755211ebd195 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -199,6 +199,8 @@ static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov) kiov->iov = NULL; } +void vringh_kiov_advance(struct vringh_kiov *kiov, size_t len); + int vringh_getdesc_kern(struct vringh *vrh, struct vringh_kiov *riov, struct vringh_kiov *wiov, From 14c9ac05ce09c8c6a89ffcca6ffb68707cba36c2 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:42 +0100 Subject: [PATCH 0896/1379] vringh: add vringh_kiov_length() helper This new helper returns the total number of bytes covered by a vringh_kiov. Suggested-by: Jason Wang Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-7-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- include/linux/vringh.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 755211ebd195..84db7b8f912f 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -199,6 +199,17 @@ static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov) kiov->iov = NULL; } +static inline size_t vringh_kiov_length(struct vringh_kiov *kiov) +{ + size_t len = 0; + int i; + + for (i = kiov->i; i < kiov->used; i++) + len += kiov->iov[i].iov_len; + + return len; +} + void vringh_kiov_advance(struct vringh_kiov *kiov, size_t len); int vringh_getdesc_kern(struct vringh *vrh, From bc433e5e0d42d7892dcefb65686c9f1df126923a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:43 +0100 Subject: [PATCH 0897/1379] vdpa_sim: cleanup kiovs in vdpasim_free() vringh_getdesc_iotlb() allocates memory to store the kvec, that is freed with vringh_kiov_cleanup(). vringh_getdesc_iotlb() is able to reuse a kvec previously allocated, so in order to avoid to allocate the kvec for each request, we are not calling vringh_kiov_cleanup() when we finished to handle a request, but we should call it when we free the entire device. Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-8-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index a92c08880098..14dc2d3d983e 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -562,8 +562,15 @@ static int vdpasim_dma_unmap(struct vdpa_device *vdpa, u64 iova, u64 size) static void vdpasim_free(struct vdpa_device *vdpa) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + int i; cancel_work_sync(&vdpasim->work); + + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) { + vringh_kiov_cleanup(&vdpasim->vqs[i].out_iov); + vringh_kiov_cleanup(&vdpasim->vqs[i].in_iov); + } + put_iova_domain(&vdpasim->iova); iova_cache_put(); kvfree(vdpasim->buffer); From 442706f9f94d28fe3c9f188ae4ebbd6b40addffe Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:44 +0100 Subject: [PATCH 0898/1379] vdpa: add get_config_size callback in vdpa_config_ops This new callback is used to get the size of the configuration space of vDPA devices. Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-9-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/ifcvf/ifcvf_main.c | 6 ++++++ drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 ++++++ drivers/vdpa/vdpa_sim/vdpa_sim.c | 9 +++++++++ drivers/vdpa/virtio_pci/vp_vdpa.c | 8 ++++++++ include/linux/vdpa.h | 4 ++++ 5 files changed, 33 insertions(+) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index a16753f1de5b..44d7586019da 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -351,6 +351,11 @@ static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) return IFCVF_QUEUE_ALIGNMENT; } +static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev) +{ + return sizeof(struct virtio_net_config); +} + static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf, unsigned int len) @@ -411,6 +416,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .get_device_id = ifcvf_vdpa_get_device_id, .get_vendor_id = ifcvf_vdpa_get_vendor_id, .get_vq_align = ifcvf_vdpa_get_vq_align, + .get_config_size = ifcvf_vdpa_get_config_size, .get_config = ifcvf_vdpa_get_config, .set_config = ifcvf_vdpa_set_config, .set_config_cb = ifcvf_vdpa_set_config_cb, diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 25533db01f5f..189e4385df40 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1809,6 +1809,11 @@ err_setup: ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; } +static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev) +{ + return sizeof(struct virtio_net_config); +} + static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { @@ -1895,6 +1900,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .get_vendor_id = mlx5_vdpa_get_vendor_id, .get_status = mlx5_vdpa_get_status, .set_status = mlx5_vdpa_set_status, + .get_config_size = mlx5_vdpa_get_config_size, .get_config = mlx5_vdpa_get_config, .set_config = mlx5_vdpa_set_config, .get_generation = mlx5_vdpa_get_generation, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 14dc2d3d983e..98f793bc9376 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -462,6 +462,13 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status) spin_unlock(&vdpasim->lock); } +static size_t vdpasim_get_config_size(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.config_size; +} + static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset, void *buf, unsigned int len) { @@ -598,6 +605,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .get_vendor_id = vdpasim_get_vendor_id, .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, + .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, .get_generation = vdpasim_get_generation, @@ -625,6 +633,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_vendor_id = vdpasim_get_vendor_id, .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, + .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, .get_generation = vdpasim_get_generation, diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 002b928d0ca1..c76ebb531212 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -296,6 +296,13 @@ static u32 vp_vdpa_get_vq_align(struct vdpa_device *vdpa) return PAGE_SIZE; } +static size_t vp_vdpa_get_config_size(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return mdev->device_len; +} + static void vp_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset, void *buf, unsigned int len) @@ -369,6 +376,7 @@ static const struct vdpa_config_ops vp_vdpa_ops = { .get_device_id = vp_vdpa_get_device_id, .get_vendor_id = vp_vdpa_get_vendor_id, .get_vq_align = vp_vdpa_get_vq_align, + .get_config_size = vp_vdpa_get_config_size, .get_config = vp_vdpa_get_config, .set_config = vp_vdpa_set_config, .set_config_cb = vp_vdpa_set_config_cb, diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 37b65ca940cf..f311d227aa1b 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -150,6 +150,9 @@ struct vdpa_iova_range { * @set_status: Set the device status * @vdev: vdpa device * @status: virtio device status + * @get_config_size: Get the size of the configuration space + * @vdev: vdpa device + * Returns size_t: configuration size * @get_config: Read from device specific configuration space * @vdev: vdpa device * @offset: offset from the beginning of @@ -231,6 +234,7 @@ struct vdpa_config_ops { u32 (*get_vendor_id)(struct vdpa_device *vdev); u8 (*get_status)(struct vdpa_device *vdev); void (*set_status)(struct vdpa_device *vdev, u8 status); + size_t (*get_config_size)(struct vdpa_device *vdev); void (*get_config)(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); void (*set_config)(struct vdpa_device *vdev, unsigned int offset, From d6d8bb92fdde6390037bf9da174ed3ab551c04d7 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:45 +0100 Subject: [PATCH 0899/1379] vhost/vdpa: use get_config_size callback in vhost_vdpa_config_validate() Let's use the new 'get_config_size()' callback available instead of using the 'virtio_id' to get the size of the device config space. Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-10-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index c79d2f2387aa..a27756e2f99b 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -188,13 +188,8 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 __user *statusp) static int vhost_vdpa_config_validate(struct vhost_vdpa *v, struct vhost_vdpa_config *c) { - long size = 0; - - switch (v->virtio_id) { - case VIRTIO_ID_NET: - size = sizeof(struct virtio_net_config); - break; - } + struct vdpa_device *vdpa = v->vdpa; + long size = vdpa->config->get_config_size(vdpa); if (c->len == 0) return -EINVAL; From 9d6d97bff7909910af537fd3903d05338adaaefa Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Mon, 15 Mar 2021 17:34:46 +0100 Subject: [PATCH 0900/1379] vhost/vdpa: Remove the restriction that only supports virtio-net devices Since the config checks are done by the vDPA drivers, we can remove the virtio-net restriction and we should be able to support all kinds of virtio devices. is not needed anymore, but we need to include to avoid compilation failures. Signed-off-by: Xie Yongji Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-11-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a27756e2f99b..a1496c596120 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -16,12 +16,12 @@ #include #include #include +#include #include #include #include #include #include -#include #include "vhost.h" @@ -1023,10 +1023,6 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa) int minor; int r; - /* Currently, we only accept the network devices. */ - if (ops->get_device_id(vdpa) != VIRTIO_ID_NET) - return -ENOTSUPP; - v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!v) return -ENOMEM; From 0c853c2c2924464bd86537164ed18d5d953b4909 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 15 Mar 2021 17:34:47 +0100 Subject: [PATCH 0901/1379] vdpa: add vdpa simulator for block device This will allow running vDPA for virtio block protocol. It's a preliminary implementation with a simple request handling: for each request, only the status (last byte) is set. It's always set to VIRTIO_BLK_S_OK. Also input validation is missing and will be added in the next commits. Signed-off-by: Max Gurtovoy [sgarzare: various cleanups/fixes] Acked-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-12-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/Kconfig | 7 ++ drivers/vdpa/vdpa_sim/Makefile | 1 + drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 145 +++++++++++++++++++++++++++ 3 files changed, 153 insertions(+) create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_blk.c diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 6e82a0e228c2..a503c1b2bfd9 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -26,6 +26,13 @@ config VDPA_SIM_NET help vDPA networking device simulator which loops TX traffic back to RX. +config VDPA_SIM_BLOCK + tristate "vDPA simulator for block device" + depends on VDPA_SIM + help + vDPA block device simulator which terminates IO request in a + memory buffer. + config IFCVF tristate "Intel IFC VF vDPA driver" depends on PCI_MSI diff --git a/drivers/vdpa/vdpa_sim/Makefile b/drivers/vdpa/vdpa_sim/Makefile index 79d4536d347e..d458103302f2 100644 --- a/drivers/vdpa/vdpa_sim/Makefile +++ b/drivers/vdpa/vdpa_sim/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VDPA_SIM) += vdpa_sim.o obj-$(CONFIG_VDPA_SIM_NET) += vdpa_sim_net.o +obj-$(CONFIG_VDPA_SIM_BLOCK) += vdpa_sim_blk.o diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c new file mode 100644 index 000000000000..64926a70af5e --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDPA simulator for block device. + * + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vdpa_sim.h" + +#define DRV_VERSION "0.1" +#define DRV_AUTHOR "Max Gurtovoy " +#define DRV_DESC "vDPA Device Simulator for block device" +#define DRV_LICENSE "GPL v2" + +#define VDPASIM_BLK_FEATURES (VDPASIM_FEATURES | \ + (1ULL << VIRTIO_BLK_F_SIZE_MAX) | \ + (1ULL << VIRTIO_BLK_F_SEG_MAX) | \ + (1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ + (1ULL << VIRTIO_BLK_F_TOPOLOGY) | \ + (1ULL << VIRTIO_BLK_F_MQ)) + +#define VDPASIM_BLK_CAPACITY 0x40000 +#define VDPASIM_BLK_SIZE_MAX 0x1000 +#define VDPASIM_BLK_SEG_MAX 32 +#define VDPASIM_BLK_VQ_NUM 1 + +static struct vdpasim *vdpasim_blk_dev; + +static void vdpasim_blk_work(struct work_struct *work) +{ + struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); + u8 status = VIRTIO_BLK_S_OK; + int i; + + spin_lock(&vdpasim->lock); + + if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto out; + + for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) { + struct vdpasim_virtqueue *vq = &vdpasim->vqs[i]; + + if (!vq->ready) + continue; + + while (vringh_getdesc_iotlb(&vq->vring, &vq->out_iov, + &vq->in_iov, &vq->head, + GFP_ATOMIC) > 0) { + int write; + + vq->in_iov.i = vq->in_iov.used - 1; + write = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, + &status, 1); + if (write <= 0) + break; + + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + vringh_complete_iotlb(&vq->vring, vq->head, write); + + /* Make sure used is visible before rasing the interrupt. */ + smp_wmb(); + + local_bh_disable(); + if (vringh_need_notify_iotlb(&vq->vring) > 0) + vringh_notify(&vq->vring); + local_bh_enable(); + } + } +out: + spin_unlock(&vdpasim->lock); +} + +static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) +{ + struct virtio_blk_config *blk_config = config; + + memset(config, 0, sizeof(struct virtio_blk_config)); + + blk_config->capacity = cpu_to_vdpasim64(vdpasim, VDPASIM_BLK_CAPACITY); + blk_config->size_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SIZE_MAX); + blk_config->seg_max = cpu_to_vdpasim32(vdpasim, VDPASIM_BLK_SEG_MAX); + blk_config->num_queues = cpu_to_vdpasim16(vdpasim, VDPASIM_BLK_VQ_NUM); + blk_config->min_io_size = cpu_to_vdpasim16(vdpasim, 1); + blk_config->opt_io_size = cpu_to_vdpasim32(vdpasim, 1); + blk_config->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); +} + +static int __init vdpasim_blk_init(void) +{ + struct vdpasim_dev_attr dev_attr = {}; + int ret; + + dev_attr.id = VIRTIO_ID_BLOCK; + dev_attr.supported_features = VDPASIM_BLK_FEATURES; + dev_attr.nvqs = VDPASIM_BLK_VQ_NUM; + dev_attr.config_size = sizeof(struct virtio_blk_config); + dev_attr.get_config = vdpasim_blk_get_config; + dev_attr.work_fn = vdpasim_blk_work; + dev_attr.buffer_size = PAGE_SIZE; + + vdpasim_blk_dev = vdpasim_create(&dev_attr); + if (IS_ERR(vdpasim_blk_dev)) { + ret = PTR_ERR(vdpasim_blk_dev); + goto out; + } + + ret = vdpa_register_device(&vdpasim_blk_dev->vdpa, VDPASIM_BLK_VQ_NUM); + if (ret) + goto put_dev; + + return 0; + +put_dev: + put_device(&vdpasim_blk_dev->vdpa.dev); +out: + return ret; +} + +static void __exit vdpasim_blk_exit(void) +{ + struct vdpa_device *vdpa = &vdpasim_blk_dev->vdpa; + + vdpa_unregister_device(vdpa); +} + +module_init(vdpasim_blk_init) +module_exit(vdpasim_blk_exit) + +MODULE_VERSION(DRV_VERSION); +MODULE_LICENSE(DRV_LICENSE); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); From 7d189f617f83f780915b737896a696ff605bd19f Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:48 +0100 Subject: [PATCH 0902/1379] vdpa_sim_blk: implement ramdisk behaviour The previous implementation wrote only the status of each request. This patch implements a more accurate block device simulator, providing a ramdisk-like behavior and adding input validation. Acked-by: Jason Wang Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-13-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 164 ++++++++++++++++++++++++--- 1 file changed, 146 insertions(+), 18 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index 64926a70af5e..a31964e3e5a4 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -3,6 +3,7 @@ * VDPA simulator for block device. * * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2021, Red Hat Inc. All rights reserved. * */ @@ -14,6 +15,7 @@ #include #include #include +#include #include #include "vdpa_sim.h" @@ -37,10 +39,151 @@ static struct vdpasim *vdpasim_blk_dev; +static bool vdpasim_blk_check_range(u64 start_sector, size_t range_size) +{ + u64 range_sectors = range_size >> SECTOR_SHIFT; + + if (range_size > VDPASIM_BLK_SIZE_MAX * VDPASIM_BLK_SEG_MAX) + return false; + + if (start_sector > VDPASIM_BLK_CAPACITY) + return false; + + if (range_sectors > VDPASIM_BLK_CAPACITY - start_sector) + return false; + + return true; +} + +/* Returns 'true' if the request is handled (with or without an I/O error) + * and the status is correctly written in the last byte of the 'in iov', + * 'false' otherwise. + */ +static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, + struct vdpasim_virtqueue *vq) +{ + size_t pushed = 0, to_pull, to_push; + struct virtio_blk_outhdr hdr; + ssize_t bytes; + loff_t offset; + u64 sector; + u8 status; + u32 type; + int ret; + + ret = vringh_getdesc_iotlb(&vq->vring, &vq->out_iov, &vq->in_iov, + &vq->head, GFP_ATOMIC); + if (ret != 1) + return false; + + if (vq->out_iov.used < 1 || vq->in_iov.used < 1) { + dev_err(&vdpasim->vdpa.dev, "missing headers - out_iov: %u in_iov %u\n", + vq->out_iov.used, vq->in_iov.used); + return false; + } + + if (vq->in_iov.iov[vq->in_iov.used - 1].iov_len < 1) { + dev_err(&vdpasim->vdpa.dev, "request in header too short\n"); + return false; + } + + /* The last byte is the status and we checked if the last iov has + * enough room for it. + */ + to_push = vringh_kiov_length(&vq->in_iov) - 1; + + to_pull = vringh_kiov_length(&vq->out_iov); + + bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, &hdr, + sizeof(hdr)); + if (bytes != sizeof(hdr)) { + dev_err(&vdpasim->vdpa.dev, "request out header too short\n"); + return false; + } + + to_pull -= bytes; + + type = vdpasim32_to_cpu(vdpasim, hdr.type); + sector = vdpasim64_to_cpu(vdpasim, hdr.sector); + offset = sector << SECTOR_SHIFT; + status = VIRTIO_BLK_S_OK; + + switch (type) { + case VIRTIO_BLK_T_IN: + if (!vdpasim_blk_check_range(sector, to_push)) { + dev_err(&vdpasim->vdpa.dev, + "reading over the capacity - offset: 0x%llx len: 0x%zx\n", + offset, to_push); + status = VIRTIO_BLK_S_IOERR; + break; + } + + bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, + vdpasim->buffer + offset, + to_push); + if (bytes < 0) { + dev_err(&vdpasim->vdpa.dev, + "vringh_iov_push_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", + bytes, offset, to_push); + status = VIRTIO_BLK_S_IOERR; + break; + } + + pushed += bytes; + break; + + case VIRTIO_BLK_T_OUT: + if (!vdpasim_blk_check_range(sector, to_pull)) { + dev_err(&vdpasim->vdpa.dev, + "writing over the capacity - offset: 0x%llx len: 0x%zx\n", + offset, to_pull); + status = VIRTIO_BLK_S_IOERR; + break; + } + + bytes = vringh_iov_pull_iotlb(&vq->vring, &vq->out_iov, + vdpasim->buffer + offset, + to_pull); + if (bytes < 0) { + dev_err(&vdpasim->vdpa.dev, + "vringh_iov_pull_iotlb() error: %zd offset: 0x%llx len: 0x%zx\n", + bytes, offset, to_pull); + status = VIRTIO_BLK_S_IOERR; + break; + } + break; + + default: + dev_warn(&vdpasim->vdpa.dev, + "Unsupported request type %d\n", type); + status = VIRTIO_BLK_S_IOERR; + break; + } + + /* If some operations fail, we need to skip the remaining bytes + * to put the status in the last byte + */ + if (to_push - pushed > 0) + vringh_kiov_advance(&vq->in_iov, to_push - pushed); + + /* Last byte is the status */ + bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, &status, 1); + if (bytes != 1) + return false; + + pushed += bytes; + + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + vringh_complete_iotlb(&vq->vring, vq->head, pushed); + + return true; +} + static void vdpasim_blk_work(struct work_struct *work) { struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); - u8 status = VIRTIO_BLK_S_OK; int i; spin_lock(&vdpasim->lock); @@ -54,22 +197,7 @@ static void vdpasim_blk_work(struct work_struct *work) if (!vq->ready) continue; - while (vringh_getdesc_iotlb(&vq->vring, &vq->out_iov, - &vq->in_iov, &vq->head, - GFP_ATOMIC) > 0) { - int write; - - vq->in_iov.i = vq->in_iov.used - 1; - write = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, - &status, 1); - if (write <= 0) - break; - - /* Make sure data is wrote before advancing index */ - smp_wmb(); - - vringh_complete_iotlb(&vq->vring, vq->head, write); - + while (vdpasim_blk_handle_req(vdpasim, vq)) { /* Make sure used is visible before rasing the interrupt. */ smp_wmb(); @@ -109,7 +237,7 @@ static int __init vdpasim_blk_init(void) dev_attr.config_size = sizeof(struct virtio_blk_config); dev_attr.get_config = vdpasim_blk_get_config; dev_attr.work_fn = vdpasim_blk_work; - dev_attr.buffer_size = PAGE_SIZE; + dev_attr.buffer_size = VDPASIM_BLK_CAPACITY << SECTOR_SHIFT; vdpasim_blk_dev = vdpasim_create(&dev_attr); if (IS_ERR(vdpasim_blk_dev)) { From e6fa605227071620c11014efbc2930029e1673b9 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:49 +0100 Subject: [PATCH 0903/1379] vdpa_sim_blk: handle VIRTIO_BLK_T_GET_ID Handle VIRTIO_BLK_T_GET_ID request, always answering the "vdpa_blk_sim" string. Acked-by: Jason Wang Reviewed-by: Stefan Hajnoczi Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-14-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index a31964e3e5a4..643ae3bc62c0 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -38,6 +38,7 @@ #define VDPASIM_BLK_VQ_NUM 1 static struct vdpasim *vdpasim_blk_dev; +static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim"; static bool vdpasim_blk_check_range(u64 start_sector, size_t range_size) { @@ -153,6 +154,20 @@ static bool vdpasim_blk_handle_req(struct vdpasim *vdpasim, } break; + case VIRTIO_BLK_T_GET_ID: + bytes = vringh_iov_push_iotlb(&vq->vring, &vq->in_iov, + vdpasim_blk_id, + VIRTIO_BLK_ID_BYTES); + if (bytes < 0) { + dev_err(&vdpasim->vdpa.dev, + "vringh_iov_push_iotlb() error: %zd\n", bytes); + status = VIRTIO_BLK_S_IOERR; + break; + } + + pushed += bytes; + break; + default: dev_warn(&vdpasim->vdpa.dev, "Unsupported request type %d\n", type); From 899c4d187f6a5c11d8eae33506fa0736dbabc39f Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 15 Mar 2021 17:34:50 +0100 Subject: [PATCH 0904/1379] vdpa_sim_blk: add support for vdpa management tool Enable the user to create vDPA block simulator devices using the vdpa management tool: # Show vDPA supported devices $ vdpa mgmtdev show vdpasim_blk: supported_classes block # Create a vDPA block device named as 'blk0' from the management # device vdpasim: $ vdpa dev add mgmtdev vdpasim_blk name blk0 # Show the info of the 'blk0' device just created $ vdpa dev show blk0 -jp { "dev": { "blk0": { "type": "block", "mgmtdev": "vdpasim_blk", "vendor_id": 0, "max_vqs": 1, "max_vq_size": 256 } } } # Delete the vDPA device after its use $ vdpa dev del blk0 Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20210315163450.254396-15-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 76 +++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 13 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index 643ae3bc62c0..5bfe1c281645 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -37,7 +37,6 @@ #define VDPASIM_BLK_SEG_MAX 32 #define VDPASIM_BLK_VQ_NUM 1 -static struct vdpasim *vdpasim_blk_dev; static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim"; static bool vdpasim_blk_check_range(u64 start_sector, size_t range_size) @@ -241,11 +240,23 @@ static void vdpasim_blk_get_config(struct vdpasim *vdpasim, void *config) blk_config->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE); } -static int __init vdpasim_blk_init(void) +static void vdpasim_blk_mgmtdev_release(struct device *dev) +{ +} + +static struct device vdpasim_blk_mgmtdev = { + .init_name = "vdpasim_blk", + .release = vdpasim_blk_mgmtdev_release, +}; + +static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name) { struct vdpasim_dev_attr dev_attr = {}; + struct vdpasim *simdev; int ret; + dev_attr.mgmt_dev = mdev; + dev_attr.name = name; dev_attr.id = VIRTIO_ID_BLOCK; dev_attr.supported_features = VDPASIM_BLK_FEATURES; dev_attr.nvqs = VDPASIM_BLK_VQ_NUM; @@ -254,29 +265,68 @@ static int __init vdpasim_blk_init(void) dev_attr.work_fn = vdpasim_blk_work; dev_attr.buffer_size = VDPASIM_BLK_CAPACITY << SECTOR_SHIFT; - vdpasim_blk_dev = vdpasim_create(&dev_attr); - if (IS_ERR(vdpasim_blk_dev)) { - ret = PTR_ERR(vdpasim_blk_dev); - goto out; - } + simdev = vdpasim_create(&dev_attr); + if (IS_ERR(simdev)) + return PTR_ERR(simdev); - ret = vdpa_register_device(&vdpasim_blk_dev->vdpa, VDPASIM_BLK_VQ_NUM); + ret = _vdpa_register_device(&simdev->vdpa, VDPASIM_BLK_VQ_NUM); if (ret) goto put_dev; return 0; put_dev: - put_device(&vdpasim_blk_dev->vdpa.dev); -out: + put_device(&simdev->vdpa.dev); + return ret; +} + +static void vdpasim_blk_dev_del(struct vdpa_mgmt_dev *mdev, + struct vdpa_device *dev) +{ + struct vdpasim *simdev = container_of(dev, struct vdpasim, vdpa); + + _vdpa_unregister_device(&simdev->vdpa); +} + +static const struct vdpa_mgmtdev_ops vdpasim_blk_mgmtdev_ops = { + .dev_add = vdpasim_blk_dev_add, + .dev_del = vdpasim_blk_dev_del +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct vdpa_mgmt_dev mgmt_dev = { + .device = &vdpasim_blk_mgmtdev, + .id_table = id_table, + .ops = &vdpasim_blk_mgmtdev_ops, +}; + +static int __init vdpasim_blk_init(void) +{ + int ret; + + ret = device_register(&vdpasim_blk_mgmtdev); + if (ret) + return ret; + + ret = vdpa_mgmtdev_register(&mgmt_dev); + if (ret) + goto parent_err; + + return 0; + +parent_err: + device_unregister(&vdpasim_blk_mgmtdev); return ret; } static void __exit vdpasim_blk_exit(void) { - struct vdpa_device *vdpa = &vdpasim_blk_dev->vdpa; - - vdpa_unregister_device(vdpa); + vdpa_mgmtdev_unregister(&mgmt_dev); + device_unregister(&vdpasim_blk_mgmtdev); } module_init(vdpasim_blk_init) From 26bfea1309f5d4faad33383d2d82a3463f518982 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Mon, 19 Apr 2021 14:33:24 +0800 Subject: [PATCH 0905/1379] vDPA/ifcvf: deduce VIRTIO device ID when probe This commit deduces VIRTIO device ID as device type when probe, then ifcvf_vdpa_get_device_id() can simply return the ID. ifcvf_vdpa_get_features() and ifcvf_vdpa_get_config_size() can work properly based on the device ID. Signed-off-by: Zhu Lingshan Link: https://lore.kernel.org/r/20210419063326.3748-2-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/vdpa/ifcvf/ifcvf_base.h | 1 + drivers/vdpa/ifcvf/ifcvf_main.c | 27 +++++++++++++++------------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index b2eeb16b9c2c..1c04cd256fa7 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -84,6 +84,7 @@ struct ifcvf_hw { u32 notify_off_multiplier; u64 req_features; u64 hw_features; + u32 dev_type; struct virtio_pci_common_cfg __iomem *common_cfg; void __iomem *net_cfg; struct vring_info vring[IFCVF_MAX_QUEUE_PAIRS * 2]; diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 44d7586019da..66927ec81fa5 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -323,19 +323,9 @@ static u32 ifcvf_vdpa_get_generation(struct vdpa_device *vdpa_dev) static u32 ifcvf_vdpa_get_device_id(struct vdpa_device *vdpa_dev) { - struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); - struct pci_dev *pdev = adapter->pdev; - u32 ret = -ENODEV; + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); - if (pdev->device < 0x1000 || pdev->device > 0x107f) - return ret; - - if (pdev->device < 0x1040) - ret = pdev->subsystem_device; - else - ret = pdev->device - 0x1040; - - return ret; + return vf->dev_type; } static u32 ifcvf_vdpa_get_vendor_id(struct vdpa_device *vdpa_dev) @@ -466,6 +456,19 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_drvdata(pdev, adapter); vf = &adapter->vf; + + /* This drirver drives both modern virtio devices and transitional + * devices in modern mode. + * vDPA requires feature bit VIRTIO_F_ACCESS_PLATFORM, + * so legacy devices and transitional devices in legacy + * mode will not work for vDPA, this driver will not + * drive devices with legacy interface. + */ + if (pdev->device < 0x1040) + vf->dev_type = pdev->subsystem_device; + else + vf->dev_type = pdev->device - 0x1040; + vf->base = pcim_iomap_table(pdev); adapter->pdev = pdev; From 6ad31d162a4e0227bd8e854255b37a23a4263900 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Mon, 19 Apr 2021 14:33:25 +0800 Subject: [PATCH 0906/1379] vDPA/ifcvf: enable Intel C5000X-PL virtio-block for vDPA This commit enabled Intel FPGA SmartNIC C5000X-PL virtio-block for vDPA. Signed-off-by: Zhu Lingshan Reviewed-by: Stefano Garzarella Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210419063326.3748-3-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/ifcvf/ifcvf_base.h | 8 +++++++- drivers/vdpa/ifcvf/ifcvf_main.c | 19 ++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index 1c04cd256fa7..0111bfdeb342 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,12 @@ #define C5000X_PL_SUBSYS_VENDOR_ID 0x8086 #define C5000X_PL_SUBSYS_DEVICE_ID 0x0001 -#define IFCVF_SUPPORTED_FEATURES \ +#define C5000X_PL_BLK_VENDOR_ID 0x1AF4 +#define C5000X_PL_BLK_DEVICE_ID 0x1001 +#define C5000X_PL_BLK_SUBSYS_VENDOR_ID 0x8086 +#define C5000X_PL_BLK_SUBSYS_DEVICE_ID 0x0002 + +#define IFCVF_NET_SUPPORTED_FEATURES \ ((1ULL << VIRTIO_NET_F_MAC) | \ (1ULL << VIRTIO_F_ANY_LAYOUT) | \ (1ULL << VIRTIO_F_VERSION_1) | \ diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 66927ec81fa5..9a4a6df91f08 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -168,10 +168,23 @@ static struct ifcvf_hw *vdpa_to_vf(struct vdpa_device *vdpa_dev) static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev) { + struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + struct pci_dev *pdev = adapter->pdev; + u64 features; - features = ifcvf_get_features(vf) & IFCVF_SUPPORTED_FEATURES; + switch (vf->dev_type) { + case VIRTIO_ID_NET: + features = ifcvf_get_features(vf) & IFCVF_NET_SUPPORTED_FEATURES; + break; + case VIRTIO_ID_BLOCK: + features = ifcvf_get_features(vf); + break; + default: + features = 0; + IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type); + } return features; } @@ -514,6 +527,10 @@ static struct pci_device_id ifcvf_pci_ids[] = { C5000X_PL_DEVICE_ID, C5000X_PL_SUBSYS_VENDOR_ID, C5000X_PL_SUBSYS_DEVICE_ID) }, + { PCI_DEVICE_SUB(C5000X_PL_BLK_VENDOR_ID, + C5000X_PL_BLK_DEVICE_ID, + C5000X_PL_BLK_SUBSYS_VENDOR_ID, + C5000X_PL_BLK_SUBSYS_DEVICE_ID) }, { 0 }, }; From 5619003173bad626e7d6bd6241c1855b549f9311 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Mon, 19 Apr 2021 14:33:26 +0800 Subject: [PATCH 0907/1379] vDPA/ifcvf: get_config_size should return dev specific config size get_config_size() should return the size based on the decected device type. Signed-off-by: Zhu Lingshan Reviewed-by: Stefano Garzarella Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210419063326.3748-4-lingshan.zhu@intel.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/ifcvf/ifcvf_main.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 9a4a6df91f08..ab0ab5cf0f6e 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -356,7 +356,24 @@ static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev) { - return sizeof(struct virtio_net_config); + struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); + struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); + struct pci_dev *pdev = adapter->pdev; + size_t size; + + switch (vf->dev_type) { + case VIRTIO_ID_NET: + size = sizeof(struct virtio_net_config); + break; + case VIRTIO_ID_BLOCK: + size = sizeof(struct virtio_blk_config); + break; + default: + size = 0; + IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type); + } + + return size; } static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev, From 45799491a92174ff78d9c46de55d614814bdd3e1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 May 2021 02:24:35 +0900 Subject: [PATCH 0908/1379] kbuild: remove ARCH=sh64 support from top Makefile Commit 37744feebc08 ("sh: remove sh5 support") removed the SUPERH64 support entirely. Remove the left-over code from the top Makefile. Signed-off-by: Masahiro Yamada Acked-by: Arnd Bergmann --- Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Makefile b/Makefile index 3597fd75d5b5..cbf4b18cf51c 100644 --- a/Makefile +++ b/Makefile @@ -399,11 +399,6 @@ ifeq ($(ARCH),sparc64) SRCARCH := sparc endif -# Additional ARCH settings for sh -ifeq ($(ARCH),sh64) - SRCARCH := sh -endif - export cross_compiling := ifneq ($(SRCARCH),$(SUBARCH)) cross_compiling := 1 From b9b34ddbe2076ade359cd5ce7537d5ed019e9807 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 30 Apr 2021 16:21:46 +0200 Subject: [PATCH 0909/1379] bpf: Fix masking negation logic upon negative dst register The negation logic for the case where the off_reg is sitting in the dst register is not correct given then we cannot just invert the add to a sub or vice versa. As a fix, perform the final bitwise and-op unconditionally into AX from the off_reg, then move the pointer from the src to dst and finally use AX as the source for the original pointer arithmetic operation such that the inversion yields a correct result. The single non-AX mov in between is possible given constant blinding is retaining it as it's not an immediate based operation. Fixes: 979d63d50c0c ("bpf: prevent out of bounds speculation on pointer arithmetic") Signed-off-by: Daniel Borkmann Tested-by: Piotr Krysiuk Reviewed-by: Piotr Krysiuk Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8fd552c16763..db347fb125e3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12391,14 +12391,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env) *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); - if (issrc) { - *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, - off_reg); - insn->src_reg = BPF_REG_AX; - } else { - *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, - BPF_REG_AX); - } + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); + if (!issrc) + *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); + insn->src_reg = BPF_REG_AX; if (isneg) insn->code = insn->code == code_add ? code_sub : code_add; From 801c6058d14a82179a7ee17a4b532cac6fad067f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 29 Apr 2021 15:19:37 +0000 Subject: [PATCH 0910/1379] bpf: Fix leakage of uninitialized bpf stack under speculation The current implemented mechanisms to mitigate data disclosure under speculation mainly address stack and map value oob access from the speculative domain. However, Piotr discovered that uninitialized BPF stack is not protected yet, and thus old data from the kernel stack, potentially including addresses of kernel structures, could still be extracted from that 512 bytes large window. The BPF stack is special compared to map values since it's not zero initialized for every program invocation, whereas map values /are/ zero initialized upon their initial allocation and thus cannot leak any prior data in either domain. In the non-speculative domain, the verifier ensures that every stack slot read must have a prior stack slot write by the BPF program to avoid such data leaking issue. However, this is not enough: for example, when the pointer arithmetic operation moves the stack pointer from the last valid stack offset to the first valid offset, the sanitation logic allows for any intermediate offsets during speculative execution, which could then be used to extract any restricted stack content via side-channel. Given for unprivileged stack pointer arithmetic the use of unknown but bounded scalars is generally forbidden, we can simply turn the register-based arithmetic operation into an immediate-based arithmetic operation without the need for masking. This also gives the benefit of reducing the needed instructions for the operation. Given after the work in 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask"), the aux->alu_limit already holds the final immediate value for the offset register with the known scalar. Thus, a simple mov of the immediate to AX register with using AX as the source for the original instruction is sufficient and possible now in this case. Reported-by: Piotr Krysiuk Signed-off-by: Daniel Borkmann Tested-by: Piotr Krysiuk Reviewed-by: Piotr Krysiuk Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++-- kernel/bpf/verifier.c | 27 +++++++++++++++++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6023a1367853..06841517ab1e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -302,10 +302,11 @@ struct bpf_verifier_state_list { }; /* Possible states for alu_state member. */ -#define BPF_ALU_SANITIZE_SRC 1U -#define BPF_ALU_SANITIZE_DST 2U +#define BPF_ALU_SANITIZE_SRC (1U << 0) +#define BPF_ALU_SANITIZE_DST (1U << 1) #define BPF_ALU_NEG_VALUE (1U << 2) #define BPF_ALU_NON_POINTER (1U << 3) +#define BPF_ALU_IMMEDIATE (1U << 4) #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index db347fb125e3..757476c91c98 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6496,6 +6496,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; struct bpf_verifier_state *vstate = env->cur_state; + bool off_is_imm = tnum_is_const(off_reg->var_off); bool off_is_neg = off_reg->smin_value < 0; bool ptr_is_dst_reg = ptr_reg == dst_reg; u8 opcode = BPF_OP(insn->code); @@ -6526,6 +6527,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_limit = abs(tmp_aux->alu_limit - alu_limit); } else { alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; } @@ -12371,7 +12373,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; struct bpf_insn *patch = &insn_buf[0]; - bool issrc, isneg; + bool issrc, isneg, isimm; u32 off_reg; aux = &env->insn_aux_data[i + delta]; @@ -12382,16 +12384,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env) isneg = aux->alu_state & BPF_ALU_NEG_VALUE; issrc = (aux->alu_state & BPF_ALU_SANITIZE) == BPF_ALU_SANITIZE_SRC; + isimm = aux->alu_state & BPF_ALU_IMMEDIATE; off_reg = issrc ? insn->src_reg : insn->dst_reg; - if (isneg) - *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); - *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); - *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); - *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); - *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); - *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); + if (isimm) { + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); + } else { + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); + } if (!issrc) *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); insn->src_reg = BPF_REG_AX; @@ -12399,7 +12406,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code = insn->code == code_add ? code_sub : code_add; *patch++ = *insn; - if (issrc && isneg) + if (issrc && isneg && !isimm) *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); cnt = patch - insn_buf; From ab2165e2e6ed17345ffa8ee88ca764e8788ebcd7 Mon Sep 17 00:00:00 2001 From: Timo Gurr Date: Mon, 3 May 2021 13:08:22 +0200 Subject: [PATCH 0911/1379] ALSA: usb-audio: Add dB range mapping for Sennheiser Communications Headset PC 8 The decibel volume range contains a negative maximum value resulting in pipewire complaining about the device and effectivly having no sound output. The wrong values also resulted in the headset sounding muted already at a mixer level of about ~25%. PipeWire BugLink: https://gitlab.freedesktop.org/pipewire/pipewire/-/issues/1049 BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=212897 Signed-off-by: Timo Gurr Cc: Link: https://lore.kernel.org/r/20210503110822.10222-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/usb/mixer_maps.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sound/usb/mixer_maps.c b/sound/usb/mixer_maps.c index 646deb6244b1..c5794e83fd80 100644 --- a/sound/usb/mixer_maps.c +++ b/sound/usb/mixer_maps.c @@ -337,6 +337,13 @@ static const struct usbmix_name_map bose_companion5_map[] = { { 0 } /* terminator */ }; +/* Sennheiser Communications Headset [PC 8], the dB value is reported as -6 negative maximum */ +static const struct usbmix_dB_map sennheiser_pc8_dB = {-9500, 0}; +static const struct usbmix_name_map sennheiser_pc8_map[] = { + { 9, NULL, .dB = &sennheiser_pc8_dB }, + { 0 } /* terminator */ +}; + /* * Dell usb dock with ALC4020 codec had a firmware problem where it got * screwed up when zero volume is passed; just skip it as a workaround @@ -593,6 +600,11 @@ static const struct usbmix_ctl_map usbmix_ctl_maps[] = { .id = USB_ID(0x17aa, 0x1046), .map = lenovo_p620_rear_map, }, + { + /* Sennheiser Communications Headset [PC 8] */ + .id = USB_ID(0x1395, 0x0025), + .map = sennheiser_pc8_map, + }, { 0 } /* terminator */ }; From 127f1c09c5c84800761cf650b4c4f0a312f569ef Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 3 May 2021 15:08:59 +0200 Subject: [PATCH 0912/1379] parisc: Fix typo in setup.c Signed-off-by: Helge Deller --- arch/parisc/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index e320bae501d3..3fb86ee507dd 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -268,7 +268,7 @@ static int __init parisc_init_resources(void) result = request_resource(&iomem_resource, &local_broadcast); if (result < 0) { printk(KERN_ERR - "%s: failed to claim %saddress space!\n", + "%s: failed to claim %s address space!\n", __FILE__, local_broadcast.name); return result; } From c2036abb625fc7d63ab64fa23999a7e3a90e7412 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 30 Apr 2021 10:26:56 -0500 Subject: [PATCH 0913/1379] dt-bindings: Remove unused Sigma Designs Tango bindings The Sigma Designs Tango support has been removed, but 2 binding docs for NAND and PCIe were missed. Remove them. Cc: Marc Gonzalez Cc: Richard Weinberger Cc: Vignesh Raghavendra Cc: Bjorn Helgaas Cc: linux-mtd@lists.infradead.org Cc: linux-pci@vger.kernel.org Acked-by: Miquel Raynal Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210430153225.3366000-1-robh@kernel.org/ Signed-off-by: Rob Herring --- .../devicetree/bindings/mtd/tango-nand.txt | 38 ------------------- .../devicetree/bindings/pci/tango-pcie.txt | 29 -------------- 2 files changed, 67 deletions(-) delete mode 100644 Documentation/devicetree/bindings/mtd/tango-nand.txt delete mode 100644 Documentation/devicetree/bindings/pci/tango-pcie.txt diff --git a/Documentation/devicetree/bindings/mtd/tango-nand.txt b/Documentation/devicetree/bindings/mtd/tango-nand.txt deleted file mode 100644 index 91c8420241af..000000000000 --- a/Documentation/devicetree/bindings/mtd/tango-nand.txt +++ /dev/null @@ -1,38 +0,0 @@ -Sigma Designs Tango4 NAND Flash Controller (NFC) - -Required properties: - -- compatible: "sigma,smp8758-nand" -- reg: address/size of nfc_reg, nfc_mem, and pbus_reg -- dmas: reference to the DMA channel used by the controller -- dma-names: "rxtx" -- clocks: reference to the system clock -- #address-cells: <1> -- #size-cells: <0> - -Children nodes represent the available NAND chips. -See Documentation/devicetree/bindings/mtd/nand-controller.yaml for generic bindings. - -Example: - - nandc: nand-controller@2c000 { - compatible = "sigma,smp8758-nand"; - reg = <0x2c000 0x30>, <0x2d000 0x800>, <0x20000 0x1000>; - dmas = <&dma0 3>; - dma-names = "rxtx"; - clocks = <&clkgen SYS_CLK>; - #address-cells = <1>; - #size-cells = <0>; - - nand@0 { - reg = <0>; /* CS0 */ - nand-ecc-strength = <14>; - nand-ecc-step-size = <1024>; - }; - - nand@1 { - reg = <1>; /* CS1 */ - nand-ecc-strength = <14>; - nand-ecc-step-size = <1024>; - }; - }; diff --git a/Documentation/devicetree/bindings/pci/tango-pcie.txt b/Documentation/devicetree/bindings/pci/tango-pcie.txt deleted file mode 100644 index 244683836a79..000000000000 --- a/Documentation/devicetree/bindings/pci/tango-pcie.txt +++ /dev/null @@ -1,29 +0,0 @@ -Sigma Designs Tango PCIe controller - -Required properties: - -- compatible: "sigma,smp8759-pcie" -- reg: address/size of PCI configuration space, address/size of register area -- bus-range: defined by size of PCI configuration space -- device_type: "pci" -- #size-cells: <2> -- #address-cells: <3> -- msi-controller -- ranges: translation from system to bus addresses -- interrupts: spec for misc interrupts, spec for MSI - -Example: - - pcie@2e000 { - compatible = "sigma,smp8759-pcie"; - reg = <0x50000000 0x400000>, <0x2e000 0x100>; - bus-range = <0 3>; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; - msi-controller; - ranges = <0x02000000 0x0 0x00400000 0x50400000 0x0 0x3c00000>; - interrupts = - <54 IRQ_TYPE_LEVEL_HIGH>, /* misc interrupts */ - <55 IRQ_TYPE_LEVEL_HIGH>; /* MSI */ - }; From 788dcee0306e1bdbae1a76d1b3478bb899c5838e Mon Sep 17 00:00:00 2001 From: Sid Manning Date: Fri, 23 Apr 2021 15:06:58 -0500 Subject: [PATCH 0914/1379] Hexagon: fix build errors Fix type-o in ptrace.c. Add missing include: asm/hexagon_vm.h Remove superfluous cast. Replace 'p3_0' with 'preds'. Signed-off-by: Sid Manning Add -mlong-calls to build flags. Signed-off-by: Brian Cain Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers --- arch/hexagon/Makefile | 3 +++ arch/hexagon/include/asm/timex.h | 3 ++- arch/hexagon/kernel/ptrace.c | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/hexagon/Makefile b/arch/hexagon/Makefile index c168c6980d05..38264831905b 100644 --- a/arch/hexagon/Makefile +++ b/arch/hexagon/Makefile @@ -10,6 +10,9 @@ LDFLAGS_vmlinux += -G0 # Do not use single-byte enums; these will overflow. KBUILD_CFLAGS += -fno-short-enums +# We must use long-calls: +KBUILD_CFLAGS += -mlong-calls + # Modules must use either long-calls, or use pic/plt. # Use long-calls for now, it's easier. And faster. # KBUILD_CFLAGS_MODULE += -fPIC diff --git a/arch/hexagon/include/asm/timex.h b/arch/hexagon/include/asm/timex.h index 78338d8ada83..8d4ec76fceb4 100644 --- a/arch/hexagon/include/asm/timex.h +++ b/arch/hexagon/include/asm/timex.h @@ -8,6 +8,7 @@ #include #include +#include /* Using TCX0 as our clock. CLOCK_TICK_RATE scheduled to be removed. */ #define CLOCK_TICK_RATE TCX0_CLK_RATE @@ -16,7 +17,7 @@ static inline int read_current_timer(unsigned long *timer_val) { - *timer_val = (unsigned long) __vmgettime(); + *timer_val = __vmgettime(); return 0; } diff --git a/arch/hexagon/kernel/ptrace.c b/arch/hexagon/kernel/ptrace.c index a5a89e944257..8975f9b4cedf 100644 --- a/arch/hexagon/kernel/ptrace.c +++ b/arch/hexagon/kernel/ptrace.c @@ -35,7 +35,7 @@ void user_disable_single_step(struct task_struct *child) static int genregs_get(struct task_struct *target, const struct user_regset *regset, - srtuct membuf to) + struct membuf to) { struct pt_regs *regs = task_pt_regs(target); @@ -54,7 +54,7 @@ static int genregs_get(struct task_struct *target, membuf_store(&to, regs->m0); membuf_store(&to, regs->m1); membuf_store(&to, regs->usr); - membuf_store(&to, regs->p3_0); + membuf_store(&to, regs->preds); membuf_store(&to, regs->gp); membuf_store(&to, regs->ugp); membuf_store(&to, pt_elr(regs)); // pc From 6fff7410f6befe5744d54f0418d65a6322998c09 Mon Sep 17 00:00:00 2001 From: Sid Manning Date: Mon, 26 Apr 2021 13:51:53 -0500 Subject: [PATCH 0915/1379] Hexagon: change jumps to must-extend in futex_atomic_* Cross-section jumps from .fixup section must be extended. Signed-off-by: Sid Manning Signed-off-by: Brian Cain Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers --- arch/hexagon/include/asm/futex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/hexagon/include/asm/futex.h b/arch/hexagon/include/asm/futex.h index 6b9c554aee78..9fb00a0ae89f 100644 --- a/arch/hexagon/include/asm/futex.h +++ b/arch/hexagon/include/asm/futex.h @@ -21,7 +21,7 @@ "3:\n" \ ".section .fixup,\"ax\"\n" \ "4: %1 = #%5;\n" \ - " jump 3b\n" \ + " jump ##3b\n" \ ".previous\n" \ ".section __ex_table,\"a\"\n" \ ".long 1b,4b,2b,4b\n" \ @@ -90,7 +90,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, "3:\n" ".section .fixup,\"ax\"\n" "4: %0 = #%6\n" - " jump 3b\n" + " jump ##3b\n" ".previous\n" ".section __ex_table,\"a\"\n" ".long 1b,4b,2b,4b\n" From aaa44952bbd1d4db14a4d676bf9595bb5db7e7b0 Mon Sep 17 00:00:00 2001 From: Sid Manning Date: Mon, 26 Apr 2021 22:38:31 -0500 Subject: [PATCH 0916/1379] Hexagon: remove DEBUG from comet config Remove CONFIG_DEBUG_INFO from comet configuration. Signed-off-by: Sid Manning Signed-off-by: Brian Cain Tested-by: Nick Desaulniers Reviewed-by: Nick Desaulniers --- arch/hexagon/configs/comet_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/hexagon/configs/comet_defconfig b/arch/hexagon/configs/comet_defconfig index f19ae2ab0aaa..f70daa038e85 100644 --- a/arch/hexagon/configs/comet_defconfig +++ b/arch/hexagon/configs/comet_defconfig @@ -81,4 +81,3 @@ CONFIG_FRAME_WARN=0 CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y # CONFIG_SCHED_DEBUG is not set -CONFIG_DEBUG_INFO=y From f1f99adf05f2138ff2646d756d4674e302e8d02d Mon Sep 17 00:00:00 2001 From: Sid Manning Date: Tue, 13 Oct 2020 16:17:50 -0500 Subject: [PATCH 0917/1379] Hexagon: add target builtins to kernel Add the compiler-rt builtins like memcpy to the hexagon kernel. Signed-off-by: Sid Manning Add SYM_FUNC_START/END, ksyms exports Signed-off-by: Brian Cain Tested-by: Nick Desaulniers --- arch/hexagon/Makefile | 3 -- arch/hexagon/kernel/hexagon_ksyms.c | 8 +-- arch/hexagon/lib/Makefile | 3 +- arch/hexagon/lib/divsi3.S | 67 ++++++++++++++++++++++++ arch/hexagon/lib/memcpy_likely_aligned.S | 56 ++++++++++++++++++++ arch/hexagon/lib/modsi3.S | 46 ++++++++++++++++ arch/hexagon/lib/udivsi3.S | 38 ++++++++++++++ arch/hexagon/lib/umodsi3.S | 36 +++++++++++++ 8 files changed, 249 insertions(+), 8 deletions(-) create mode 100644 arch/hexagon/lib/divsi3.S create mode 100644 arch/hexagon/lib/memcpy_likely_aligned.S create mode 100644 arch/hexagon/lib/modsi3.S create mode 100644 arch/hexagon/lib/udivsi3.S create mode 100644 arch/hexagon/lib/umodsi3.S diff --git a/arch/hexagon/Makefile b/arch/hexagon/Makefile index 38264831905b..74b644ea8a00 100644 --- a/arch/hexagon/Makefile +++ b/arch/hexagon/Makefile @@ -33,9 +33,6 @@ TIR_NAME := r19 KBUILD_CFLAGS += -ffixed-$(TIR_NAME) -DTHREADINFO_REG=$(TIR_NAME) -D__linux__ KBUILD_AFLAGS += -DTHREADINFO_REG=$(TIR_NAME) -LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name 2>/dev/null) -libs-y += $(LIBGCC) - head-y := arch/hexagon/kernel/head.o core-y += arch/hexagon/kernel/ \ diff --git a/arch/hexagon/kernel/hexagon_ksyms.c b/arch/hexagon/kernel/hexagon_ksyms.c index 6fb1aaab1c29..35545a7386a0 100644 --- a/arch/hexagon/kernel/hexagon_ksyms.c +++ b/arch/hexagon/kernel/hexagon_ksyms.c @@ -35,8 +35,8 @@ EXPORT_SYMBOL(_dflt_cache_att); DECLARE_EXPORT(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes); /* Additional functions */ -DECLARE_EXPORT(__divsi3); -DECLARE_EXPORT(__modsi3); -DECLARE_EXPORT(__udivsi3); -DECLARE_EXPORT(__umodsi3); +DECLARE_EXPORT(__hexagon_divsi3); +DECLARE_EXPORT(__hexagon_modsi3); +DECLARE_EXPORT(__hexagon_udivsi3); +DECLARE_EXPORT(__hexagon_umodsi3); DECLARE_EXPORT(csum_tcpudp_magic); diff --git a/arch/hexagon/lib/Makefile b/arch/hexagon/lib/Makefile index 54be529d17a2..a64641e89d5f 100644 --- a/arch/hexagon/lib/Makefile +++ b/arch/hexagon/lib/Makefile @@ -2,4 +2,5 @@ # # Makefile for hexagon-specific library files. # -obj-y = checksum.o io.o memcpy.o memset.o +obj-y = checksum.o io.o memcpy.o memset.o memcpy_likely_aligned.o \ + divsi3.o modsi3.o udivsi3.o umodsi3.o diff --git a/arch/hexagon/lib/divsi3.S b/arch/hexagon/lib/divsi3.S new file mode 100644 index 000000000000..783e09424c2c --- /dev/null +++ b/arch/hexagon/lib/divsi3.S @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include + +SYM_FUNC_START(__hexagon_divsi3) + { + p0 = cmp.gt(r0,#-1) + p1 = cmp.gt(r1,#-1) + r3:2 = vabsw(r1:0) + } + { + p3 = xor(p0,p1) + r4 = sub(r2,r3) + r6 = cl0(r2) + p0 = cmp.gtu(r3,r2) + } + { + r0 = mux(p3,#-1,#1) + r7 = cl0(r3) + p1 = cmp.gtu(r3,r4) + } + { + r0 = mux(p0,#0,r0) + p0 = or(p0,p1) + if (p0.new) jumpr:nt r31 + r6 = sub(r7,r6) + } + { + r7 = r6 + r5:4 = combine(#1,r3) + r6 = add(#1,lsr(r6,#1)) + p0 = cmp.gtu(r6,#4) + } + { + r5:4 = vaslw(r5:4,r7) + if (!p0) r6 = #3 + } + { + loop0(1f,r6) + r7:6 = vlsrw(r5:4,#1) + r1:0 = #0 + } + .falign +1: + { + r5:4 = vlsrw(r5:4,#2) + if (!p0.new) r0 = add(r0,r5) + if (!p0.new) r2 = sub(r2,r4) + p0 = cmp.gtu(r4,r2) + } + { + r7:6 = vlsrw(r7:6,#2) + if (!p0.new) r0 = add(r0,r7) + if (!p0.new) r2 = sub(r2,r6) + p0 = cmp.gtu(r6,r2) + }:endloop0 + { + if (!p0) r0 = add(r0,r7) + } + { + if (p3) r0 = sub(r1,r0) + jumpr r31 + } +SYM_FUNC_END(__hexagon_divsi3) diff --git a/arch/hexagon/lib/memcpy_likely_aligned.S b/arch/hexagon/lib/memcpy_likely_aligned.S new file mode 100644 index 000000000000..6a541fb90a54 --- /dev/null +++ b/arch/hexagon/lib/memcpy_likely_aligned.S @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include + +SYM_FUNC_START(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes) + { + p0 = bitsclr(r1,#7) + p0 = bitsclr(r0,#7) + if (p0.new) r5:4 = memd(r1) + if (p0.new) r7:6 = memd(r1+#8) + } + { + if (!p0) jump:nt .Lmemcpy_call + if (p0) r9:8 = memd(r1+#16) + if (p0) r11:10 = memd(r1+#24) + p0 = cmp.gtu(r2,#64) + } + { + if (p0) jump:nt .Lmemcpy_call + if (!p0) memd(r0) = r5:4 + if (!p0) memd(r0+#8) = r7:6 + p0 = cmp.gtu(r2,#32) + } + { + p1 = cmp.gtu(r2,#40) + p2 = cmp.gtu(r2,#48) + if (p0) r13:12 = memd(r1+#32) + if (p1.new) r15:14 = memd(r1+#40) + } + { + memd(r0+#16) = r9:8 + memd(r0+#24) = r11:10 + } + { + if (p0) memd(r0+#32) = r13:12 + if (p1) memd(r0+#40) = r15:14 + if (!p2) jumpr:t r31 + } + { + p0 = cmp.gtu(r2,#56) + r5:4 = memd(r1+#48) + if (p0.new) r7:6 = memd(r1+#56) + } + { + memd(r0+#48) = r5:4 + if (p0) memd(r0+#56) = r7:6 + jumpr r31 + } + +.Lmemcpy_call: + jump memcpy + +SYM_FUNC_END(__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes) diff --git a/arch/hexagon/lib/modsi3.S b/arch/hexagon/lib/modsi3.S new file mode 100644 index 000000000000..9ea1c86efac2 --- /dev/null +++ b/arch/hexagon/lib/modsi3.S @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include + +SYM_FUNC_START(__hexagon_modsi3) + { + p2 = cmp.ge(r0,#0) + r2 = abs(r0) + r1 = abs(r1) + } + { + r3 = cl0(r2) + r4 = cl0(r1) + p0 = cmp.gtu(r1,r2) + } + { + r3 = sub(r4,r3) + if (p0) jumpr r31 + } + { + p1 = cmp.eq(r3,#0) + loop0(1f,r3) + r0 = r2 + r2 = lsl(r1,r3) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + if (p2) jumpr r31 + } + { + r0 = neg(r0) + jumpr r31 + } +SYM_FUNC_END(__hexagon_modsi3) diff --git a/arch/hexagon/lib/udivsi3.S b/arch/hexagon/lib/udivsi3.S new file mode 100644 index 000000000000..477f27b9311c --- /dev/null +++ b/arch/hexagon/lib/udivsi3.S @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include + +SYM_FUNC_START(__hexagon_udivsi3) + { + r2 = cl0(r0) + r3 = cl0(r1) + r5:4 = combine(#1,#0) + p0 = cmp.gtu(r1,r0) + } + { + r6 = sub(r3,r2) + r4 = r1 + r1:0 = combine(r0,r4) + if (p0) jumpr r31 + } + { + r3:2 = vlslw(r5:4,r6) + loop0(1f,r6) + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r0 = add(r0,r3) + jumpr r31 + } +SYM_FUNC_END(__hexagon_udivsi3) diff --git a/arch/hexagon/lib/umodsi3.S b/arch/hexagon/lib/umodsi3.S new file mode 100644 index 000000000000..280bf06a55e7 --- /dev/null +++ b/arch/hexagon/lib/umodsi3.S @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. All rights reserved. + */ + +#include + +SYM_FUNC_START(__hexagon_umodsi3) + { + r2 = cl0(r0) + r3 = cl0(r1) + p0 = cmp.gtu(r1,r0) + } + { + r2 = sub(r3,r2) + if (p0) jumpr r31 + } + { + loop0(1f,r2) + p1 = cmp.eq(r2,#0) + r2 = lsl(r1,r2) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + jumpr r31 + } +SYM_FUNC_END(__hexagon_umodsi3) From c3f207ab29f793b8c942ce8067ed123f18d5b81b Mon Sep 17 00:00:00 2001 From: Rohith Surabattula Date: Tue, 13 Apr 2021 00:26:42 -0500 Subject: [PATCH 0918/1379] cifs: Deferred close for files When file is closed, SMB2 close request is not sent to server immediately and is deferred for acregmax defined interval. When file is reopened by same process for read or write, the file handle is reused if an oplock is held. When client receives a oplock/lease break, file is closed immediately if reference count is zero, else oplock is downgraded. Signed-off-by: Rohith Surabattula Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 15 +++++++- fs/cifs/cifsglob.h | 17 +++++++++ fs/cifs/cifsproto.h | 11 ++++++ fs/cifs/file.c | 84 +++++++++++++++++++++++++++++++++++++++++++-- fs/cifs/inode.c | 1 + fs/cifs/misc.c | 62 +++++++++++++++++++++++++++++++++ 6 files changed, 187 insertions(+), 3 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 44fd850f05c1..8a6894577697 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -133,6 +133,7 @@ struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; +struct workqueue_struct *deferredclose_wq; __u32 cifs_lock_secret; /* @@ -390,6 +391,8 @@ cifs_alloc_inode(struct super_block *sb) /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ INIT_LIST_HEAD(&cifs_inode->openFileList); INIT_LIST_HEAD(&cifs_inode->llist); + INIT_LIST_HEAD(&cifs_inode->deferred_closes); + spin_lock_init(&cifs_inode->deferred_lock); return &cifs_inode->vfs_inode; } @@ -1637,9 +1640,16 @@ init_cifs(void) goto out_destroy_fileinfo_put_wq; } + deferredclose_wq = alloc_workqueue("deferredclose", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!deferredclose_wq) { + rc = -ENOMEM; + goto out_destroy_cifsoplockd_wq; + } + rc = cifs_fscache_register(); if (rc) - goto out_destroy_cifsoplockd_wq; + goto out_destroy_deferredclose_wq; rc = cifs_init_inodecache(); if (rc) @@ -1707,6 +1717,8 @@ out_destroy_inodecache: cifs_destroy_inodecache(); out_unreg_fscache: cifs_fscache_unregister(); +out_destroy_deferredclose_wq: + destroy_workqueue(deferredclose_wq); out_destroy_cifsoplockd_wq: destroy_workqueue(cifsoplockd_wq); out_destroy_fileinfo_put_wq: @@ -1741,6 +1753,7 @@ exit_cifs(void) cifs_destroy_mids(); cifs_destroy_inodecache(); cifs_fscache_unregister(); + destroy_workqueue(deferredclose_wq); destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); destroy_workqueue(fileinfo_put_wq); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b23a0ee8c6f8..d88b4b523dcc 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1154,6 +1154,14 @@ struct cifs_pending_open { __u32 oplock; }; +struct cifs_deferred_close { + struct list_head dlist; + struct tcon_link *tlink; + __u16 netfid; + __u64 persistent_fid; + __u64 volatile_fid; +}; + /* * This info hangs off the cifsFileInfo structure, pointed to by llist. * This is used to track byte stream locks on the file @@ -1248,6 +1256,9 @@ struct cifsFileInfo { struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ struct work_struct put; /* work for the final part of _put */ + struct delayed_work deferred; + bool oplock_break_received; /* Flag to indicate oplock break */ + bool deferred_scheduled; }; struct cifs_io_parms { @@ -1392,6 +1403,7 @@ struct cifsInodeInfo { #define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ #define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ #define CIFS_INO_LOCK (5) /* lock bit for synchronization */ +#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */ unsigned long flags; spinlock_t writers_lock; unsigned int writers; /* Number of writers on this inode */ @@ -1404,6 +1416,8 @@ struct cifsInodeInfo { struct fscache_cookie *fscache; #endif struct inode vfs_inode; + struct list_head deferred_closes; /* list of deferred closes */ + spinlock_t deferred_lock; /* protection on deferred list */ }; static inline struct cifsInodeInfo * @@ -1871,11 +1885,14 @@ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ void cifs_oplock_break(struct work_struct *work); void cifs_queue_oplock_break(struct cifsFileInfo *cfile); +void smb2_deferred_work_close(struct work_struct *work); +extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; extern struct workqueue_struct *decrypt_wq; extern struct workqueue_struct *fileinfo_put_wq; extern struct workqueue_struct *cifsoplockd_wq; +extern struct workqueue_struct *deferredclose_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c8faa3e82fe7..c6dacce87d3a 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -267,6 +267,17 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); extern void cifs_del_pending_open(struct cifs_pending_open *open); + +extern bool cifs_is_deferred_close(struct cifsFileInfo *cfile, + struct cifs_deferred_close **dclose); + +extern void cifs_add_deferred_close(struct cifsFileInfo *cfile, + struct cifs_deferred_close *dclose); + +extern void cifs_del_deferred_close(struct cifsFileInfo *cfile); + +extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); + extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb3_fs_context *ctx); extern void cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3d4e6e7dac1d..7e97aeabd616 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -322,9 +322,12 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->dentry = dget(dentry); cfile->f_flags = file->f_flags; cfile->invalidHandle = false; + cfile->oplock_break_received = false; + cfile->deferred_scheduled = false; cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); INIT_WORK(&cfile->put, cifsFileInfo_put_work); + INIT_DELAYED_WORK(&cfile->deferred, smb2_deferred_work_close); mutex_init(&cfile->fh_mutex); spin_lock_init(&cfile->file_info_lock); @@ -565,6 +568,23 @@ int cifs_open(struct inode *inode, struct file *file) file->f_op = &cifs_file_direct_ops; } + spin_lock(&CIFS_I(inode)->deferred_lock); + /* Get the cached handle as SMB2 close is deferred */ + rc = cifs_get_readable_path(tcon, full_path, &cfile); + if (rc == 0) { + if (file->f_flags == cfile->f_flags) { + file->private_data = cfile; + cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(inode)->deferred_lock); + goto out; + } else { + spin_unlock(&CIFS_I(inode)->deferred_lock); + _cifsFileInfo_put(cfile, true, false); + } + } else { + spin_unlock(&CIFS_I(inode)->deferred_lock); + } + if (server->oplocks) oplock = REQ_OPLOCK; else @@ -846,11 +866,52 @@ reopen_error_exit: return rc; } +void smb2_deferred_work_close(struct work_struct *work) +{ + struct cifsFileInfo *cfile = container_of(work, + struct cifsFileInfo, deferred.work); + + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + cifs_del_deferred_close(cfile); + cfile->deferred_scheduled = false; + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + _cifsFileInfo_put(cfile, true, false); +} + int cifs_close(struct inode *inode, struct file *file) { + struct cifsFileInfo *cfile; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_deferred_close *dclose; + if (file->private_data != NULL) { - _cifsFileInfo_put(file->private_data, true, false); + cfile = file->private_data; file->private_data = NULL; + dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); + if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && + dclose) { + if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) + inode->i_ctime = inode->i_mtime = current_time(inode); + spin_lock(&cinode->deferred_lock); + cifs_add_deferred_close(cfile, dclose); + if (cfile->deferred_scheduled) { + mod_delayed_work(deferredclose_wq, + &cfile->deferred, cifs_sb->ctx->acregmax); + } else { + /* Deferred close for files */ + queue_delayed_work(deferredclose_wq, + &cfile->deferred, cifs_sb->ctx->acregmax); + cfile->deferred_scheduled = true; + spin_unlock(&cinode->deferred_lock); + return 0; + } + spin_unlock(&cinode->deferred_lock); + _cifsFileInfo_put(cfile, true, false); + } else { + _cifsFileInfo_put(cfile, true, false); + kfree(dclose); + } } /* return code from the ->release op is always ignored */ @@ -1947,7 +2008,8 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { - if (!open_file->invalidHandle) { + if ((!open_file->invalidHandle) && + (!open_file->oplock_break_received)) { /* found a good file */ /* lock it so it will not be closed on us */ cifsFileInfo_get(open_file); @@ -2476,6 +2538,8 @@ retry: if (cfile) cifsFileInfo_put(cfile); free_xid(xid); + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); return rc; } @@ -2584,6 +2648,8 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, unlock_page(page); put_page(page); + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); return rc; } @@ -4744,6 +4810,8 @@ void cifs_oplock_break(struct work_struct *work) struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; bool purge_cache = false; + bool is_deferred = false; + struct cifs_deferred_close *dclose; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); @@ -4791,6 +4859,18 @@ oplock_break_ack: cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); + /* + * When oplock break is received and there are no active + * file handles but cached, then set the flag oplock_break_received. + * So, new open will not use cached handle. + */ + spin_lock(&CIFS_I(inode)->deferred_lock); + is_deferred = cifs_is_deferred_close(cfile, &dclose); + if (is_deferred) { + cfile->oplock_break_received = true; + mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); + } + spin_unlock(&CIFS_I(inode)->deferred_lock); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a0846f788436..b4326ffcefce 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1645,6 +1645,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto unlink_out; } + cifs_close_deferred_file(CIFS_I(inode)); if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = CIFSPOSIXDelFile(xid, tcon, full_path, diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c15a90e422be..e63fbd4a6bfe 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -672,6 +672,68 @@ cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } +bool +cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **pdclose) +{ + struct cifs_deferred_close *dclose; + + list_for_each_entry(dclose, &CIFS_I(d_inode(cfile->dentry))->deferred_closes, dlist) { + if ((dclose->netfid == cfile->fid.netfid) && + (dclose->persistent_fid == cfile->fid.persistent_fid) && + (dclose->volatile_fid == cfile->fid.volatile_fid)) { + *pdclose = dclose; + return true; + } + } + return false; +} + +void +cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *dclose) +{ + bool is_deferred = false; + struct cifs_deferred_close *pdclose; + + is_deferred = cifs_is_deferred_close(cfile, &pdclose); + if (is_deferred) { + kfree(dclose); + return; + } + + dclose->tlink = cfile->tlink; + dclose->netfid = cfile->fid.netfid; + dclose->persistent_fid = cfile->fid.persistent_fid; + dclose->volatile_fid = cfile->fid.volatile_fid; + list_add_tail(&dclose->dlist, &CIFS_I(d_inode(cfile->dentry))->deferred_closes); +} + +void +cifs_del_deferred_close(struct cifsFileInfo *cfile) +{ + bool is_deferred = false; + struct cifs_deferred_close *dclose; + + is_deferred = cifs_is_deferred_close(cfile, &dclose); + if (!is_deferred) + return; + list_del(&dclose->dlist); + kfree(dclose); +} + +void +cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) +{ + struct cifsFileInfo *cfile = NULL; + struct cifs_deferred_close *dclose; + + list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { + spin_lock(&cifs_inode->deferred_lock); + if (cifs_is_deferred_close(cfile, &dclose)) + mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); + spin_unlock(&cifs_inode->deferred_lock); + } +} + /* parses DFS refferal V3 structure * caller is responsible for freeing target_nodes * returns: From 087f757b0129850c99cc9116df4909dac1bce871 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 29 Apr 2021 00:18:43 -0500 Subject: [PATCH 0919/1379] cifs: add shutdown support Various filesystem support the shutdown ioctl which is used by various xfstests. The shutdown ioctl sets a flag on the superblock which prevents open, unlink, symlink, hardlink, rmdir, create etc. on the file system until unmount and remounted. The two flags supported in this patch are: FSOP_GOING_FLAGS_LOGFLUSH and FSOP_GOING_FLAGS_NOLOGFLUSH which require very little other than blocking new operations (since we do not cache writes to metadata on the client with cifs.ko). FSOP_GOING_FLAGS_DEFAULT is not supported yet, but could be added in the future but would need to call syncfs or equivalent to write out pending data on the mount. With this patch various xfstests now work including tests 043 through 046 for example. Signed-off-by: Steve French Reviewed-by: Aurelien Aptel --- fs/cifs/cifs_fs_sb.h | 1 + fs/cifs/cifs_ioctl.h | 16 +++++++++++++ fs/cifs/dir.c | 10 +++++++++ fs/cifs/file.c | 6 +++++ fs/cifs/fs_context.c | 1 + fs/cifs/inode.c | 25 +++++++++++++++++++-- fs/cifs/ioctl.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/link.c | 7 ++++++ fs/cifs/xattr.c | 4 ++++ 9 files changed, 121 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 2a5325a7ae49..9c45b3a82ad9 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -55,6 +55,7 @@ #define CIFS_MOUNT_MODE_FROM_SID 0x10000000 /* retrieve mode from special ACE */ #define CIFS_MOUNT_RO_CACHE 0x20000000 /* assumes share will not change */ #define CIFS_MOUNT_RW_CACHE 0x40000000 /* assumes only client accessing */ +#define CIFS_MOUNT_SHUTDOWN 0x80000000 struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 153d5c842a9b..f262c64516bc 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -78,3 +78,19 @@ struct smb3_notify { #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) +#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) + +/* + * Flags for going down operation + */ +#define CIFS_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define CIFS_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define CIFS_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +static inline bool cifs_forced_shutdown(struct cifs_sb_info *sbi) +{ + if (CIFS_MOUNT_SHUTDOWN & sbi->mnt_cifs_flags) + return true; + else + return false; +} diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 03afad8b24af..263720131986 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -34,6 +34,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "fs_context.h" +#include "cifs_ioctl.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -429,6 +430,9 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, __u32 oplock; struct cifsFileInfo *file_info; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + return -EIO; + /* * Posix open is only called (at lookup time) for file create now. For * opens (rather than creates), because we do not know if it is a file @@ -545,6 +549,9 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", inode, direntry, direntry); + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + return -EIO; + tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); if (IS_ERR(tlink)) @@ -582,6 +589,9 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, return -EINVAL; cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7e97aeabd616..c95893351b6c 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -45,6 +45,7 @@ #include "fscache.h" #include "smbdirect.h" #include "fs_context.h" +#include "cifs_ioctl.h" static inline int cifs_convert_flags(unsigned int flags) { @@ -542,6 +543,11 @@ int cifs_open(struct inode *inode, struct file *file) xid = get_xid(); cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) { + free_xid(xid); + return -EIO; + } + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { free_xid(xid); diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 3e0d016849e3..1d6e0e15b034 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -1642,6 +1642,7 @@ void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb) cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); } } + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SHUTDOWN; return; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b4326ffcefce..728ff45b6667 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -26,7 +26,6 @@ #include #include #include - #include #include "cifsfs.h" #include "cifspdu.h" @@ -38,7 +37,7 @@ #include "cifs_unicode.h" #include "fscache.h" #include "fs_context.h" - +#include "cifs_ioctl.h" static void cifs_set_ops(struct inode *inode) { @@ -1623,6 +1622,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -1876,6 +1878,8 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, mode, inode); cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -1958,6 +1962,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) } cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) { + rc = -EIO; + goto rmdir_exit; + } + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { rc = PTR_ERR(tlink); @@ -2092,6 +2101,9 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, return -EINVAL; cifs_sb = CIFS_SB(source_dir->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -2408,6 +2420,9 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct inode *inode = d_inode(dentry); int rc; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + return -EIO; + /* * We need to be sure that all dirty pages are written and the server * has actual ctime, mtime and file length. @@ -2480,6 +2495,9 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, struct cifsFileInfo *cfile; int rc; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + /* * We need to be sure that all dirty pages are written as they * might fill holes on the server. @@ -2966,6 +2984,9 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); int rc, retries = 0; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + do { if (pTcon->unix_ext) rc = cifs_setattr_unix(direntry, attrs); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 08d99fec593e..ef41fa878793 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -164,6 +164,56 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, return rc; } +static int cifs_shutdown(struct super_block *sb, unsigned long arg) +{ + struct cifs_sb_info *sbi = CIFS_SB(sb); + __u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (__u32 __user *)arg)) + return -EFAULT; + + if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH) + return -EINVAL; + + if (cifs_forced_shutdown(sbi)) + return 0; + + cifs_dbg(VFS, "shut down requested (%d)", flags); +/* trace_cifs_shutdown(sb, flags);*/ + + /* + * see: + * https://man7.org/linux/man-pages/man2/ioctl_xfs_goingdown.2.html + * for more information and description of original intent of the flags + */ + switch (flags) { + /* + * We could add support later for default flag which requires: + * "Flush all dirty data and metadata to disk" + * would need to call syncfs or equivalent to flush page cache for + * the mount and then issue fsync to server (if nostrictsync not set) + */ + case CIFS_GOING_FLAGS_DEFAULT: + cifs_dbg(FYI, "shutdown with default flag not supported\n"); + return -EINVAL; + /* + * FLAGS_LOGFLUSH is easy since it asks to write out metadata (not + * data) but metadata writes are not cached on the client, so can treat + * it similarly to NOLOGFLUSH + */ + case CIFS_GOING_FLAGS_LOGFLUSH: + case CIFS_GOING_FLAGS_NOLOGFLUSH: + sbi->mnt_cifs_flags |= CIFS_MOUNT_SHUTDOWN; + return 0; + default: + return -EINVAL; + } + return 0; +} + long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); @@ -325,6 +375,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; cifs_put_tlink(tlink); break; + case CIFS_IOC_SHUTDOWN: + rc = cifs_shutdown(inode->i_sb, arg); + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 616e1bc0cc0a..1cbe7ec73728 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -30,6 +30,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "smb2proto.h" +#include "cifs_ioctl.h" /* * M-F Symlink Functions - Begin @@ -518,6 +519,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, struct TCP_Server_Info *server; struct cifsInodeInfo *cifsInode; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -682,6 +686,9 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, void *page = alloc_dentry_path(); struct inode *newinode = NULL; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + xid = get_xid(); tlink = cifs_sb_tlink(cifs_sb); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index e351b945135b..aa3e8ca0457c 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -30,6 +30,7 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" #include "cifs_unicode.h" +#include "cifs_ioctl.h" #define MAX_EA_VALUE_SIZE CIFSMaxBufSize #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" /* DACL only */ @@ -421,6 +422,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) const char *full_path; void *page; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) return -EOPNOTSUPP; From aa22ebc3826be23a4b2f776c7ad5079c75611dec Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 30 Apr 2021 17:14:45 -0500 Subject: [PATCH 0920/1379] smb3.1.1: allow dumping GCM256 keys to improve debugging of encrypted shares Previously we were only able to dump CCM or GCM-128 keys (see "smbinfo keys" e.g.) to allow network debugging (e.g. wireshark) of mounts to SMB3.1.1 encrypted shares. But with the addition of GCM-256 support, we have to be able to dump 32 byte instead of 16 byte keys which requires adding an additional ioctl for that. Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifs_ioctl.h | 19 +++++++++++++++++++ fs/cifs/ioctl.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index f262c64516bc..4a97fe12006b 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -57,6 +57,12 @@ struct smb_query_info { /* char buffer[]; */ } __packed; +/* + * Dumping the commonly used 16 byte (e.g. CCM and GCM128) keys still supported + * for backlevel compatibility, but is not sufficient for dumping the less + * frequently used GCM256 (32 byte) keys (see the newer "CIFS_DUMP_FULL_KEY" + * ioctl for dumping decryption info for GCM256 mounts) + */ struct smb3_key_debug_info { __u64 Suid; __u16 cipher_type; @@ -65,6 +71,18 @@ struct smb3_key_debug_info { __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; } __packed; +/* + * Dump full key (32 byte encrypt/decrypt keys instead of 16 bytes) + * is needed if GCM256 (stronger encryption) negotiated + */ +struct smb3_full_key_debug_info { + __u64 Suid; + __u16 cipher_type; + __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ + __u8 smb3encryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */ + __u8 smb3decryptionkey[32]; /* SMB3_ENC_DEC_KEY_SIZE */ +} __packed; + struct smb3_notify { __u32 completion_filter; bool watch_tree; @@ -78,6 +96,7 @@ struct smb3_notify { #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) +#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) #define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) /* diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index ef41fa878793..7d9654f56edc 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -218,6 +218,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); struct smb3_key_debug_info pkey_inf; + struct smb3_full_key_debug_info pfull_key_inf; int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifsFileInfo *pSMBFile = filep->private_data; @@ -354,6 +355,38 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = 0; break; + /* + * Dump full key (32 bytes instead of 16 bytes) is + * needed if GCM256 (stronger encryption) negotiated + */ + case CIFS_DUMP_FULL_KEY: + if (pSMBFile == NULL) + break; + if (!capable(CAP_SYS_ADMIN)) { + rc = -EACCES; + break; + } + + tcon = tlink_tcon(pSMBFile->tlink); + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + break; + } + pfull_key_inf.cipher_type = + le16_to_cpu(tcon->ses->server->cipher_type); + pfull_key_inf.Suid = tcon->ses->Suid; + memcpy(pfull_key_inf.auth_key, tcon->ses->auth_key.response, + 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + memcpy(pfull_key_inf.smb3decryptionkey, + tcon->ses->smb3decryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */); + memcpy(pfull_key_inf.smb3encryptionkey, + tcon->ses->smb3encryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */); + if (copy_to_user((void __user *)arg, &pfull_key_inf, + sizeof(struct smb3_full_key_debug_info))) + rc = -EFAULT; + else + rc = 0; + break; case CIFS_IOC_NOTIFY: if (!S_ISDIR(inode->i_mode)) { /* Notify can only be done on directories */ From 7ba3d1cdb7988ccfbc6e4995dee04510c85fefbc Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 2 May 2021 17:39:30 -0500 Subject: [PATCH 0921/1379] smb3.1.1: allow dumping keys for multiuser mounts When mounted multiuser it is hard to dump keys for the other sessions which makes it hard to debug using network traces (e.g. using wireshark). Suggested-by: Shyam Prasad N Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/ioctl.c | 66 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 20 deletions(-) diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 7d9654f56edc..28ec8d7c521a 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -214,11 +214,54 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) return 0; } +static int cifs_dump_full_key(struct cifs_tcon *tcon, unsigned long arg) +{ + struct smb3_full_key_debug_info pfull_key_inf; + __u64 suid; + struct list_head *tmp; + struct cifs_ses *ses; + bool found = false; + + if (!smb3_encryption_required(tcon)) + return -EOPNOTSUPP; + + ses = tcon->ses; /* default to user id for current user */ + if (get_user(suid, (__u64 __user *)arg)) + suid = 0; + if (suid) { + /* search to see if there is a session with a matching SMB UID */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &tcon->ses->server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + if (ses->Suid == suid) { + found = true; + break; + } + } + spin_unlock(&cifs_tcp_ses_lock); + if (found == false) + return -EINVAL; + } /* else uses default user's SMB UID (ie current user) */ + + pfull_key_inf.cipher_type = le16_to_cpu(ses->server->cipher_type); + pfull_key_inf.Suid = ses->Suid; + memcpy(pfull_key_inf.auth_key, ses->auth_key.response, + 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + memcpy(pfull_key_inf.smb3decryptionkey, ses->smb3decryptionkey, + 32 /* SMB3_ENC_DEC_KEY_SIZE */); + memcpy(pfull_key_inf.smb3encryptionkey, + ses->smb3encryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */); + if (copy_to_user((void __user *)arg, &pfull_key_inf, + sizeof(struct smb3_full_key_debug_info))) + return -EFAULT; + + return 0; +} + long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); struct smb3_key_debug_info pkey_inf; - struct smb3_full_key_debug_info pfull_key_inf; int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifsFileInfo *pSMBFile = filep->private_data; @@ -366,26 +409,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EACCES; break; } - tcon = tlink_tcon(pSMBFile->tlink); - if (!smb3_encryption_required(tcon)) { - rc = -EOPNOTSUPP; - break; - } - pfull_key_inf.cipher_type = - le16_to_cpu(tcon->ses->server->cipher_type); - pfull_key_inf.Suid = tcon->ses->Suid; - memcpy(pfull_key_inf.auth_key, tcon->ses->auth_key.response, - 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); - memcpy(pfull_key_inf.smb3decryptionkey, - tcon->ses->smb3decryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */); - memcpy(pfull_key_inf.smb3encryptionkey, - tcon->ses->smb3encryptionkey, 32 /* SMB3_ENC_DEC_KEY_SIZE */); - if (copy_to_user((void __user *)arg, &pfull_key_inf, - sizeof(struct smb3_full_key_debug_info))) - rc = -EFAULT; - else - rc = 0; + rc = cifs_dump_full_key(tcon, arg); + break; case CIFS_IOC_NOTIFY: if (!S_ISDIR(inode->i_mode)) { From a7277a73984114b38dcb62c8548850800ffe864e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 22 Apr 2021 17:08:57 +0800 Subject: [PATCH 0922/1379] dt-bindings: serial: 8250: Remove duplicated compatible strings The compatible strings "mediatek,*" appears two times, remove one of them. Fixes: e69f5dc623f9 ("dt-bindings: serial: Convert 8250 to json-schema") Signed-off-by: Zhen Lei Link: https://lore.kernel.org/r/20210422090857.583-1-thunder.leizhen@huawei.com Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/serial/8250.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Documentation/devicetree/bindings/serial/8250.yaml b/Documentation/devicetree/bindings/serial/8250.yaml index f0506a917793..41f57c448621 100644 --- a/Documentation/devicetree/bindings/serial/8250.yaml +++ b/Documentation/devicetree/bindings/serial/8250.yaml @@ -99,11 +99,6 @@ properties: - mediatek,mt7622-btif - mediatek,mt7623-btif - const: mediatek,mtk-btif - - items: - - enum: - - mediatek,mt7622-btif - - mediatek,mt7623-btif - - const: mediatek,mtk-btif - items: - const: mrvl,mmp-uart - const: intel,xscale-uart From f4916649f98e2c7bdba38c6597a98c456c17317d Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Thu, 29 Apr 2021 07:53:18 +0000 Subject: [PATCH 0923/1379] cifs: detect dead connections only when echoes are enabled. We can detect server unresponsiveness only if echoes are enabled. Echoes can be disabled under two scenarios: 1. The connection is low on credits, so we've disabled echoes/oplocks. 2. The connection has not seen any request till now (other than negotiate/sess-setup), which is when we enable these two, based on the credits available. So this fix will check for dead connection, only when echo is enabled. Signed-off-by: Shyam Prasad N CC: # v5.8+ Signed-off-by: Steve French --- fs/cifs/connect.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 121d8b4535b0..becd5f807787 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -476,6 +476,7 @@ server_unresponsive(struct TCP_Server_Info *server) */ if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && + (!server->ops->can_echo || server->ops->can_echo(server)) && time_after(jiffies, server->lstrp + 3 * server->echo_interval)) { cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n", (3 * server->echo_interval) / HZ); From 2a30f9440640c418bcfbea9b2b344d268b58e0a2 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 29 Apr 2021 13:05:10 +0000 Subject: [PATCH 0924/1379] libbpf: Fix signed overflow in ringbuf_process_ring One of our benchmarks running in (Google-internal) CI pushes data through the ringbuf faster htan than userspace is able to consume it. In this case it seems we're actually able to get >INT_MAX entries in a single ring_buffer__consume() call. ASAN detected that cnt overflows in this case. Fix by using 64-bit counter internally and then capping the result to INT_MAX before converting to the int return type. Do the same for the ring_buffer__poll(). Fixes: bf99c936f947 (libbpf: Add BPF ring buffer support) Signed-off-by: Brendan Jackman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210429130510.1621665-1-jackmanb@google.com --- tools/lib/bpf/ringbuf.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index e7a8d847161f..1d80ad4e0de8 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -202,9 +202,11 @@ static inline int roundup_len(__u32 len) return (len + 7) / 8 * 8; } -static int ringbuf_process_ring(struct ring* r) +static int64_t ringbuf_process_ring(struct ring* r) { - int *len_ptr, len, err, cnt = 0; + int *len_ptr, len, err; + /* 64-bit to avoid overflow in case of extreme application behavior */ + int64_t cnt = 0; unsigned long cons_pos, prod_pos; bool got_new_data; void *sample; @@ -244,12 +246,14 @@ done: } /* Consume available ring buffer(s) data without event polling. - * Returns number of records consumed across all registered ring buffers, or - * negative number if any of the callbacks return error. + * Returns number of records consumed across all registered ring buffers (or + * INT_MAX, whichever is less), or negative number if any of the callbacks + * return error. */ int ring_buffer__consume(struct ring_buffer *rb) { - int i, err, res = 0; + int64_t err, res = 0; + int i; for (i = 0; i < rb->ring_cnt; i++) { struct ring *ring = &rb->rings[i]; @@ -259,18 +263,24 @@ int ring_buffer__consume(struct ring_buffer *rb) return err; res += err; } + if (res > INT_MAX) + return INT_MAX; return res; } /* Poll for available data and consume records, if any are available. - * Returns number of records consumed, or negative number, if any of the - * registered callbacks returned error. + * Returns number of records consumed (or INT_MAX, whichever is less), or + * negative number, if any of the registered callbacks returned error. */ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) { - int i, cnt, err, res = 0; + int i, cnt; + int64_t err, res = 0; cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms); + if (cnt < 0) + return -errno; + for (i = 0; i < cnt; i++) { __u32 ring_id = rb->events[i].data.fd; struct ring *ring = &rb->rings[ring_id]; @@ -280,7 +290,9 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) return err; res += err; } - return cnt < 0 ? -errno : res; + if (res > INT_MAX) + return INT_MAX; + return res; } /* Get an fd that can be used to sleep until data is available in the ring(s) */ From 5b2abdafbedb902d7d8d3d5e571a38b8900dd15f Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sat, 1 May 2021 16:17:07 +0000 Subject: [PATCH 0925/1379] cifs: use echo_interval even when connection not ready. When the tcp connection is not ready to send requests, we keep retrying echo with an interval of zero. This seems unnecessary, and this fix changes the interval between echoes to what is specified as echo_interval. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index becd5f807787..d7efbcb2d5df 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -392,16 +392,6 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); - unsigned long echo_interval; - - /* - * If we need to renegotiate, set echo interval to zero to - * immediately call echo service where we can renegotiate. - */ - if (server->tcpStatus == CifsNeedNegotiate) - echo_interval = 0; - else - echo_interval = server->echo_interval; /* * We cannot send an echo if it is disabled. @@ -412,7 +402,7 @@ cifs_echo_request(struct work_struct *work) server->tcpStatus == CifsExiting || server->tcpStatus == CifsNew || (server->ops->can_echo && !server->ops->can_echo(server)) || - time_before(jiffies, server->lstrp + echo_interval - HZ)) + time_before(jiffies, server->lstrp + server->echo_interval - HZ)) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; From 80d43cbd46155744ee450d2476ee4fcf2917ae9b Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Wed, 28 Apr 2021 08:13:56 +0200 Subject: [PATCH 0926/1379] block/rnbd-clt: Change queue_depth type in rnbd_clt_session to size_t The member queue_depth in the structure rnbd_clt_session is read from the rtrs client side using the function rtrs_clt_query, which in turn is read from the rtrs_clt structure. It should really be of type size_t. Fixes: 90426e89f54db ("block/rnbd: client: private header with client structs and functions") Signed-off-by: Md Haris Iqbal Reviewed-by: Guoqing Jiang Signed-off-by: Gioh Kim Link: https://lore.kernel.org/r/20210428061359.206794-2-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index 451e7383738f..b5322c5aaac0 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -87,7 +87,7 @@ struct rnbd_clt_session { DECLARE_BITMAP(cpu_queues_bm, NR_CPUS); int __percpu *cpu_rr; /* per-cpu var for CPU round-robin */ atomic_t busy; - int queue_depth; + size_t queue_depth; u32 max_io_size; struct blk_mq_tag_set tag_set; u32 nr_poll_queues; From 292660fa35e8917a78235d39722edf9bbc04cab7 Mon Sep 17 00:00:00 2001 From: Dima Stepanov Date: Wed, 28 Apr 2021 08:13:57 +0200 Subject: [PATCH 0927/1379] block/rnbd: Fix style issues This patch fixes some style issues detected by scripts/checkpatch.pl * Resolve spacing and tab issues * Remove extra braces in rnbd_get_iu * Use num_possible_cpus() instead of NR_CPUS in alloc_sess * Fix the comments styling in rnbd_queue_rq Signed-off-by: Dima Stepanov Signed-off-by: Gioh Kim Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210428061359.206794-3-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index c01786afe1b1..15f831159c31 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -88,7 +88,7 @@ static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, dev->discard_alignment = le32_to_cpu(rsp->discard_alignment); dev->secure_discard = le16_to_cpu(rsp->secure_discard); dev->rotational = rsp->rotational; - dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK); + dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK); dev->fua = !!(rsp->cache_policy & RNBD_FUA); dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE; @@ -351,9 +351,8 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, struct rtrs_permit *permit; iu = kzalloc(sizeof(*iu), GFP_KERNEL); - if (!iu) { + if (!iu) return NULL; - } permit = rnbd_get_permit(sess, con_type, wait); if (unlikely(!permit)) { @@ -805,7 +804,7 @@ static struct rnbd_clt_session *alloc_sess(const char *sessname) mutex_init(&sess->lock); INIT_LIST_HEAD(&sess->devs_list); INIT_LIST_HEAD(&sess->list); - bitmap_zero(sess->cpu_queues_bm, NR_CPUS); + bitmap_zero(sess->cpu_queues_bm, num_possible_cpus()); init_waitqueue_head(&sess->rtrs_waitq); refcount_set(&sess->refcount, 1); @@ -1148,7 +1147,8 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, iu->sgt.sgl = iu->first_sgl; err = sg_alloc_table_chained(&iu->sgt, /* Even-if the request has no segment, - * sglist must have one entry at least */ + * sglist must have one entry at least. + */ blk_rq_nr_phys_segments(rq) ? : 1, iu->sgt.sgl, RNBD_INLINE_SG_CNT); From 1056ad829ec43f9b705b507c2093b05e2088b0b7 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Wed, 28 Apr 2021 08:13:58 +0200 Subject: [PATCH 0928/1379] block/rnbd-clt: Check the return value of the function rtrs_clt_query In case none of the paths are in connected state, the function rtrs_clt_query returns an error. In such a case, error out since the values in the rtrs_attrs structure would be garbage. Fixes: f7a7a5c228d45 ("block/rnbd: client: main functionality") Signed-off-by: Md Haris Iqbal Reviewed-by: Guoqing Jiang Signed-off-by: Jack Wang Signed-off-by: Gioh Kim Link: https://lore.kernel.org/r/20210428061359.206794-4-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 15f831159c31..f855bf1fa8d5 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -691,7 +691,11 @@ static void remap_devs(struct rnbd_clt_session *sess) return; } - rtrs_clt_query(sess->rtrs, &attrs); + err = rtrs_clt_query(sess->rtrs, &attrs); + if (err) { + pr_err("rtrs_clt_query(\"%s\"): %d\n", sess->sessname, err); + return; + } mutex_lock(&sess->lock); sess->max_io_size = attrs.max_io_size; @@ -1294,7 +1298,11 @@ find_and_get_or_create_sess(const char *sessname, err = PTR_ERR(sess->rtrs); goto wake_up_and_put; } - rtrs_clt_query(sess->rtrs, &attrs); + + err = rtrs_clt_query(sess->rtrs, &attrs); + if (err) + goto close_rtrs; + sess->max_io_size = attrs.max_io_size; sess->queue_depth = attrs.queue_depth; sess->nr_poll_queues = nr_poll_queues; From 1e31016b6926c996e9113619c2ce1f42ad74ddd1 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Wed, 28 Apr 2021 08:13:59 +0200 Subject: [PATCH 0929/1379] block/rnbd: Remove all likely and unlikely The IO performance test with fio after removing the likely and unlikely macros in all if-statement shows no performance drop. They do not help for the performance of rnbd. The fio test did random read on 32 rnbd devices and 64 processes. Test environment: - AMD Opteron(tm) Processor 6386 SE - 125G memory - kernel version: 5.4.86 - gcc version: gcc (Debian 8.3.0-6) 8.3.0 - Infiniband controller: InfiniBand: Mellanox Technologies MT26428 [ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE] (rev b0) before read: IOPS=549k, BW=2146MiB/s read: IOPS=544k, BW=2125MiB/s read: IOPS=553k, BW=2158MiB/s read: IOPS=535k, BW=2089MiB/s read: IOPS=543k, BW=2122MiB/s read: IOPS=552k, BW=2154MiB/s average: IOPS=546k, BW=2132MiB/s after read: IOPS=556k, BW=2172MiB/s read: IOPS=561k, BW=2191MiB/s read: IOPS=552k, BW=2156MiB/s read: IOPS=551k, BW=2154MiB/s read: IOPS=562k, BW=2194MiB/s ----------- average: IOPS=556k, BW=2173MiB/s The IOPS and bandwidth got better slightly after removing likely/unlikely. (IOPS= +1.8% BW= +1.9%) But we cannot make sure that removing the likely/unlikely help the performance because it depends on various situations. We only make sure that removing the likely/unlikely does not drop the performance. Signed-off-by: Gioh Kim Reviewed-by: Md Haris Iqbal Link: https://lore.kernel.org/r/20210428061359.206794-5-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 24 ++++++++++++------------ drivers/block/rnbd/rnbd-srv.c | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index f855bf1fa8d5..c604a402cd5c 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -241,7 +241,7 @@ static bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess) cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) { if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags)) continue; - if (unlikely(!test_bit(cpu_q->cpu, sess->cpu_queues_bm))) + if (!test_bit(cpu_q->cpu, sess->cpu_queues_bm)) goto unlock; q = list_first_entry_or_null(&cpu_q->requeue_list, typeof(*q), requeue_list); @@ -320,7 +320,7 @@ static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess, struct rtrs_permit *permit; permit = rtrs_clt_get_permit(sess->rtrs, con_type, wait); - if (likely(permit)) + if (permit) /* We have a subtle rare case here, when all permits can be * consumed before busy counter increased. This is safe, * because loser will get NULL as a permit, observe 0 busy @@ -355,7 +355,7 @@ static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, return NULL; permit = rnbd_get_permit(sess, con_type, wait); - if (unlikely(!permit)) { + if (!permit) { kfree(iu); return NULL; } @@ -1050,7 +1050,7 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, }; err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit, &vec, 1, size, iu->sgt.sgl, sg_cnt); - if (unlikely(err)) { + if (err) { rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n", err); return err; @@ -1081,7 +1081,7 @@ static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev, cpu_q = get_cpu_ptr(sess->cpu_queues); spin_lock_irqsave(&cpu_q->requeue_lock, flags); - if (likely(!test_and_set_bit_lock(0, &q->in_list))) { + if (!test_and_set_bit_lock(0, &q->in_list)) { if (WARN_ON(!list_empty(&q->requeue_list))) goto unlock; @@ -1093,7 +1093,7 @@ static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev, */ smp_mb__before_atomic(); } - if (likely(atomic_read(&sess->busy))) { + if (atomic_read(&sess->busy)) { list_add_tail(&q->requeue_list, &cpu_q->requeue_list); } else { /* Very unlikely, but possible: busy counter was @@ -1121,7 +1121,7 @@ static void rnbd_clt_dev_kick_mq_queue(struct rnbd_clt_dev *dev, if (delay != RNBD_DELAY_IFBUSY) blk_mq_delay_run_hw_queue(hctx, delay); - else if (unlikely(!rnbd_clt_dev_add_to_requeue(dev, q))) + else if (!rnbd_clt_dev_add_to_requeue(dev, q)) /* * If session is not busy we have to restart * the queue ourselves. @@ -1138,12 +1138,12 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, int err; blk_status_t ret = BLK_STS_IOERR; - if (unlikely(dev->dev_state != DEV_STATE_MAPPED)) + if (dev->dev_state != DEV_STATE_MAPPED) return BLK_STS_IOERR; iu->permit = rnbd_get_permit(dev->sess, RTRS_IO_CON, RTRS_PERMIT_NOWAIT); - if (unlikely(!iu->permit)) { + if (!iu->permit) { rnbd_clt_dev_kick_mq_queue(dev, hctx, RNBD_DELAY_IFBUSY); return BLK_STS_RESOURCE; } @@ -1165,9 +1165,9 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); err = rnbd_client_xfer_request(dev, rq, iu); - if (likely(err == 0)) + if (err == 0) return BLK_STS_OK; - if (unlikely(err == -EAGAIN || err == -ENOMEM)) { + if (err == -EAGAIN || err == -ENOMEM) { rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/); ret = BLK_STS_RESOURCE; } @@ -1584,7 +1584,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, struct rnbd_clt_dev *dev; int ret; - if (unlikely(exists_devpath(pathname, sessname))) + if (exists_devpath(pathname, sessname)) return ERR_PTR(-EEXIST); sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr, nr_poll_queues); diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 899dd9d7c10b..aafecfe97055 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -104,7 +104,7 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) rcu_read_lock(); sess_dev = xa_load(&srv_sess->index_idr, dev_id); - if (likely(sess_dev)) + if (sess_dev) ret = kref_get_unless_zero(&sess_dev->kref); rcu_read_unlock(); From be4f361d69f4487ab56eb67b0cd0559fb1895af2 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Wed, 28 Apr 2021 17:35:21 +0200 Subject: [PATCH 0930/1379] s390: dasd: Mundane spelling fixes s/Subssystem/Subsystem/ ......two different places s/reportet/reported/ s/managemnet/management/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20210428153521.2050899-2-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/s390/block/dasd_eckd.h b/drivers/s390/block/dasd_eckd.h index ca24a78a256e..73651211789f 100644 --- a/drivers/s390/block/dasd_eckd.h +++ b/drivers/s390/block/dasd_eckd.h @@ -52,7 +52,7 @@ #define DASD_ECKD_CCW_RCD 0xFA #define DASD_ECKD_CCW_DSO 0xF7 -/* Define Subssystem Function / Orders */ +/* Define Subsystem Function / Orders */ #define DSO_ORDER_RAS 0x81 /* @@ -110,7 +110,7 @@ #define DASD_ECKD_PG_GROUPED 0x10 /* - * Size that is reportet for large volumes in the old 16-bit no_cyl field + * Size that is reported for large volumes in the old 16-bit no_cyl field */ #define LV_COMPAT_CYL 0xFFFE @@ -555,7 +555,7 @@ struct dasd_dso_ras_ext_range { } __packed; /* - * Define Subsytem Operation - Release Allocated Space + * Define Subsystem Operation - Release Allocated Space */ struct dasd_dso_ras_data { __u8 order; @@ -676,7 +676,7 @@ struct dasd_eckd_private { struct dasd_ext_pool_sum eps; u32 real_cyl; - /* alias managemnet */ + /* alias management */ struct dasd_uid uid; struct alias_pav_group *pavgroup; struct alias_lcu *lcu; From c646790a1fcae7738972accc41ccaa4983e5c234 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 29 Apr 2021 11:27:41 +0200 Subject: [PATCH 0931/1379] RDMA/rtrs: fix uninitialized symbol 'cnt' rtrs_clt_rdma_cq_direct returns an ninitialized value in cnt if there is no session. This patch makes rtrs_clt_rdma_cq_direct returns a negative value for block layer not to try again. Fixes: 2958a995edc94 ("block/rnbd-clt: Support polling mode for IO latency optimization") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20210429092741.266533-1-gi-oh.kim@ionos.com Signed-off-by: Jens Axboe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index b74a872387c4..934a2ff18e7f 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -2896,7 +2896,8 @@ EXPORT_SYMBOL(rtrs_clt_request); int rtrs_clt_rdma_cq_direct(struct rtrs_clt *clt, unsigned int index) { - int cnt; + /* If no path, return -1 for block layer not to try again */ + int cnt = -1; struct rtrs_con *con; struct rtrs_clt_sess *sess; struct path_it it; From cd2c7545ae1beac3b6aae033c7f31193b3255946 Mon Sep 17 00:00:00 2001 From: Changheun Lee Date: Mon, 3 May 2021 18:52:03 +0900 Subject: [PATCH 0932/1379] bio: limit bio max size bio size can grow up to 4GB when muli-page bvec is enabled. but sometimes it would lead to inefficient behaviors. in case of large chunk direct I/O, - 32MB chunk read in user space - all pages for 32MB would be merged to a bio structure if the pages physical addresses are contiguous. it makes some delay to submit until merge complete. bio max size should be limited to a proper size. When 32MB chunk read with direct I/O option is coming from userspace, kernel behavior is below now in do_direct_IO() loop. it's timeline. | bio merge for 32MB. total 8,192 pages are merged. | total elapsed time is over 2ms. |------------------ ... ----------------------->| | 8,192 pages merged a bio. | at this time, first bio submit is done. | 1 bio is split to 32 read request and issue. |---------------> |---------------> |---------------> ...... |---------------> |--------------->| total 19ms elapsed to complete 32MB read done from device. | If bio max size is limited with 1MB, behavior is changed below. | bio merge for 1MB. 256 pages are merged for each bio. | total 32 bio will be made. | total elapsed time is over 2ms. it's same. | but, first bio submit timing is fast. about 100us. |--->|--->|--->|---> ... -->|--->|--->|--->|--->| | 256 pages merged a bio. | at this time, first bio submit is done. | and 1 read request is issued for 1 bio. |---------------> |---------------> |---------------> ...... |---------------> |--------------->| total 17ms elapsed to complete 32MB read done from device. | As a result, read request issue timing is faster if bio max size is limited. Current kernel behavior with multipage bvec, super large bio can be created. And it lead to delay first I/O request issue. Signed-off-by: Changheun Lee Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20210503095203.29076-1-nanich.lee@samsung.com Signed-off-by: Jens Axboe --- block/bio.c | 13 +++++++++++-- block/blk-settings.c | 5 +++++ include/linux/bio.h | 4 +++- include/linux/blkdev.h | 2 ++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/block/bio.c b/block/bio.c index 44205dfb6b60..221dc56ba22f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -255,6 +255,13 @@ void bio_init(struct bio *bio, struct bio_vec *table, } EXPORT_SYMBOL(bio_init); +unsigned int bio_max_size(struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + + return bdev ? bdev->bd_disk->queue->limits.bio_max_bytes : UINT_MAX; +} + /** * bio_reset - reinitialize a bio * @bio: bio to reset @@ -866,7 +873,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page, struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { + if (bio->bi_iter.bi_size > bio_max_size(bio) - len) { *same_page = false; return false; } @@ -995,6 +1002,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; + unsigned int bytes_left = bio_max_size(bio) - bio->bi_iter.bi_size; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; bool same_page = false; @@ -1010,7 +1018,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + size = iov_iter_get_pages(iter, pages, bytes_left, nr_pages, + &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; diff --git a/block/blk-settings.c b/block/blk-settings.c index 9c009090c4b5..c646503e55d2 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); */ void blk_set_default_limits(struct queue_limits *lim) { + lim->bio_max_bytes = UINT_MAX; lim->max_segments = BLK_MAX_SEGMENTS; lim->max_discard_segments = 1; lim->max_integrity_segments = 0; @@ -139,6 +140,10 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; + if (check_shl_overflow(max_sectors, SECTOR_SHIFT, + &limits->bio_max_bytes)) + limits->bio_max_bytes = UINT_MAX; + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/include/linux/bio.h b/include/linux/bio.h index a0b4cfdf62a4..f1a99f0a240c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -106,6 +106,8 @@ static inline void *bio_data(struct bio *bio) return NULL; } +extern unsigned int bio_max_size(struct bio *bio); + /** * bio_full - check if the bio is full * @bio: bio to check @@ -119,7 +121,7 @@ static inline bool bio_full(struct bio *bio, unsigned len) if (bio->bi_vcnt >= bio->bi_max_vecs) return true; - if (bio->bi_iter.bi_size > UINT_MAX - len) + if (bio->bi_iter.bi_size > bio_max_size(bio) - len) return true; return false; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b91ba6207365..40c7c4d87aa1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -327,6 +327,8 @@ enum blk_bounce { }; struct queue_limits { + unsigned int bio_max_bytes; + enum blk_bounce bounce; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; From 48582b2e3b87b794a9845d488af2c76ce055502b Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Fri, 30 Apr 2021 11:21:54 -0400 Subject: [PATCH 0933/1379] reset: add missing empty function reset_control_rearm() All other functions are defined for when CONFIG_RESET_CONTROLLER is not set. Fixes: 557acb3d2cd9 ("reset: make shared pulsed reset controls re-triggerable") Link: https://lore.kernel.org/r/20210430152156.21162-2-jim2101024@gmail.com Signed-off-by: Jim Quinlan Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # v5.11+ --- include/linux/reset.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/reset.h b/include/linux/reset.h index b9109efa2a5c..9700124affa3 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -47,6 +47,11 @@ static inline int reset_control_reset(struct reset_control *rstc) return 0; } +static inline int reset_control_rearm(struct reset_control *rstc) +{ + return 0; +} + static inline int reset_control_assert(struct reset_control *rstc) { return 0; From e8d6f9e56187c101b325e8d18f1d4032420d08ff Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Fri, 30 Apr 2021 11:21:55 -0400 Subject: [PATCH 0934/1379] ata: ahci_brcm: Fix use of BCM7216 reset controller This driver may use one of two resets controllers. Keep them in separate variables to keep things simple. The reset controller "rescal" is shared between the AHCI driver and the PCIe driver for the BrcmSTB 7216 chip. Use devm_reset_control_get_optional_shared() to handle this sharing. [bhelgaas: add Jens' ack from v5 posting] Fixes: 272ecd60a636 ("ata: ahci_brcm: BCM7216 reset is self de-asserting") Fixes: c345ec6a50e9 ("ata: ahci_brcm: Support BCM7216 reset controller name") Link: https://lore.kernel.org/r/20210430152156.21162-3-jim2101024@gmail.com Signed-off-by: Jim Quinlan Signed-off-by: Bjorn Helgaas Acked-by: Jens Axboe Acked-by: Florian Fainelli --- drivers/ata/ahci_brcm.c | 46 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/ata/ahci_brcm.c b/drivers/ata/ahci_brcm.c index 5b32df5d33ad..6e9c5ade4c2e 100644 --- a/drivers/ata/ahci_brcm.c +++ b/drivers/ata/ahci_brcm.c @@ -86,7 +86,8 @@ struct brcm_ahci_priv { u32 port_mask; u32 quirks; enum brcm_ahci_version version; - struct reset_control *rcdev; + struct reset_control *rcdev_rescal; + struct reset_control *rcdev_ahci; }; static inline u32 brcm_sata_readreg(void __iomem *addr) @@ -352,8 +353,8 @@ static int brcm_ahci_suspend(struct device *dev) else ret = 0; - if (priv->version != BRCM_SATA_BCM7216) - reset_control_assert(priv->rcdev); + reset_control_assert(priv->rcdev_ahci); + reset_control_rearm(priv->rcdev_rescal); return ret; } @@ -365,10 +366,10 @@ static int __maybe_unused brcm_ahci_resume(struct device *dev) struct brcm_ahci_priv *priv = hpriv->plat_data; int ret = 0; - if (priv->version == BRCM_SATA_BCM7216) - ret = reset_control_reset(priv->rcdev); - else - ret = reset_control_deassert(priv->rcdev); + ret = reset_control_deassert(priv->rcdev_ahci); + if (ret) + return ret; + ret = reset_control_reset(priv->rcdev_rescal); if (ret) return ret; @@ -434,7 +435,6 @@ static int brcm_ahci_probe(struct platform_device *pdev) { const struct of_device_id *of_id; struct device *dev = &pdev->dev; - const char *reset_name = NULL; struct brcm_ahci_priv *priv; struct ahci_host_priv *hpriv; struct resource *res; @@ -456,15 +456,15 @@ static int brcm_ahci_probe(struct platform_device *pdev) if (IS_ERR(priv->top_ctrl)) return PTR_ERR(priv->top_ctrl); - /* Reset is optional depending on platform and named differently */ - if (priv->version == BRCM_SATA_BCM7216) - reset_name = "rescal"; - else - reset_name = "ahci"; - - priv->rcdev = devm_reset_control_get_optional(&pdev->dev, reset_name); - if (IS_ERR(priv->rcdev)) - return PTR_ERR(priv->rcdev); + if (priv->version == BRCM_SATA_BCM7216) { + priv->rcdev_rescal = devm_reset_control_get_optional_shared( + &pdev->dev, "rescal"); + if (IS_ERR(priv->rcdev_rescal)) + return PTR_ERR(priv->rcdev_rescal); + } + priv->rcdev_ahci = devm_reset_control_get_optional(&pdev->dev, "ahci"); + if (IS_ERR(priv->rcdev_ahci)) + return PTR_ERR(priv->rcdev_ahci); hpriv = ahci_platform_get_resources(pdev, 0); if (IS_ERR(hpriv)) @@ -485,10 +485,10 @@ static int brcm_ahci_probe(struct platform_device *pdev) break; } - if (priv->version == BRCM_SATA_BCM7216) - ret = reset_control_reset(priv->rcdev); - else - ret = reset_control_deassert(priv->rcdev); + ret = reset_control_reset(priv->rcdev_rescal); + if (ret) + return ret; + ret = reset_control_deassert(priv->rcdev_ahci); if (ret) return ret; @@ -539,8 +539,8 @@ out_disable_regulators: out_disable_clks: ahci_platform_disable_clks(hpriv); out_reset: - if (priv->version != BRCM_SATA_BCM7216) - reset_control_assert(priv->rcdev); + reset_control_assert(priv->rcdev_ahci); + reset_control_rearm(priv->rcdev_rescal); return ret; } From bb610757fcd74558ad94fe19993fd4470208dd02 Mon Sep 17 00:00:00 2001 From: Jim Quinlan Date: Fri, 30 Apr 2021 11:21:56 -0400 Subject: [PATCH 0935/1379] PCI: brcmstb: Use reset/rearm instead of deassert/assert The Broadcom STB PCIe RC uses a reset control "rescal" for certain chips. The "rescal" implements a "pulse reset" so using assert/deassert is wrong for this device. Instead, we use reset/rearm. We need to use rearm so that we can reset it after a suspend/resume cycle; w/o using "rearm", the "rescal" device will only ever fire once. Of course for suspend/resume to work we also need to put the reset/rearm calls in the suspend and resume routines. Fixes: 740d6c3708a9 ("PCI: brcmstb: Add control of rescal reset") Link: https://lore.kernel.org/r/20210430152156.21162-4-jim2101024@gmail.com Signed-off-by: Jim Quinlan Signed-off-by: Bjorn Helgaas Acked-by: Florian Fainelli --- drivers/pci/controller/pcie-brcmstb.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index e330e6811f0b..3b35d629035e 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -1148,6 +1148,7 @@ static int brcm_pcie_suspend(struct device *dev) brcm_pcie_turn_off(pcie); ret = brcm_phy_stop(pcie); + reset_control_rearm(pcie->rescal); clk_disable_unprepare(pcie->clk); return ret; @@ -1163,9 +1164,13 @@ static int brcm_pcie_resume(struct device *dev) base = pcie->base; clk_prepare_enable(pcie->clk); + ret = reset_control_reset(pcie->rescal); + if (ret) + goto err_disable_clk; + ret = brcm_phy_start(pcie); if (ret) - goto err; + goto err_reset; /* Take bridge out of reset so we can access the SERDES reg */ pcie->bridge_sw_init_set(pcie, 0); @@ -1180,14 +1185,16 @@ static int brcm_pcie_resume(struct device *dev) ret = brcm_pcie_setup(pcie); if (ret) - goto err; + goto err_reset; if (pcie->msi) brcm_msi_set_regs(pcie->msi); return 0; -err: +err_reset: + reset_control_rearm(pcie->rescal); +err_disable_clk: clk_disable_unprepare(pcie->clk); return ret; } @@ -1197,7 +1204,7 @@ static void __brcm_pcie_remove(struct brcm_pcie *pcie) brcm_msi_remove(pcie); brcm_pcie_turn_off(pcie); brcm_phy_stop(pcie); - reset_control_assert(pcie->rescal); + reset_control_rearm(pcie->rescal); clk_disable_unprepare(pcie->clk); } @@ -1278,13 +1285,13 @@ static int brcm_pcie_probe(struct platform_device *pdev) return PTR_ERR(pcie->perst_reset); } - ret = reset_control_deassert(pcie->rescal); + ret = reset_control_reset(pcie->rescal); if (ret) dev_err(&pdev->dev, "failed to deassert 'rescal'\n"); ret = brcm_phy_start(pcie); if (ret) { - reset_control_assert(pcie->rescal); + reset_control_rearm(pcie->rescal); clk_disable_unprepare(pcie->clk); return ret; } From 5cd1a85a6c3f49ad008c008299e0dbe9ac33fba6 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Sun, 25 Apr 2021 18:40:58 +0800 Subject: [PATCH 0936/1379] of: overlay: Remove redundant assignment to ret Variable ret is set to zero but this value is never read as it is overwritten with a new value later on, hence it is a redundant assignment and can be removed. Cleans up the following clang-analyzer warning: drivers/of/overlay.c:1197:2: warning: Value stored to 'ret' is never read [clang-analyzer-deadcode.DeadStores]. drivers/of/overlay.c:1026:2: warning: Value stored to 'ret' is never read [clang-analyzer-deadcode.DeadStores]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/1619347258-55002-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Rob Herring --- drivers/of/overlay.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/of/overlay.c b/drivers/of/overlay.c index 180c6fb3ef36..d80160cf34bb 100644 --- a/drivers/of/overlay.c +++ b/drivers/of/overlay.c @@ -1024,7 +1024,6 @@ int of_overlay_fdt_apply(const void *overlay_fdt, u32 overlay_fdt_size, struct device_node *overlay_root = NULL; *ovcs_id = 0; - ret = 0; if (overlay_fdt_size < sizeof(struct fdt_header) || fdt_check_header(overlay_fdt)) { @@ -1195,8 +1194,6 @@ int of_overlay_remove(int *ovcs_id) struct overlay_changeset *ovcs; int ret, ret_apply, ret_tmp; - ret = 0; - if (devicetree_corrupt()) { pr_err("suspect devicetree state, refuse to remove overlay\n"); ret = -EBUSY; From 62b3b3660aff66433d71f142ab6ed2baaea25025 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 29 Apr 2021 14:44:09 +0200 Subject: [PATCH 0937/1379] dt-bindings: PCI: rcar-pci-host: Document missing R-Car H1 support scripts/checkpatch.pl -f drivers/pci/controller/pcie-rcar-host.c: WARNING: DT compatible string "renesas,pcie-r8a7779" appears un-documented -- check ./Documentation/devicetree/bindings/ #853: FILE: drivers/pci/controller/pcie-rcar-host.c:853: + { .compatible = "renesas,pcie-r8a7779", Re-add the compatible value for R-Car H1, which was lost during the json-schema conversion. Make the "resets" property optional on R-Car H1, as it is not present yet on R-Car Gen1 SoCs. Fixes: 0d69ce3c2c63d4db ("dt-bindings: PCI: rcar-pci-host: Convert bindings to json-schema") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/fb0bb969cd0e5872ab5eac70e070242c0d8a5b81.1619700202.git.geert+renesas@glider.be Signed-off-by: Rob Herring --- .../devicetree/bindings/pci/rcar-pci-host.yaml | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pci/rcar-pci-host.yaml b/Documentation/devicetree/bindings/pci/rcar-pci-host.yaml index 4a2bcc0158e2..8fdfbc763d70 100644 --- a/Documentation/devicetree/bindings/pci/rcar-pci-host.yaml +++ b/Documentation/devicetree/bindings/pci/rcar-pci-host.yaml @@ -17,6 +17,7 @@ allOf: properties: compatible: oneOf: + - const: renesas,pcie-r8a7779 # R-Car H1 - items: - enum: - renesas,pcie-r8a7742 # RZ/G1H @@ -74,7 +75,16 @@ required: - clocks - clock-names - power-domains - - resets + +if: + not: + properties: + compatible: + contains: + const: renesas,pcie-r8a7779 +then: + required: + - resets unevaluatedProperties: false From 7935bb56e21b2add81149f4def8e59b4133fe57c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 29 Apr 2021 14:45:52 +0200 Subject: [PATCH 0938/1379] dt-bindings: media: renesas,vin: Make resets optional on R-Car Gen1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "resets" property is not present on R-Car Gen1 SoCs. Supporting it would require migrating from renesas,cpg-clocks to renesas,cpg-mssr. Fixes: 905fc6b1bfb4a631 ("dt-bindings: rcar-vin: Convert bindings to json-schema") Signed-off-by: Geert Uytterhoeven Reviewed-by: Niklas Söderlund Link: https://lore.kernel.org/r/217c8197efaee7d803b22d433abb0ea8e33b84c6.1619700314.git.geert+renesas@glider.be Signed-off-by: Rob Herring --- .../bindings/media/renesas,vin.yaml | 44 ++++++++++++------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/Documentation/devicetree/bindings/media/renesas,vin.yaml b/Documentation/devicetree/bindings/media/renesas,vin.yaml index fe7c4cbfe4ba..dd1a5ce5896c 100644 --- a/Documentation/devicetree/bindings/media/renesas,vin.yaml +++ b/Documentation/devicetree/bindings/media/renesas,vin.yaml @@ -193,23 +193,35 @@ required: - interrupts - clocks - power-domains - - resets -if: - properties: - compatible: - contains: - enum: - - renesas,vin-r8a7778 - - renesas,vin-r8a7779 - - renesas,rcar-gen2-vin -then: - required: - - port -else: - required: - - renesas,id - - ports +allOf: + - if: + not: + properties: + compatible: + contains: + enum: + - renesas,vin-r8a7778 + - renesas,vin-r8a7779 + then: + required: + - resets + + - if: + properties: + compatible: + contains: + enum: + - renesas,vin-r8a7778 + - renesas,vin-r8a7779 + - renesas,rcar-gen2-vin + then: + required: + - port + else: + required: + - renesas,id + - ports additionalProperties: false From 643001b47adc844ae33510c4bb93c236667008a3 Mon Sep 17 00:00:00 2001 From: Lv Yunlong Date: Sun, 2 May 2021 04:58:18 -0700 Subject: [PATCH 0939/1379] ethernet:enic: Fix a use after free bug in enic_hard_start_xmit In enic_hard_start_xmit, it calls enic_queue_wq_skb(). Inside enic_queue_wq_skb, if some error happens, the skb will be freed by dev_kfree_skb(skb). But the freed skb is still used in skb_tx_timestamp(skb). My patch makes enic_queue_wq_skb() return error and goto spin_unlock() incase of error. The solution is provided by Govind. See https://lkml.org/lkml/2021/4/30/961. Fixes: fb7516d42478e ("enic: add sw timestamp support") Signed-off-by: Lv Yunlong Acked-by: Govindarajulu Varadarajan Signed-off-by: David S. Miller --- drivers/net/ethernet/cisco/enic/enic_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index f48957a17c3a..d0a8f7106958 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -768,7 +768,7 @@ static inline int enic_queue_wq_skb_encap(struct enic *enic, struct vnic_wq *wq, return err; } -static inline void enic_queue_wq_skb(struct enic *enic, +static inline int enic_queue_wq_skb(struct enic *enic, struct vnic_wq *wq, struct sk_buff *skb) { unsigned int mss = skb_shinfo(skb)->gso_size; @@ -814,6 +814,7 @@ static inline void enic_queue_wq_skb(struct enic *enic, wq->to_use = buf->next; dev_kfree_skb(skb); } + return err; } /* netif_tx_lock held, process context with BHs disabled, or BH */ @@ -857,7 +858,8 @@ static netdev_tx_t enic_hard_start_xmit(struct sk_buff *skb, return NETDEV_TX_BUSY; } - enic_queue_wq_skb(enic, wq, skb); + if (enic_queue_wq_skb(enic, wq, skb)) + goto error; if (vnic_wq_desc_avail(wq) < MAX_SKB_FRAGS + ENIC_DESC_MAX_SPLITS) netif_tx_stop_queue(txq); @@ -865,6 +867,7 @@ static netdev_tx_t enic_hard_start_xmit(struct sk_buff *skb, if (!netdev_xmit_more() || netif_xmit_stopped(txq)) vnic_wq_doorbell(wq); +error: spin_unlock(&enic->wq_lock[txq_map]); return NETDEV_TX_OK; From 22008f560bd36028dd459692794edf2e11e017a5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 3 May 2021 04:36:58 +0800 Subject: [PATCH 0940/1379] Revert "Revert "sctp: Fix bundling of SHUTDOWN with COOKIE-ACK"" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 7e9269a5acec6d841d22e12770a0b02db4f5d8f2. As Jere notice, commit 35b4f24415c8 ("sctp: do asoc update earlier in sctp_sf_do_dupcook_a") only keeps the SHUTDOWN and COOKIE-ACK with the same asoc, not transport. So we have to bring this patch back. Reported-by: Jere Leppänen Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index a42844904b81..5fc3f3a331f8 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1903,7 +1903,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_a( */ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); return sctp_sf_do_9_2_start_shutdown(net, ep, asoc, - SCTP_ST_CHUNK(0), NULL, + SCTP_ST_CHUNK(0), repl, commands); } else { sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, @@ -5549,7 +5549,7 @@ enum sctp_disposition sctp_sf_do_9_2_start_shutdown( * in the Cumulative TSN Ack field the last sequential TSN it * has received from the peer. */ - reply = sctp_make_shutdown(asoc, NULL); + reply = sctp_make_shutdown(asoc, arg); if (!reply) goto nomem; @@ -6147,7 +6147,7 @@ enum sctp_disposition sctp_sf_autoclose_timer_expire( disposition = SCTP_DISPOSITION_CONSUME; if (sctp_outq_is_empty(&asoc->outqueue)) { disposition = sctp_sf_do_9_2_start_shutdown(net, ep, asoc, type, - arg, commands); + NULL, commands); } return disposition; From 7aa4e54739be1471d8dd78f3c0148164085bdc20 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 3 May 2021 04:36:59 +0800 Subject: [PATCH 0941/1379] Revert "sctp: Fix SHUTDOWN CTSN Ack in the peer restart case" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 12dfd78e3a74825e6f0bc8df7ef9f938fbc6bfe3. This can be reverted as shutdown and cookie_ack chunk are using the same asoc since commit 35b4f24415c8 ("sctp: do asoc update earlier in sctp_sf_do_dupcook_a"). Reported-by: Jere Leppänen Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 5f9a7c028274..5b44d228b6ca 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -858,11 +858,7 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, struct sctp_chunk *retval; __u32 ctsn; - if (chunk && chunk->asoc) - ctsn = sctp_tsnmap_get_ctsn(&chunk->asoc->peer.tsn_map); - else - ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); - + ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, From f282df0391267fb2b263da1cc3233aa6fb81defc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 3 May 2021 04:41:20 +0800 Subject: [PATCH 0942/1379] sctp: fix a SCTP_MIB_CURRESTAB leak in sctp_sf_do_dupcook_b Normally SCTP_MIB_CURRESTAB is always incremented once asoc enter into ESTABLISHED from the state < ESTABLISHED and decremented when the asoc is being deleted. However, in sctp_sf_do_dupcook_b(), the asoc's state can be changed to ESTABLISHED from the state >= ESTABLISHED where it shouldn't increment SCTP_MIB_CURRESTAB. Otherwise, one asoc may increment MIB_CURRESTAB multiple times but only decrement once at the end. I was able to reproduce it by using scapy to do the 4-way shakehands, after that I replayed the COOKIE-ECHO chunk with 'peer_vtag' field changed to different values, and SCTP_MIB_CURRESTAB was incremented multiple times and never went back to 0 even when the asoc was freed. This patch is to fix it by only incrementing SCTP_MIB_CURRESTAB when the state < ESTABLISHED in sctp_sf_do_dupcook_b(). Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5fc3f3a331f8..fd1e319eda00 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -1953,7 +1953,8 @@ static enum sctp_disposition sctp_sf_do_dupcook_b( sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_ESTABLISHED)); - SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB); + if (asoc->state < SCTP_STATE_ESTABLISHED) + SCTP_INC_STATS(net, SCTP_MIB_CURRESTAB); sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); /* Update the content of current association. */ From 2e9f60932a2c19e8a11b4a69d419f107024b05a0 Mon Sep 17 00:00:00 2001 From: Phillip Potter Date: Sun, 2 May 2021 22:34:42 +0100 Subject: [PATCH 0943/1379] net: hsr: check skb can contain struct hsr_ethhdr in fill_frame_info Check at start of fill_frame_info that the MAC header in the supplied skb is large enough to fit a struct hsr_ethhdr, as otherwise this is not a valid HSR frame. If it is too small, return an error which will then cause the callers to clean up the skb. Fixes a KMSAN-found uninit-value bug reported by syzbot at: https://syzkaller.appspot.com/bug?id=f7e9b601f1414f814f7602a82b6619a8d80bce3f Reported-by: syzbot+e267bed19bfc5478fb33@syzkaller.appspotmail.com Signed-off-by: Phillip Potter Signed-off-by: David S. Miller --- net/hsr/hsr_forward.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index b218e4594009..6852e9bccf5b 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -520,6 +520,10 @@ static int fill_frame_info(struct hsr_frame_info *frame, struct ethhdr *ethhdr; __be16 proto; + /* Check if skb contains hsr_ethhdr */ + if (skb->mac_len < sizeof(struct hsr_ethhdr)) + return -EINVAL; + memset(frame, 0, sizeof(*frame)); frame->is_supervision = is_supervision_frame(port->hsr, skb); frame->node_src = hsr_get_node(port, &hsr->node_db, skb, From 01bfe5e8e428b475982a98a46cca5755726f3f7f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 3 May 2021 05:11:41 +0800 Subject: [PATCH 0944/1379] Revert "net/sctp: fix race condition in sctp_destroy_sock" This reverts commit b166a20b07382b8bc1dcee2a448715c9c2c81b5b. This one has to be reverted as it introduced a dead lock, as syzbot reported: CPU0 CPU1 ---- ---- lock(&net->sctp.addr_wq_lock); lock(slock-AF_INET6); lock(&net->sctp.addr_wq_lock); lock(slock-AF_INET6); CPU0 is the thread of sctp_addr_wq_timeout_handler(), and CPU1 is that of sctp_close(). The original issue this commit fixed will be fixed in the next patch. Reported-by: syzbot+959223586843e69a2674@syzkaller.appspotmail.com Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b7b90135c36a..76a388b5021c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1520,9 +1520,11 @@ static void sctp_close(struct sock *sk, long timeout) /* Supposedly, no process has access to the socket, but * the net layers still may. + * Also, sctp_destroy_sock() needs to be called with addr_wq_lock + * held and that should be grabbed before socket lock. */ - local_bh_disable(); - bh_lock_sock(sk); + spin_lock_bh(&net->sctp.addr_wq_lock); + bh_lock_sock_nested(sk); /* Hold the sock, since sk_common_release() will put sock_put() * and we have just a little more cleanup. @@ -1531,7 +1533,7 @@ static void sctp_close(struct sock *sk, long timeout) sk_common_release(sk); bh_unlock_sock(sk); - local_bh_enable(); + spin_unlock_bh(&net->sctp.addr_wq_lock); sock_put(sk); @@ -4991,6 +4993,9 @@ static int sctp_init_sock(struct sock *sk) sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); + /* Nothing can fail after this block, otherwise + * sctp_destroy_sock() will be called without addr_wq_lock held + */ if (net->sctp.default_auto_asconf) { spin_lock(&sock_net(sk)->sctp.addr_wq_lock); list_add_tail(&sp->auto_asconf_list, @@ -5025,9 +5030,7 @@ static void sctp_destroy_sock(struct sock *sk) if (sp->do_auto_asconf) { sp->do_auto_asconf = 0; - spin_lock_bh(&sock_net(sk)->sctp.addr_wq_lock); list_del(&sp->auto_asconf_list); - spin_unlock_bh(&sock_net(sk)->sctp.addr_wq_lock); } sctp_endpoint_free(sp->ep); local_bh_disable(); From 34e5b01186858b36c4d7c87e1a025071e8e2401f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 3 May 2021 05:11:42 +0800 Subject: [PATCH 0945/1379] sctp: delay auto_asconf init until binding the first addr As Or Cohen described: If sctp_destroy_sock is called without sock_net(sk)->sctp.addr_wq_lock held and sp->do_auto_asconf is true, then an element is removed from the auto_asconf_splist without any proper locking. This can happen in the following functions: 1. In sctp_accept, if sctp_sock_migrate fails. 2. In inet_create or inet6_create, if there is a bpf program attached to BPF_CGROUP_INET_SOCK_CREATE which denies creation of the sctp socket. This patch is to fix it by moving the auto_asconf init out of sctp_init_sock(), by which inet_create()/inet6_create() won't need to operate it in sctp_destroy_sock() when calling sk_common_release(). It also makes more sense to do auto_asconf init while binding the first addr, as auto_asconf actually requires an ANY addr bind, see it in sctp_addr_wq_timeout_handler(). This addresses CVE-2021-23133. Fixes: 610236587600 ("bpf: Add new cgroup attach type to enable sock modifications") Reported-by: Or Cohen Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 76a388b5021c..40f9f6c4a0a1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -357,6 +357,18 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, return af; } +static void sctp_auto_asconf_init(struct sctp_sock *sp) +{ + struct net *net = sock_net(&sp->inet.sk); + + if (net->sctp.default_auto_asconf) { + spin_lock(&net->sctp.addr_wq_lock); + list_add_tail(&sp->auto_asconf_list, &net->sctp.auto_asconf_splist); + spin_unlock(&net->sctp.addr_wq_lock); + sp->do_auto_asconf = 1; + } +} + /* Bind a local address either to an endpoint or to an association. */ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) { @@ -418,8 +430,10 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) return -EADDRINUSE; /* Refresh ephemeral port. */ - if (!bp->port) + if (!bp->port) { bp->port = inet_sk(sk)->inet_num; + sctp_auto_asconf_init(sp); + } /* Add the address to the bind address list. * Use GFP_ATOMIC since BHs will be disabled. @@ -4993,19 +5007,6 @@ static int sctp_init_sock(struct sock *sk) sk_sockets_allocated_inc(sk); sock_prot_inuse_add(net, sk->sk_prot, 1); - /* Nothing can fail after this block, otherwise - * sctp_destroy_sock() will be called without addr_wq_lock held - */ - if (net->sctp.default_auto_asconf) { - spin_lock(&sock_net(sk)->sctp.addr_wq_lock); - list_add_tail(&sp->auto_asconf_list, - &net->sctp.auto_asconf_splist); - sp->do_auto_asconf = 1; - spin_unlock(&sock_net(sk)->sctp.addr_wq_lock); - } else { - sp->do_auto_asconf = 0; - } - local_bh_enable(); return 0; @@ -9401,6 +9402,8 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, return err; } + sctp_auto_asconf_init(newsp); + /* Move any messages in the old socket's receive queue that are for the * peeled off association to the new socket's receive queue. */ From d362fd0be456dba2d3d58a90b7a193962776562b Mon Sep 17 00:00:00 2001 From: Xie He Date: Sun, 2 May 2021 20:51:36 -0700 Subject: [PATCH 0946/1379] Revert "drivers/net/wan/hdlc_fr: Fix a double free in pvc_xmit" This reverts commit 1b479fb80160 ("drivers/net/wan/hdlc_fr: Fix a double free in pvc_xmit"). 1. This commit is incorrect. "__skb_pad" will NOT free the skb on failure when its "free_on_error" parameter is "false". 2. This commit claims to fix my commit. But it didn't CC me?? Fixes: 1b479fb80160 ("drivers/net/wan/hdlc_fr: Fix a double free in pvc_xmit") Cc: Lv Yunlong Signed-off-by: Xie He Signed-off-by: David S. Miller --- drivers/net/wan/hdlc_fr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index 4d9dc7d15908..0720f5f92caa 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -415,7 +415,7 @@ static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev) if (pad > 0) { /* Pad the frame with zeros */ if (__skb_pad(skb, pad, false)) - goto out; + goto drop; skb_put(skb, pad); } } @@ -448,9 +448,8 @@ static netdev_tx_t pvc_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; drop: - kfree_skb(skb); -out: dev->stats.tx_dropped++; + kfree_skb(skb); return NETDEV_TX_OK; } From bd1af6b5fffd36c12997bd48d61d39dc5796fa7b Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Mon, 3 May 2021 17:10:50 +0200 Subject: [PATCH 0947/1379] Documentation: ABI: sysfs-class-net-qmi: document pass-through file Add documentation for /sys/class/net//qmi/pass_through Signed-off-by: Daniele Palmas Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net-qmi | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-net-qmi b/Documentation/ABI/testing/sysfs-class-net-qmi index ed79f5893421..47e6b9732337 100644 --- a/Documentation/ABI/testing/sysfs-class-net-qmi +++ b/Documentation/ABI/testing/sysfs-class-net-qmi @@ -58,3 +58,19 @@ Description: Indicates the mux id associated to the qmimux network interface during its creation. + +What: /sys/class/net//qmi/pass_through +Date: January 2021 +KernelVersion: 5.12 +Contact: Subash Abhinov Kasiviswanathan +Description: + Boolean. Default: 'N' + + Set this to 'Y' to enable 'pass-through' mode, allowing packets + in MAP format to be passed on to the stack. + + Normally the rmnet driver (CONFIG_RMNET) is then used to process + and demultiplex these packets. + + 'Pass-through' mode can be enabled when the device is in + 'raw-ip' mode only. From c7d13358b6a2f49f81a34aa323a2d0878a0532a2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 30 Apr 2021 14:00:13 +0200 Subject: [PATCH 0948/1379] netfilter: xt_SECMARK: add new revision to fix structure layout This extension breaks when trying to delete rules, add a new revision to fix this. Fixes: 5e6874cdb8de ("[SECMARK]: Add xtables SECMARK target") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_SECMARK.h | 6 ++ net/netfilter/xt_SECMARK.c | 88 ++++++++++++++++++----- 2 files changed, 75 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/netfilter/xt_SECMARK.h b/include/uapi/linux/netfilter/xt_SECMARK.h index 1f2a708413f5..beb2cadba8a9 100644 --- a/include/uapi/linux/netfilter/xt_SECMARK.h +++ b/include/uapi/linux/netfilter/xt_SECMARK.h @@ -20,4 +20,10 @@ struct xt_secmark_target_info { char secctx[SECMARK_SECCTX_MAX]; }; +struct xt_secmark_target_info_v1 { + __u8 mode; + char secctx[SECMARK_SECCTX_MAX]; + __u32 secid; +}; + #endif /*_XT_SECMARK_H_target */ diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 75625d13e976..498a0bf6f044 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -24,10 +24,9 @@ MODULE_ALIAS("ip6t_SECMARK"); static u8 mode; static unsigned int -secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) +secmark_tg(struct sk_buff *skb, const struct xt_secmark_target_info_v1 *info) { u32 secmark = 0; - const struct xt_secmark_target_info *info = par->targinfo; switch (mode) { case SECMARK_MODE_SEL: @@ -41,7 +40,7 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static int checkentry_lsm(struct xt_secmark_target_info *info) +static int checkentry_lsm(struct xt_secmark_target_info_v1 *info) { int err; @@ -73,15 +72,15 @@ static int checkentry_lsm(struct xt_secmark_target_info *info) return 0; } -static int secmark_tg_check(const struct xt_tgchk_param *par) +static int +secmark_tg_check(const char *table, struct xt_secmark_target_info_v1 *info) { - struct xt_secmark_target_info *info = par->targinfo; int err; - if (strcmp(par->table, "mangle") != 0 && - strcmp(par->table, "security") != 0) { + if (strcmp(table, "mangle") != 0 && + strcmp(table, "security") != 0) { pr_info_ratelimited("only valid in \'mangle\' or \'security\' table, not \'%s\'\n", - par->table); + table); return -EINVAL; } @@ -116,25 +115,76 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par) } } -static struct xt_target secmark_tg_reg __read_mostly = { - .name = "SECMARK", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = secmark_tg_check, - .destroy = secmark_tg_destroy, - .target = secmark_tg, - .targetsize = sizeof(struct xt_secmark_target_info), - .me = THIS_MODULE, +static int secmark_tg_check_v0(const struct xt_tgchk_param *par) +{ + struct xt_secmark_target_info *info = par->targinfo; + struct xt_secmark_target_info_v1 newinfo = { + .mode = info->mode, + }; + int ret; + + memcpy(newinfo.secctx, info->secctx, SECMARK_SECCTX_MAX); + + ret = secmark_tg_check(par->table, &newinfo); + info->secid = newinfo.secid; + + return ret; +} + +static unsigned int +secmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_secmark_target_info *info = par->targinfo; + struct xt_secmark_target_info_v1 newinfo = { + .secid = info->secid, + }; + + return secmark_tg(skb, &newinfo); +} + +static int secmark_tg_check_v1(const struct xt_tgchk_param *par) +{ + return secmark_tg_check(par->table, par->targinfo); +} + +static unsigned int +secmark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + return secmark_tg(skb, par->targinfo); +} + +static struct xt_target secmark_tg_reg[] __read_mostly = { + { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = secmark_tg_check_v0, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v0, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, + }, + { + .name = "SECMARK", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = secmark_tg_check_v1, + .destroy = secmark_tg_destroy, + .target = secmark_tg_v1, + .targetsize = sizeof(struct xt_secmark_target_info_v1), + .usersize = offsetof(struct xt_secmark_target_info_v1, secid), + .me = THIS_MODULE, + }, }; static int __init secmark_tg_init(void) { - return xt_register_target(&secmark_tg_reg); + return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } static void __exit secmark_tg_exit(void) { - xt_unregister_target(&secmark_tg_reg); + xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); } module_init(secmark_tg_init); From 43016d02cf6e46edfc4696452251d34bba0c0435 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 May 2021 13:51:15 +0200 Subject: [PATCH 0949/1379] netfilter: arptables: use pernet ops struct during unregister Like with iptables and ebtables, hook unregistration has to use the pernet ops struct, not the template. This triggered following splat: hook not found, pf 3 num 0 WARNING: CPU: 0 PID: 224 at net/netfilter/core.c:480 __nf_unregister_net_hook+0x1eb/0x610 net/netfilter/core.c:480 [..] nf_unregister_net_hook net/netfilter/core.c:502 [inline] nf_unregister_net_hooks+0x117/0x160 net/netfilter/core.c:576 arpt_unregister_table_pre_exit+0x67/0x80 net/ipv4/netfilter/arp_tables.c:1565 Fixes: f9006acc8dfe5 ("netfilter: arp_tables: pass table pointer via nf_hook_ops") Reported-by: syzbot+dcccba8a1e41a38cb9df@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_arp/arp_tables.h | 3 +-- net/ipv4/netfilter/arp_tables.c | 5 ++--- net/ipv4/netfilter/arptable_filter.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h index 2aab9612f6ab..4f9a4b3c5892 100644 --- a/include/linux/netfilter_arp/arp_tables.h +++ b/include/linux/netfilter_arp/arp_tables.h @@ -53,8 +53,7 @@ int arpt_register_table(struct net *net, const struct xt_table *table, const struct arpt_replace *repl, const struct nf_hook_ops *ops); void arpt_unregister_table(struct net *net, const char *name); -void arpt_unregister_table_pre_exit(struct net *net, const char *name, - const struct nf_hook_ops *ops); +void arpt_unregister_table_pre_exit(struct net *net, const char *name); extern unsigned int arpt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct xt_table *table); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index cf20316094d0..c53f14b94356 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1556,13 +1556,12 @@ out_free: return ret; } -void arpt_unregister_table_pre_exit(struct net *net, const char *name, - const struct nf_hook_ops *ops) +void arpt_unregister_table_pre_exit(struct net *net, const char *name) { struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name); if (table) - nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks)); } EXPORT_SYMBOL(arpt_unregister_table_pre_exit); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index b8f45e9bbec8..6922612df456 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -54,7 +54,7 @@ static int __net_init arptable_filter_table_init(struct net *net) static void __net_exit arptable_filter_net_pre_exit(struct net *net) { - arpt_unregister_table_pre_exit(net, "filter", arpfilter_ops); + arpt_unregister_table_pre_exit(net, "filter"); } static void __net_exit arptable_filter_net_exit(struct net *net) From ac31565c21937eee9117e43c9cd34f557f6f1cb8 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Wed, 28 Apr 2021 17:44:24 +0800 Subject: [PATCH 0950/1379] xsk: Fix for xp_aligned_validate_desc() when len == chunk_size When desc->len is equal to chunk_size, it is legal. But when the xp_aligned_validate_desc() got chunk_end from desc->addr + desc->len pointing to the next chunk during the check, it caused the check to fail. This problem was first introduced in bbff2f321a86 ("xsk: new descriptor addressing scheme"). Later in 2b43470add8c ("xsk: Introduce AF_XDP buffer allocation API") this piece of code was moved into the new function called xp_aligned_validate_desc(). This function was then moved into xsk_queue.h via 26062b185eee ("xsk: Explicitly inline functions and move definitions"). Fixes: bbff2f321a86 ("xsk: new descriptor addressing scheme") Signed-off-by: Xuan Zhuo Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20210428094424.54435-1-xuanzhuo@linux.alibaba.com --- net/xdp/xsk_queue.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 2ac3802c2cd7..9d2a89d793c0 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -128,13 +128,12 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 chunk, chunk_end; + u64 chunk; - chunk = xp_aligned_extract_addr(pool, desc->addr); - chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); - if (chunk != chunk_end) + if (desc->len > pool->chunk_size) return false; + chunk = xp_aligned_extract_addr(pool, desc->addr); if (chunk >= pool->addrs_cnt) return false; From eef8abdaedf8084bfda66cd2eecf7eebbdff2c16 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 Apr 2021 23:31:37 -0700 Subject: [PATCH 0951/1379] Documentation: input: joydev file corrections Fix typos, grammar, punctuation in Documentation/input/joydev/*.rst files. Signed-off-by: Randy Dunlap Cc: Dmitry Torokhov Link: https://lore.kernel.org/r/20210429063137.20232-1-rdunlap@infradead.org Signed-off-by: Jonathan Corbet --- Documentation/input/joydev/joystick-api.rst | 14 +++++------ Documentation/input/joydev/joystick.rst | 26 ++++++++++----------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/input/joydev/joystick-api.rst b/Documentation/input/joydev/joystick-api.rst index 95803e2e8cd0..af5934c10c1c 100644 --- a/Documentation/input/joydev/joystick-api.rst +++ b/Documentation/input/joydev/joystick-api.rst @@ -71,7 +71,7 @@ The possible values of ``type`` are:: #define JS_EVENT_INIT 0x80 /* initial state of device */ As mentioned above, the driver will issue synthetic JS_EVENT_INIT ORed -events on open. That is, if it's issuing a INIT BUTTON event, the +events on open. That is, if it's issuing an INIT BUTTON event, the current type value will be:: int type = JS_EVENT_BUTTON | JS_EVENT_INIT; /* 0x81 */ @@ -100,8 +100,8 @@ is, you have both an axis 0 and a button 0). Generally, =============== ======= Hats vary from one joystick type to another. Some can be moved in 8 -directions, some only in 4, The driver, however, always reports a hat as two -independent axis, even if the hardware doesn't allow independent movement. +directions, some only in 4. The driver, however, always reports a hat as two +independent axes, even if the hardware doesn't allow independent movement. js_event.value @@ -188,10 +188,10 @@ One reason for emptying the queue is that if it gets full you'll start missing events since the queue is finite, and older events will get overwritten. -The other reason is that you want to know all what happened, and not +The other reason is that you want to know all that happened, and not delay the processing till later. -Why can get the queue full? Because you don't empty the queue as +Why can the queue get full? Because you don't empty the queue as mentioned, or because too much time elapses from one read to another and too many events to store in the queue get generated. Note that high system load may contribute to space those reads even more. @@ -277,7 +277,7 @@ to be in the stable part of the API, and therefore may change without warning in following releases of the driver. Both JSIOCSCORR and JSIOCGCORR expect &js_corr to be able to hold -information for all axis. That is, struct js_corr corr[MAX_AXIS]; +information for all axes. That is, struct js_corr corr[MAX_AXIS]; struct js_corr is defined as:: @@ -328,7 +328,7 @@ To test the state of the buttons, second_button_state = js.buttons & 2; The axis values do not have a defined range in the original 0.x driver, -except for that the values are non-negative. The 1.2.8+ drivers use a +except that the values are non-negative. The 1.2.8+ drivers use a fixed range for reporting the values, 1 being the minimum, 128 the center, and 255 maximum value. diff --git a/Documentation/input/joydev/joystick.rst b/Documentation/input/joydev/joystick.rst index 9746fd76cc58..f615906a0821 100644 --- a/Documentation/input/joydev/joystick.rst +++ b/Documentation/input/joydev/joystick.rst @@ -133,15 +133,15 @@ And add a line to your rc script executing that file:: This way, after the next reboot your joystick will remain calibrated. You can also add the ``jscal -p`` line to your shutdown script. -HW specific driver information -============================== +Hardware-specific driver information +==================================== In this section each of the separate hardware specific drivers is described. Analog joysticks ---------------- -The analog.c uses the standard analog inputs of the gameport, and thus +The analog.c driver uses the standard analog inputs of the gameport, and thus supports all standard joysticks and gamepads. It uses a very advanced routine for this, allowing for data precision that can't be found on any other system. @@ -266,7 +266,7 @@ to: * Logitech WingMan Extreme Digital 3D ADI devices are autodetected, and the driver supports up to two (any -combination of) devices on a single gameport, using an Y-cable or chained +combination of) devices on a single gameport, using a Y-cable or chained together. Logitech WingMan Joystick, Logitech WingMan Attack, Logitech WingMan @@ -288,7 +288,7 @@ supports: * Gravis Xterminator DualControl All these devices are autodetected, and you can even use any combination -of up to two of these pads either chained together or using an Y-cable on a +of up to two of these pads either chained together or using a Y-cable on a single gameport. GrIP MultiPort isn't supported yet. Gravis Stinger is a serial device and is @@ -311,7 +311,7 @@ allow connecting analog joysticks to them, you'll need to load the analog driver as well to handle the attached joysticks. The trackball should work with USB mousedev module as a normal mouse. See -the USB documentation for how to setup an USB mouse. +the USB documentation for how to setup a USB mouse. ThrustMaster DirectConnect (BSP) -------------------------------- @@ -332,7 +332,7 @@ If you have one of these, contact me. TMDC devices are autodetected, and thus no parameters to the module are needed. Up to two TMDC devices can be connected to one gameport, using -an Y-cable. +a Y-cable. Creative Labs Blaster --------------------- @@ -342,7 +342,7 @@ the: * Creative Blaster GamePad Cobra -Up to two of these can be used on a single gameport, using an Y-cable. +Up to two of these can be used on a single gameport, using a Y-cable. Genius Digital joysticks ------------------------ @@ -381,7 +381,7 @@ card, 16 in case you have two in your system. Trident 4DWave / Aureal Vortex ------------------------------ -Soundcards with a Trident 4DWave DX/NX or Aureal Vortex/Vortex2 chipsets +Soundcards with a Trident 4DWave DX/NX or Aureal Vortex/Vortex2 chipset provide an "Enhanced Game Port" mode where the soundcard handles polling the joystick. This mode is supported by the pcigame.c module. Once loaded the analog driver can use the enhanced features of these gameports.. @@ -454,7 +454,7 @@ Devices currently supported by spaceball.c are: * SpaceTec SpaceBall 4000 FLX In addition to having the spaceorb/spaceball and serport modules in the -kernel, you also need to attach a serial port to it. to do that, run the +kernel, you also need to attach a serial port to it. To do that, run the inputattach program:: inputattach --spaceorb /dev/tts/x & @@ -466,7 +466,7 @@ or:: where /dev/tts/x is the serial port which the device is connected to. After doing this, the device will be reported and will start working. -There is one caveat with the SpaceOrb. The button #6, the on the bottom +There is one caveat with the SpaceOrb. The button #6, the one on the bottom side of the orb, although reported as an ordinary button, causes internal recentering of the spaceorb, moving the zero point to the position in which the ball is at the moment of pressing the button. So, think first before @@ -500,7 +500,7 @@ joy-magellan module. It currently supports only the: * Magellan 3D * Space Mouse -models, the additional buttons on the 'Plus' versions are not supported yet. +models; the additional buttons on the 'Plus' versions are not supported yet. To use it, you need to attach the serial port to the driver using the:: @@ -575,7 +575,7 @@ FAQ :A: The device files don't exist. Create them (see section 2.2). :Q: Is it possible to connect my old Atari/Commodore/Amiga/console joystick - or pad that uses a 9-pin D-type cannon connector to the serial port of my + or pad that uses a 9-pin D-type Cannon connector to the serial port of my PC? :A: Yes, it is possible, but it'll burn your serial port or the pad. It won't work, of course. From bd8ede484750b36fb81bf666b6ee87678843d01d Mon Sep 17 00:00:00 2001 From: Wu XiangCheng Date: Fri, 30 Apr 2021 20:22:35 +0800 Subject: [PATCH 0952/1379] docs/zh_CN: Adjust order and content of zh_CN/index.rst Adjust order and content of zh_CN/index.rst to make it clear, complete introductions and TODOLists. Signed-off-by: Wu XiangCheng Link: https://lore.kernel.org/r/20210430122234.GA655@bobwxc.top Signed-off-by: Jonathan Corbet --- Documentation/translations/zh_CN/index.rst | 172 +++++++++++++++++++-- 1 file changed, 160 insertions(+), 12 deletions(-) diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index ee6b20ca9080..d56d6b7092e6 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -1,36 +1,184 @@ +.. SPDX-License-Identifier: GPL-2.0 + .. raw:: latex \renewcommand\thesection* \renewcommand\thesubsection* +.. _linux_doc_zh: + 中文翻译 ======== -这些手册包含有关如何开发内核的整体信息。内核社区非常庞大,一年下来有数千名开发 -人员做出贡献。 与任何大型社区一样,知道如何完成任务将使得更改合并的过程变得更 -加容易。 -翻译计划: -内核中文文档欢迎任何翻译投稿,特别是关于内核用户和管理员指南部分。 +.. note:: + + **翻译计划:** + 内核中文文档欢迎任何翻译投稿,特别是关于内核用户和管理员指南部分。 + +许可证文档 +---------- + +下面的文档介绍了Linux内核源代码的许可证(GPLv2)、如何在源代码树中正确标记 +单个文件的许可证、以及指向完整许可证文本的链接。 + +* Documentation/translations/zh_CN/process/license-rules.rst + +用户文档 +-------- + +下面的手册是为内核用户编写的——即那些试图让它在给定系统上以最佳方式工作的 +用户。 .. toctree:: :maxdepth: 2 admin-guide/index + +TODOList: + +* kbuild/index + +固件相关文档 +------------ + +下列文档描述了内核需要的平台固件相关信息。 + +TODOList: + +* firmware-guide/index +* devicetree/index + +应用程序开发人员文档 +-------------------- + +用户空间API手册涵盖了描述应用程序开发人员可见内核接口方面的文档。 + +TODOlist: + +* userspace-api/index + +内核开发简介 +------------ + +这些手册包含有关如何开发内核的整体信息。内核社区非常庞大,一年下来有数千名 +开发人员做出贡献。与任何大型社区一样,知道如何完成任务将使得更改合并的过程 +变得更加容易。 + +.. toctree:: + :maxdepth: 2 + process/index dev-tools/index doc-guide/index kernel-hacking/index - filesystems/index - arm64/index - sound/index - cpu-freq/index - mips/index - iio/index - riscv/index + +TODOList: + +* trace/index +* maintainer/index +* fault-injection/index +* livepatch/index +* rust/index + +内核API文档 +----------- + +以下手册从内核开发人员的角度详细介绍了特定的内核子系统是如何工作的。这里的 +大部分信息都是直接从内核源代码获取的,并根据需要添加补充材料(或者至少是在 +我们设法添加的时候——可能不是所有的都是有需要的)。 + +.. toctree:: + :maxdepth: 2 + core-api/index + cpu-freq/index + iio/index + sound/index + filesystems/index + +TODOList: + +* driver-api/index +* locking/index +* accounting/index +* block/index +* cdrom/index +* ide/index +* fb/index +* fpga/index +* hid/index +* i2c/index +* isdn/index +* infiniband/index +* leds/index +* netlabel/index +* networking/index +* pcmcia/index +* power/index +* target/index +* timers/index +* spi/index +* w1/index +* watchdog/index +* virt/index +* input/index +* hwmon/index +* gpu/index +* security/index +* crypto/index +* vm/index +* bpf/index +* usb/index +* PCI/index +* scsi/index +* misc-devices/index +* scheduler/index +* mhi/index + +体系结构无关文档 +---------------- + +TODOList: + +* asm-annotations + +特定体系结构文档 +---------------- + +.. toctree:: + :maxdepth: 2 + + mips/index + arm64/index + riscv/index openrisc/index +TODOList: + +* arm/index +* ia64/index +* m68k/index +* nios2/index +* parisc/index +* powerpc/index +* s390/index +* sh/index +* sparc/index +* x86/index +* xtensa/index + +其他文档 +-------- + +有几份未排序的文档似乎不适合放在文档的其他部分,或者可能需要进行一些调整和/或 +转换为reStructureText格式,也有可能太旧。 + +TODOList: + +* staging/index +* watch_queue + 目录和表格 ---------- From 0ca0d55526d338d926e85352d3e44dd85728676f Mon Sep 17 00:00:00 2001 From: Yanteng Si Date: Wed, 28 Apr 2021 18:07:20 +0800 Subject: [PATCH 0953/1379] docs/core-api: Consistent code style all `example` in this file should be replaced with ``example``. Signed-off-by: Yanteng Si Acked-by: Matthias Maennich Link: https://lore.kernel.org/r/20210428100720.1076276-1-siyanteng@loongson.cn Signed-off-by: Jonathan Corbet --- Documentation/core-api/symbol-namespaces.rst | 26 ++++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/core-api/symbol-namespaces.rst b/Documentation/core-api/symbol-namespaces.rst index 9b76337f6756..5ad9e0abe42c 100644 --- a/Documentation/core-api/symbol-namespaces.rst +++ b/Documentation/core-api/symbol-namespaces.rst @@ -43,14 +43,14 @@ exporting of kernel symbols to the kernel symbol table, variants of these are available to export symbols into a certain namespace: EXPORT_SYMBOL_NS() and EXPORT_SYMBOL_NS_GPL(). They take one additional argument: the namespace. Please note that due to macro expansion that argument needs to be a -preprocessor symbol. E.g. to export the symbol `usb_stor_suspend` into the -namespace `USB_STORAGE`, use:: +preprocessor symbol. E.g. to export the symbol ``usb_stor_suspend`` into the +namespace ``USB_STORAGE``, use:: EXPORT_SYMBOL_NS(usb_stor_suspend, USB_STORAGE); -The corresponding ksymtab entry struct `kernel_symbol` will have the member -`namespace` set accordingly. A symbol that is exported without a namespace will -refer to `NULL`. There is no default namespace if none is defined. `modpost` +The corresponding ksymtab entry struct ``kernel_symbol`` will have the member +``namespace`` set accordingly. A symbol that is exported without a namespace will +refer to ``NULL``. There is no default namespace if none is defined. ``modpost`` and kernel/module.c make use the namespace at build time or module load time, respectively. @@ -64,7 +64,7 @@ and EXPORT_SYMBOL_GPL() macro expansions that do not specify a namespace. There are multiple ways of specifying this define and it depends on the subsystem and the maintainer's preference, which one to use. The first option -is to define the default namespace in the `Makefile` of the subsystem. E.g. to +is to define the default namespace in the ``Makefile`` of the subsystem. E.g. to export all symbols defined in usb-common into the namespace USB_COMMON, add a line like this to drivers/usb/common/Makefile:: @@ -96,7 +96,7 @@ using a statement like:: MODULE_IMPORT_NS(USB_STORAGE); -This will create a `modinfo` tag in the module for each imported namespace. +This will create a ``modinfo`` tag in the module for each imported namespace. This has the side effect, that the imported namespaces of a module can be inspected with modinfo:: @@ -113,7 +113,7 @@ metadata definitions like MODULE_AUTHOR() or MODULE_LICENSE(). Refer to section 4. Loading Modules that use namespaced Symbols ============================================== -At module loading time (e.g. `insmod`), the kernel will check each symbol +At module loading time (e.g. ``insmod``), the kernel will check each symbol referenced from the module for its availability and whether the namespace it might be exported to has been imported by the module. The default behaviour of the kernel is to reject loading modules that don't specify sufficient imports. @@ -138,19 +138,19 @@ missing imports. Fixing missing imports can be done with:: A typical scenario for module authors would be:: - write code that depends on a symbol from a not imported namespace - - `make` + - ``make`` - notice the warning of modpost telling about a missing import - - run `make nsdeps` to add the import to the correct code location + - run ``make nsdeps`` to add the import to the correct code location For subsystem maintainers introducing a namespace, the steps are very similar. -Again, `make nsdeps` will eventually add the missing namespace imports for +Again, ``make nsdeps`` will eventually add the missing namespace imports for in-tree modules:: - move or add symbols to a namespace (e.g. with EXPORT_SYMBOL_NS()) - - `make` (preferably with an allmodconfig to cover all in-kernel + - ``make`` (preferably with an allmodconfig to cover all in-kernel modules) - notice the warning of modpost telling about a missing import - - run `make nsdeps` to add the import to the correct code location + - run ``make nsdeps`` to add the import to the correct code location You can also run nsdeps for external module builds. A typical usage is:: From da2e56634b262fddfa40b2cfedd24de841418cd3 Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:26 -0700 Subject: [PATCH 0954/1379] ktest: Minor cleanup with uninitialized variable $build_options Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 4e2450964517..18fd4fd117dd 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -2594,6 +2594,9 @@ sub build { # Run old config regardless, to enforce min configurations make_oldconfig; + if (not defined($build_options)){ + $build_options = ""; + } my $build_ret = run_command "$make $build_options", $buildlog; if (defined($post_build)) { From 2676eb4bfc546dc490d2abd155877a580c74c294 Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:27 -0700 Subject: [PATCH 0955/1379] ktest: Add example config for using VMware VMs This duplicates the KVM/Qemu config with specific notes for how to use it with VMware VMs on Workstation, Player, or Fusion. The main thing to be aware of is how the serial port is exposed which is a unix pipe, and will need something like ncat to get into ktest's monitoring Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/examples/vmware.conf | 137 +++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 tools/testing/ktest/examples/vmware.conf diff --git a/tools/testing/ktest/examples/vmware.conf b/tools/testing/ktest/examples/vmware.conf new file mode 100644 index 000000000000..61958163d242 --- /dev/null +++ b/tools/testing/ktest/examples/vmware.conf @@ -0,0 +1,137 @@ +# +# This config is an example usage of ktest.pl with a vmware guest +# +# VMware Setup: +# ------------- +# - Edit the Virtual Machine ("Edit virtual machine settings") +# - Add a Serial Port +# - You almost certainly want it set "Connect at power on" +# - Select "Use socket (named pipe)" +# - Select a name that you'll recognize, like 'ktestserialpipe' +# - From: Server +# - To: A Virtual Machine +# - Save +# - Make sure you note the name, it will be in the base directory of the +# virtual machine (where the "disks" are stored. The default +# is /var/lib/vmware// +# +# - Make note of the path to the VM +# +# +# The guest is called 'Guest' and this would be something that +# could be run on the host to test a virtual machine target. + +MACHINE = Guest + +# Name of the serial pipe you set in the VMware settings +VMWARE_SERIAL_NAME = + +# Define a variable of the name of the VM +# Noting this needs to be the name of the kmx file, and usually, the +# name of the directory that it's in. If the directory and name +# differ change the VMWARE_VM_DIR accordingly. +# Please ommit the .kmx extension +VMWARE_VM_NAME = + +# VM dir name. This is usually the same as the virtual machine's name, +# but not always the case. Change if they differ +VMWARE_VM_DIR = ${VMWARE_VM_NAME} + +# Base directory that the Virtual machine is contained in +# /var/lib/vmware is the default on Linux +VMWARE_VM_BASE_DIR = /var/lib/vmware/${VMWARE_VM_DIR} + +# Use ncat to read the unix pipe. Anything that can read the Unix Pipe +# and output it's contents to stdout will work +CONSOLE = /usr/bin/ncat -U ${VMWARE_VM_BASE_DIR}/${VMWARE_SERIAL_NAME} + +# Define what version of Workstation you are using +# This is used by vmrun to use the appropriate appripriate pieces to +# test this. In all likelihood you want 'ws' or 'player' +# Valid options: +# ws - Workstation (Windows or Linux host) +# fusion - Fusion (Mac host) +# player - Using VMware Player (Windows or Linux host) +# Note: vmrun has to run directly on the host machine +VMWARE_HOST_TYPE = ws + +# VMware provides `vmrun` to allow you to do certain things to the virtual machine +# This should hard reset the VM and force a boot +VMWARE_POWER_CYCLE = /usr/bin/vmrun -T ${VMWARE_HOST_TYPE} reset ${VMWARE_VM_BASE_DIR}/${VMWARE_VM_NAME}.kmx nogui + +#*************************************# +# This part is the same as test.conf # +#*************************************# + +# The include files will set up the type of test to run. Just set TEST to +# which test you want to run. +# +# TESTS = patchcheck, randconfig, boot, test, config-bisect, bisect, min-config +# +# See the include/*.conf files that define these tests +# +TEST := patchcheck + +# Some tests may have more than one test to run. Define MULTI := 1 to run +# the extra tests. +MULTI := 0 + +# In case you want to differentiate which type of system you are testing +BITS := 64 + +# REBOOT = none, error, fail, empty +# See include/defaults.conf +REBOOT := empty + + +# The defaults file will set up various settings that can be used by all +# machine configs. +INCLUDE include/defaults.conf + + +#*************************************# +# Now we are different from test.conf # +#*************************************# + + +# The example here assumes that Guest is running a Fedora release +# that uses dracut for its initfs. The POST_INSTALL will be executed +# after the install of the kernel and modules are complete. +# +POST_INSTALL = ${SSH} /sbin/dracut -f /boot/initramfs-test.img $KERNEL_VERSION + +# Guests sometimes get stuck on reboot. We wait 3 seconds after running +# the reboot command and then do a full power-cycle of the guest. +# This forces the guest to restart. +# +POWERCYCLE_AFTER_REBOOT = 3 + +# We do the same after the halt command, but this time we wait 20 seconds. +POWEROFF_AFTER_HALT = 20 + + +# As the defaults.conf file has a POWER_CYCLE option already defined, +# and options can not be defined in the same section more than once +# (all DEFAULTS sections are considered the same). We use the +# DEFAULTS OVERRIDE to tell ktest.pl to ignore the previous defined +# options, for the options set in the OVERRIDE section. +# +DEFAULTS OVERRIDE + +# Instead of using the default POWER_CYCLE option defined in +# defaults.conf, we use virsh to cycle it. To do so, we destroy +# the guest, wait 5 seconds, and then start it up again. +# Crude, but effective. +# +POWER_CYCLE = ${VMWARE_POWER_CYCLE} + + +DEFAULTS + +# The following files each handle a different test case. +# Having them included allows you to set up more than one machine and share +# the same tests. +INCLUDE include/patchcheck.conf +INCLUDE include/tests.conf +INCLUDE include/bisect.conf +INCLUDE include/min-config.conf From becdd17b5acc79267cf4dba65e07e96e11cc9b57 Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:28 -0700 Subject: [PATCH 0956/1379] ktest: Adding editor hints to improve consistency Emacs and Vi(m) have different styles of dealing with perl syntax which can lead to slightly inconsistent indentation, and makes the code slightly harder to read. Emacs assumes a more perl recommended standard of 4 spaces (1 column) or tab (two column) indentation. Vi(m) tends to favor just normal spaces or tabs depending on what was being used. This gives the basic hinting to Emacs and Vim to do what is expected to be basically consistent. Emacs: - Explicitly flip into perl mode, cperl would require more adjustments Vi(m): - Set softtabs=4 which will flip it over to doing indentation the way you would expect from Emacs Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 18fd4fd117dd..14a753b86445 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -4520,3 +4520,12 @@ if (defined($opt{"LOG_FILE"})) { } exit 0; + +## +# The following are here to standardize tabs/spaces/etc across the most likely editors +### + +# Local Variables: +# mode: perl +# End: +# vim: softtabstop=4 From 12d4cddda2043466a5af8fc0c49e49f24f1d4c59 Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:29 -0700 Subject: [PATCH 0957/1379] ktest: Fixing indentation to match expected pattern This is a followup to "ktest: Adding editor hints to improve consistency" to actually adjust the existing indentation to match the, now, expected pattern (first column 4 spaces, 2nd tab, 3rd tab + 4 spaces, etc). This should, at least help, keep things consistent going forward now. Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 196 +++++++++++++++++------------------ 1 file changed, 97 insertions(+), 99 deletions(-) diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 14a753b86445..633c715173ff 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -760,7 +760,7 @@ sub process_variables { # remove the space added in the beginning $retval =~ s/ //; - return "$retval" + return "$retval"; } sub set_value { @@ -1095,7 +1095,7 @@ sub __read_config { } } } - + if ( ! -r $file ) { die "$name: $.: Can't read file $file\n$_"; } @@ -1186,13 +1186,13 @@ sub __read_config { } sub get_test_case { - print "What test case would you like to run?\n"; - print " (build, install or boot)\n"; - print " Other tests are available but require editing ktest.conf\n"; - print " (see tools/testing/ktest/sample.conf)\n"; - my $ans = ; - chomp $ans; - $default{"TEST_TYPE"} = $ans; + print "What test case would you like to run?\n"; + print " (build, install or boot)\n"; + print " Other tests are available but require editing ktest.conf\n"; + print " (see tools/testing/ktest/sample.conf)\n"; + my $ans = ; + chomp $ans; + $default{"TEST_TYPE"} = $ans; } sub read_config { @@ -1519,13 +1519,13 @@ sub dodie { close O; close L; } - send_email("KTEST: critical failure for test $i [$name]", - "Your test started at $script_start_time has failed with:\n@_\n", $log_file); + send_email("KTEST: critical failure for test $i [$name]", + "Your test started at $script_start_time has failed with:\n@_\n", $log_file); } if ($monitor_cnt) { - # restore terminal settings - system("stty $stty_orig"); + # restore terminal settings + system("stty $stty_orig"); } if (defined($post_test)) { @@ -1709,81 +1709,81 @@ sub wait_for_monitor { } sub save_logs { - my ($result, $basedir) = @_; - my @t = localtime; - my $date = sprintf "%04d%02d%02d%02d%02d%02d", - 1900+$t[5],$t[4],$t[3],$t[2],$t[1],$t[0]; + my ($result, $basedir) = @_; + my @t = localtime; + my $date = sprintf "%04d%02d%02d%02d%02d%02d", + 1900+$t[5],$t[4],$t[3],$t[2],$t[1],$t[0]; - my $type = $build_type; - if ($type =~ /useconfig/) { - $type = "useconfig"; + my $type = $build_type; + if ($type =~ /useconfig/) { + $type = "useconfig"; + } + + my $dir = "$machine-$test_type-$type-$result-$date"; + + $dir = "$basedir/$dir"; + + if (!-d $dir) { + mkpath($dir) or + dodie "can't create $dir"; + } + + my %files = ( + "config" => $output_config, + "buildlog" => $buildlog, + "dmesg" => $dmesg, + "testlog" => $testlog, + ); + + while (my ($name, $source) = each(%files)) { + if (-f "$source") { + cp "$source", "$dir/$name" or + dodie "failed to copy $source"; } + } - my $dir = "$machine-$test_type-$type-$result-$date"; - - $dir = "$basedir/$dir"; - - if (!-d $dir) { - mkpath($dir) or - dodie "can't create $dir"; - } - - my %files = ( - "config" => $output_config, - "buildlog" => $buildlog, - "dmesg" => $dmesg, - "testlog" => $testlog, - ); - - while (my ($name, $source) = each(%files)) { - if (-f "$source") { - cp "$source", "$dir/$name" or - dodie "failed to copy $source"; - } - } - - doprint "*** Saved info to $dir ***\n"; + doprint "*** Saved info to $dir ***\n"; } sub fail { - if ($die_on_failure) { - dodie @_; - } + if ($die_on_failure) { + dodie @_; + } - doprint "FAILED\n"; + doprint "FAILED\n"; - my $i = $iteration; + my $i = $iteration; - # no need to reboot for just building. - if (!do_not_reboot) { - doprint "REBOOTING\n"; - reboot_to_good $sleep_time; - } + # no need to reboot for just building. + if (!do_not_reboot) { + doprint "REBOOTING\n"; + reboot_to_good $sleep_time; + } - my $name = ""; + my $name = ""; - if (defined($test_name)) { - $name = " ($test_name)"; - } + if (defined($test_name)) { + $name = " ($test_name)"; + } - print_times; + print_times; - doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; - doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; - doprint "KTEST RESULT: TEST $i$name Failed: ", @_, "\n"; - doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; - doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; + doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; + doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; + doprint "KTEST RESULT: TEST $i$name Failed: ", @_, "\n"; + doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; + doprint "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n"; - if (defined($store_failures)) { - save_logs "fail", $store_failures; - } + if (defined($store_failures)) { + save_logs "fail", $store_failures; + } - if (defined($post_test)) { - run_command $post_test; - } + if (defined($post_test)) { + run_command $post_test; + } - return 1; + return 1; } sub run_command { @@ -1969,9 +1969,9 @@ sub get_grub_index { $target = '^menuentry.*' . $grub_menu_qt; $skip = '^menuentry\s|^submenu\s'; } elsif ($reboot_type eq "grub2bls") { - $command = $grub_bls_get; - $target = '^title=.*' . $grub_menu_qt; - $skip = '^title='; + $command = $grub_bls_get; + $target = '^title=.*' . $grub_menu_qt; + $skip = '^title='; } else { return; } @@ -2394,7 +2394,7 @@ sub check_buildlog { while () { if (/$check_build_re/) { my $warning = process_warning_line $_; - + $warnings_list{$warning} = 1; } } @@ -2659,7 +2659,7 @@ sub success { doprint "*******************************************\n"; if (defined($store_successes)) { - save_logs "success", $store_successes; + save_logs "success", $store_successes; } if ($i != $opt{"NUM_TESTS"} && !do_not_reboot) { @@ -3246,13 +3246,13 @@ sub run_config_bisect { $ret = run_config_bisect_test $config_bisect_type; if ($ret) { - doprint "NEW GOOD CONFIG ($pass)\n"; + doprint "NEW GOOD CONFIG ($pass)\n"; system("cp $output_config $tmpdir/good_config.tmp.$pass"); $pass++; # Return 3 for good config return 3; } else { - doprint "NEW BAD CONFIG ($pass)\n"; + doprint "NEW BAD CONFIG ($pass)\n"; system("cp $output_config $tmpdir/bad_config.tmp.$pass"); $pass++; # Return 4 for bad config @@ -3371,7 +3371,7 @@ sub config_bisect { } while ($ret == 3 || $ret == 4); if ($ret == 2) { - config_bisect_end "$good_config.tmp", "$bad_config.tmp"; + config_bisect_end "$good_config.tmp", "$bad_config.tmp"; } return $ret if ($ret < 0); @@ -3551,7 +3551,6 @@ sub read_kconfig { my $cont = 0; my $line; - if (! -f $kconfig) { doprint "file $kconfig does not exist, skipping\n"; return; @@ -3660,7 +3659,7 @@ sub read_depends { if (! -f $kconfig && $arch =~ /\d$/) { my $orig = $arch; - # some subarchs have numbers, truncate them + # some subarchs have numbers, truncate them $arch =~ s/\d*$//; $kconfig = "$builddir/arch/$arch/Kconfig"; if (! -f $kconfig) { @@ -3855,7 +3854,7 @@ sub make_min_config { foreach my $config (@config_keys) { my $kconfig = chomp_config $config; if (!defined $depcount{$kconfig}) { - $depcount{$kconfig} = 0; + $depcount{$kconfig} = 0; } } @@ -3957,13 +3956,13 @@ sub make_min_config { my $failed = 0; build "oldconfig" or $failed = 1; if (!$failed) { - start_monitor_and_install or $failed = 1; + start_monitor_and_install or $failed = 1; - if ($type eq "test" && !$failed) { - do_run_test or $failed = 1; - } + if ($type eq "test" && !$failed) { + do_run_test or $failed = 1; + } - end_monitor; + end_monitor; } $in_bisect = 0; @@ -4277,8 +4276,8 @@ sub send_email { sub cancel_test { if ($email_when_canceled) { my $name = get_test_name; - send_email("KTEST: Your [$name] test was cancelled", - "Your test started at $script_start_time was cancelled: sig int"); + send_email("KTEST: Your [$name] test was cancelled", + "Your test started at $script_start_time was cancelled: sig int"); } die "\nCaught Sig Int, test interrupted: $!\n" } @@ -4326,15 +4325,15 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { # The first test may override the PRE_KTEST option if ($i == 1) { - if (defined($pre_ktest)) { - doprint "\n"; - run_command $pre_ktest; - } - if ($email_when_started) { + if (defined($pre_ktest)) { + doprint "\n"; + run_command $pre_ktest; + } + if ($email_when_started) { my $name = get_test_name; - send_email("KTEST: Your [$name] test was started", - "Your test was started on $script_start_time"); - } + send_email("KTEST: Your [$name] test was started", + "Your test was started on $script_start_time"); + } } # Any test can override the POST_KTEST option @@ -4506,12 +4505,11 @@ if ($opt{"POWEROFF_ON_SUCCESS"}) { run_command $switch_to_good; } - doprint "\n $successes of $opt{NUM_TESTS} tests were successful\n\n"; if ($email_when_finished) { send_email("KTEST: Your test has finished!", - "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!"); + "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!"); } if (defined($opt{"LOG_FILE"})) { From c043ccbfc6d83fa21512f842c5d2ba4060cee5fe Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:30 -0700 Subject: [PATCH 0958/1379] ktest: Further consistency cleanups This cleans up some additional whitespace pieces that to be more consistent, as well as moving a curly brace around, and some 'or' statements to match the rest of the file (usually or goes at the end of the line vs. at the beginning) Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 85 +++++++++++++++--------------------- 1 file changed, 36 insertions(+), 49 deletions(-) diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index 633c715173ff..a3c6ad64c479 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -24,7 +24,7 @@ my %evals; #default opts my %default = ( - "MAILER" => "sendmail", # default mailer + "MAILER" => "sendmail", # default mailer "EMAIL_ON_ERROR" => 1, "EMAIL_WHEN_FINISHED" => 1, "EMAIL_WHEN_CANCELED" => 0, @@ -36,15 +36,15 @@ my %default = ( "CLOSE_CONSOLE_SIGNAL" => "INT", "TIMEOUT" => 120, "TMP_DIR" => "/tmp/ktest/\${MACHINE}", - "SLEEP_TIME" => 60, # sleep time between tests + "SLEEP_TIME" => 60, # sleep time between tests "BUILD_NOCLEAN" => 0, "REBOOT_ON_ERROR" => 0, "POWEROFF_ON_ERROR" => 0, "REBOOT_ON_SUCCESS" => 1, "POWEROFF_ON_SUCCESS" => 0, "BUILD_OPTIONS" => "", - "BISECT_SLEEP_TIME" => 60, # sleep time between bisects - "PATCHCHECK_SLEEP_TIME" => 60, # sleep time between patch checks + "BISECT_SLEEP_TIME" => 60, # sleep time between bisects + "PATCHCHECK_SLEEP_TIME" => 60, # sleep time between patch checks "CLEAR_LOG" => 0, "BISECT_MANUAL" => 0, "BISECT_SKIP" => 1, @@ -537,7 +537,7 @@ sub read_prompt { my $ans; for (;;) { - if ($cancel) { + if ($cancel) { print "$prompt [y/n/C] "; } else { print "$prompt [Y/n] "; @@ -863,7 +863,6 @@ sub value_defined { defined($opt{$2}); } -my $d = 0; sub process_expression { my ($name, $val) = @_; @@ -978,7 +977,6 @@ sub __read_config { $override = 0; if ($type eq "TEST_START") { - if ($num_tests_set) { die "$name: $.: Can not specify both NUM_TESTS and TEST_START\n"; } @@ -1048,7 +1046,6 @@ sub __read_config { $test_num = $old_test_num; $repeat = $old_repeat; } - } elsif (/^\s*ELSE\b(.*)$/) { if (!$if) { die "$name: $.: ELSE found with out matching IF section\n$_"; @@ -1471,7 +1468,6 @@ sub get_test_name() { } sub dodie { - # avoid recursion return if ($in_die); $in_die = 1; @@ -1481,10 +1477,8 @@ sub dodie { doprint "CRITICAL FAILURE... [TEST $i] ", @_, "\n"; if ($reboot_on_error && !do_not_reboot) { - doprint "REBOOTING\n"; reboot_to_good; - } elsif ($poweroff_on_error && defined($power_off)) { doprint "POWERING OFF\n"; `$power_off`; @@ -1519,8 +1513,9 @@ sub dodie { close O; close L; } + send_email("KTEST: critical failure for test $i [$name]", - "Your test started at $script_start_time has failed with:\n@_\n", $log_file); + "Your test started at $script_start_time has failed with:\n@_\n", $log_file); } if ($monitor_cnt) { @@ -1915,8 +1910,8 @@ sub _get_grub_index { my ($command, $target, $skip) = @_; return if (defined($grub_number) && defined($last_grub_menu) && - $last_grub_menu eq $grub_menu && defined($last_machine) && - $last_machine eq $machine); + $last_grub_menu eq $grub_menu && defined($last_machine) && + $last_machine eq $machine); doprint "Find $reboot_type menu ... "; $grub_number = -1; @@ -1924,8 +1919,8 @@ sub _get_grub_index { my $ssh_grub = $ssh_exec; $ssh_grub =~ s,\$SSH_COMMAND,$command,g; - open(IN, "$ssh_grub |") - or dodie "unable to execute $command"; + open(IN, "$ssh_grub |") or + dodie "unable to execute $command"; my $found = 0; @@ -1979,8 +1974,7 @@ sub get_grub_index { _get_grub_index($command, $target, $skip); } -sub wait_for_input -{ +sub wait_for_input { my ($fp, $time) = @_; my $start_time; my $rin; @@ -2096,7 +2090,6 @@ sub monitor { my $version_found = 0; while (!$done) { - if ($bug && defined($stop_after_failure) && $stop_after_failure >= 0) { my $time = $stop_after_failure - (time - $failure_start); @@ -2571,7 +2564,6 @@ sub build { run_command "mv $outputdir/config_temp $output_config" or dodie "moving config_temp"; } - } elsif (!$noclean) { unlink "$output_config"; run_command "$make mrproper" or @@ -2652,11 +2644,12 @@ sub success { print_times; - doprint "\n\n*******************************************\n"; - doprint "*******************************************\n"; - doprint "KTEST RESULT: TEST $i$name SUCCESS!!!! **\n"; - doprint "*******************************************\n"; - doprint "*******************************************\n"; + doprint "\n\n"; + doprint "*******************************************\n"; + doprint "*******************************************\n"; + doprint "KTEST RESULT: TEST $i$name SUCCESS!!!! **\n"; + doprint "*******************************************\n"; + doprint "*******************************************\n"; if (defined($store_successes)) { save_logs "success", $store_successes; @@ -3034,7 +3027,6 @@ sub bisect { } if ($do_check) { - # get current HEAD my $head = get_sha1("HEAD"); @@ -3074,13 +3066,11 @@ sub bisect { run_command "git bisect replay $replay" or dodie "failed to run replay"; } else { - run_command "git bisect good $good" or dodie "could not set bisect good to $good"; run_git_bisect "git bisect bad $bad" or dodie "could not set bisect bad to $bad"; - } if (defined($start)) { @@ -3133,8 +3123,8 @@ sub assign_configs { doprint "Reading configs from $config\n"; - open (IN, $config) - or dodie "Failed to read $config"; + open (IN, $config) or + dodie "Failed to read $config"; while () { chomp; @@ -3287,10 +3277,11 @@ sub config_bisect { if (!defined($config_bisect_exec)) { # First check the location that ktest.pl ran - my @locations = ( "$pwd/config-bisect.pl", - "$dirname/config-bisect.pl", - "$builddir/tools/testing/ktest/config-bisect.pl", - undef ); + my @locations = ( + "$pwd/config-bisect.pl", + "$dirname/config-bisect.pl", + "$builddir/tools/testing/ktest/config-bisect.pl", + undef ); foreach my $loc (@locations) { doprint "loc = $loc\n"; $config_bisect_exec = $loc; @@ -3632,8 +3623,8 @@ sub read_kconfig { sub read_depends { # find out which arch this is by the kconfig file - open (IN, $output_config) - or dodie "Failed to read $output_config"; + open (IN, $output_config) or + dodie "Failed to read $output_config"; my $arch; while () { if (m,Linux/(\S+)\s+\S+\s+Kernel Configuration,) { @@ -3708,7 +3699,6 @@ sub get_depends { my @configs; while ($dep =~ /[$valid]/) { - if ($dep =~ /^[^$valid]*([$valid]+)/) { my $conf = "CONFIG_" . $1; @@ -3889,7 +3879,6 @@ sub make_min_config { my $take_two = 0; while (!$done) { - my $config; my $found; @@ -3900,7 +3889,7 @@ sub make_min_config { # Sort keys by who is most dependent on @test_configs = sort { $depcount{chomp_config($b)} <=> $depcount{chomp_config($a)} } - @test_configs ; + @test_configs ; # Put configs that did not modify the config at the end. my $reset = 1; @@ -3976,8 +3965,8 @@ sub make_min_config { # update new ignore configs if (defined($ignore_config)) { - open (OUT, ">$temp_config") - or dodie "Can't write to $temp_config"; + open (OUT, ">$temp_config") or + dodie "Can't write to $temp_config"; foreach my $config (keys %save_configs) { print OUT "$save_configs{$config}\n"; } @@ -4004,8 +3993,8 @@ sub make_min_config { } # Save off all the current mandatory configs - open (OUT, ">$temp_config") - or dodie "Can't write to $temp_config"; + open (OUT, ">$temp_config") or + dodie "Can't write to $temp_config"; foreach my $config (keys %keep_configs) { print OUT "$keep_configs{$config}\n"; } @@ -4043,7 +4032,6 @@ sub make_warnings_file { open(IN, $buildlog) or dodie "Can't open $buildlog"; while () { - # Some compilers use UTF-8 extended for quotes # for distcc heterogeneous systems, this causes issues s/$utf8_quote/'/g; @@ -4263,7 +4251,6 @@ sub do_send_mail { } sub send_email { - if (defined($mailto)) { if (!defined($mailer)) { doprint "No email sent: email or mailer not specified in config.\n"; @@ -4277,7 +4264,7 @@ sub cancel_test { if ($email_when_canceled) { my $name = get_test_name; send_email("KTEST: Your [$name] test was cancelled", - "Your test started at $script_start_time was cancelled: sig int"); + "Your test started at $script_start_time was cancelled: sig int"); } die "\nCaught Sig Int, test interrupted: $!\n" } @@ -4332,7 +4319,7 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { if ($email_when_started) { my $name = get_test_name; send_email("KTEST: Your [$name] test was started", - "Your test was started on $script_start_time"); + "Your test was started on $script_start_time"); } } @@ -4411,7 +4398,7 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) { my $ret = run_command $pre_test; if (!$ret && defined($pre_test_die) && $pre_test_die) { - dodie "failed to pre_test\n"; + dodie "failed to pre_test\n"; } } @@ -4509,7 +4496,7 @@ doprint "\n $successes of $opt{NUM_TESTS} tests were successful\n\n"; if ($email_when_finished) { send_email("KTEST: Your test has finished!", - "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!"); + "$successes of $opt{NUM_TESTS} tests started at $script_start_time were successful!"); } if (defined($opt{"LOG_FILE"})) { From 6a0f3652952c7bba83af66c115a311d4a2164ebb Mon Sep 17 00:00:00 2001 From: "John 'Warthog9' Hawley (VMware)" Date: Mon, 19 Apr 2021 17:29:31 -0700 Subject: [PATCH 0959/1379] ktest: Re-arrange the code blocks for better discoverability Perl, as with most scripting languages, is fairly flexible in how / where you can define things, and it will (for the most part) do what you would expect it to do. This however can lead to situations, like with ktest, where things get muddled over time. This pushes the variable definitions back up to the top, followed by functions, with the main script executables down at the bottom, INSTEAD of being somewhat mish-mashed together in certain places. This mostly has the advantage of making it more obvious where things are initially defined, what functions are there, and ACTUALLY where the main script starts executing, and should make this a little more approachable. Signed-off-by: John 'Warthog9' Hawley (VMware) Signed-off-by: Steven Rostedt (VMware) --- tools/testing/ktest/ktest.pl | 296 ++++++++++++++++++----------------- 1 file changed, 154 insertions(+), 142 deletions(-) diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index a3c6ad64c479..09d1578f9d66 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -512,6 +512,69 @@ $config_help{"REBOOT_SCRIPT"} = << "EOF" EOF ; +# used with process_expression() +my $d = 0; + +# defined before get_test_name() +my $in_die = 0; + +# defined before process_warning_line() +my $check_build_re = ".*:.*(warning|error|Error):.*"; +my $utf8_quote = "\\x{e2}\\x{80}(\\x{98}|\\x{99})"; + +# defined before child_finished() +my $child_done; + +# config_ignore holds the configs that were set (or unset) for +# a good config and we will ignore these configs for the rest +# of a config bisect. These configs stay as they were. +my %config_ignore; + +# config_set holds what all configs were set as. +my %config_set; + +# config_off holds the set of configs that the bad config had disabled. +# We need to record them and set them in the .config when running +# olddefconfig, because olddefconfig keeps the defaults. +my %config_off; + +# config_off_tmp holds a set of configs to turn off for now +my @config_off_tmp; + +# config_list is the set of configs that are being tested +my %config_list; +my %null_config; + +my %dependency; + +# found above run_config_bisect() +my $pass = 1; + +# found above add_dep() + +my %depends; +my %depcount; +my $iflevel = 0; +my @ifdeps; + +# prevent recursion +my %read_kconfigs; + +# found above test_this_config() +my %min_configs; +my %keep_configs; +my %save_configs; +my %processed_configs; +my %nochange_config; + +# +# These are first defined here, main function later on +# +sub run_command; +sub start_monitor; +sub end_monitor; +sub wait_for_monitor; + sub _logit { if (defined($opt{"LOG_FILE"})) { print LOG @_; @@ -1365,11 +1428,6 @@ sub eval_option { return $option; } -sub run_command; -sub start_monitor; -sub end_monitor; -sub wait_for_monitor; - sub reboot { my ($time) = @_; my $powercycle = 0; @@ -1454,8 +1512,6 @@ sub do_not_reboot { ($test_type eq "config_bisect" && $opt{"CONFIG_BISECT_TYPE[$i]"} eq "build"); } -my $in_die = 0; - sub get_test_name() { my $name; @@ -2342,9 +2398,6 @@ sub start_monitor_and_install { return monitor; } -my $check_build_re = ".*:.*(warning|error|Error):.*"; -my $utf8_quote = "\\x{e2}\\x{80}(\\x{98}|\\x{99})"; - sub process_warning_line { my ($line) = @_; @@ -2694,8 +2747,6 @@ sub child_run_test { exit $run_command_status; } -my $child_done; - sub child_finished { $child_done = 1; } @@ -3096,28 +3147,6 @@ sub bisect { success $i; } -# config_ignore holds the configs that were set (or unset) for -# a good config and we will ignore these configs for the rest -# of a config bisect. These configs stay as they were. -my %config_ignore; - -# config_set holds what all configs were set as. -my %config_set; - -# config_off holds the set of configs that the bad config had disabled. -# We need to record them and set them in the .config when running -# olddefconfig, because olddefconfig keeps the defaults. -my %config_off; - -# config_off_tmp holds a set of configs to turn off for now -my @config_off_tmp; - -# config_list is the set of configs that are being tested -my %config_list; -my %null_config; - -my %dependency; - sub assign_configs { my ($hash, $config) = @_; @@ -3212,8 +3241,6 @@ sub config_bisect_end { doprint "***************************************\n\n"; } -my $pass = 1; - sub run_config_bisect { my ($good, $bad, $last_result) = @_; my $reset = ""; @@ -3505,14 +3532,6 @@ sub patchcheck { return 1; } -my %depends; -my %depcount; -my $iflevel = 0; -my @ifdeps; - -# prevent recursion -my %read_kconfigs; - sub add_dep { # $config depends on $dep my ($config, $dep) = @_; @@ -3713,12 +3732,6 @@ sub get_depends { return @configs; } -my %min_configs; -my %keep_configs; -my %save_configs; -my %processed_configs; -my %nochange_config; - sub test_this_config { my ($config) = @_; @@ -4047,98 +4060,6 @@ sub make_warnings_file { success $i; } -$#ARGV < 1 or die "ktest.pl version: $VERSION\n usage: ktest.pl [config-file]\n"; - -if ($#ARGV == 0) { - $ktest_config = $ARGV[0]; - if (! -f $ktest_config) { - print "$ktest_config does not exist.\n"; - if (!read_yn "Create it?") { - exit 0; - } - } -} - -if (! -f $ktest_config) { - $newconfig = 1; - get_test_case; - open(OUT, ">$ktest_config") or die "Can not create $ktest_config"; - print OUT << "EOF" -# Generated by ktest.pl -# - -# PWD is a ktest.pl variable that will result in the process working -# directory that ktest.pl is executed in. - -# THIS_DIR is automatically assigned the PWD of the path that generated -# the config file. It is best to use this variable when assigning other -# directory paths within this directory. This allows you to easily -# move the test cases to other locations or to other machines. -# -THIS_DIR := $variable{"PWD"} - -# Define each test with TEST_START -# The config options below it will override the defaults -TEST_START -TEST_TYPE = $default{"TEST_TYPE"} - -DEFAULTS -EOF -; - close(OUT); -} -read_config $ktest_config; - -if (defined($opt{"LOG_FILE"})) { - $opt{"LOG_FILE"} = eval_option("LOG_FILE", $opt{"LOG_FILE"}, -1); -} - -# Append any configs entered in manually to the config file. -my @new_configs = keys %entered_configs; -if ($#new_configs >= 0) { - print "\nAppending entered in configs to $ktest_config\n"; - open(OUT, ">>$ktest_config") or die "Can not append to $ktest_config"; - foreach my $config (@new_configs) { - print OUT "$config = $entered_configs{$config}\n"; - $opt{$config} = process_variables($entered_configs{$config}); - } -} - -if (defined($opt{"LOG_FILE"})) { - if ($opt{"CLEAR_LOG"}) { - unlink $opt{"LOG_FILE"}; - } - open(LOG, ">> $opt{LOG_FILE}") or die "Can't write to $opt{LOG_FILE}"; - LOG->autoflush(1); -} - -doprint "\n\nSTARTING AUTOMATED TESTS\n\n"; - -for (my $i = 0, my $repeat = 1; $i <= $opt{"NUM_TESTS"}; $i += $repeat) { - - if (!$i) { - doprint "DEFAULT OPTIONS:\n"; - } else { - doprint "\nTEST $i OPTIONS"; - if (defined($repeat_tests{$i})) { - $repeat = $repeat_tests{$i}; - doprint " ITERATE $repeat"; - } - doprint "\n"; - } - - foreach my $option (sort keys %opt) { - - if ($option =~ /\[(\d+)\]$/) { - next if ($i != $1); - } else { - next if ($i); - } - - doprint "$option = $opt{$option}\n"; - } -} - sub option_defined { my ($option) = @_; @@ -4269,6 +4190,97 @@ sub cancel_test { die "\nCaught Sig Int, test interrupted: $!\n" } +$#ARGV < 1 or die "ktest.pl version: $VERSION\n usage: ktest.pl [config-file]\n"; + +if ($#ARGV == 0) { + $ktest_config = $ARGV[0]; + if (! -f $ktest_config) { + print "$ktest_config does not exist.\n"; + if (!read_yn "Create it?") { + exit 0; + } + } +} + +if (! -f $ktest_config) { + $newconfig = 1; + get_test_case; + open(OUT, ">$ktest_config") or die "Can not create $ktest_config"; + print OUT << "EOF" +# Generated by ktest.pl +# + +# PWD is a ktest.pl variable that will result in the process working +# directory that ktest.pl is executed in. + +# THIS_DIR is automatically assigned the PWD of the path that generated +# the config file. It is best to use this variable when assigning other +# directory paths within this directory. This allows you to easily +# move the test cases to other locations or to other machines. +# +THIS_DIR := $variable{"PWD"} + +# Define each test with TEST_START +# The config options below it will override the defaults +TEST_START +TEST_TYPE = $default{"TEST_TYPE"} + +DEFAULTS +EOF +; + close(OUT); +} +read_config $ktest_config; + +if (defined($opt{"LOG_FILE"})) { + $opt{"LOG_FILE"} = eval_option("LOG_FILE", $opt{"LOG_FILE"}, -1); +} + +# Append any configs entered in manually to the config file. +my @new_configs = keys %entered_configs; +if ($#new_configs >= 0) { + print "\nAppending entered in configs to $ktest_config\n"; + open(OUT, ">>$ktest_config") or die "Can not append to $ktest_config"; + foreach my $config (@new_configs) { + print OUT "$config = $entered_configs{$config}\n"; + $opt{$config} = process_variables($entered_configs{$config}); + } +} + +if (defined($opt{"LOG_FILE"})) { + if ($opt{"CLEAR_LOG"}) { + unlink $opt{"LOG_FILE"}; + } + open(LOG, ">> $opt{LOG_FILE}") or die "Can't write to $opt{LOG_FILE}"; + LOG->autoflush(1); +} + +doprint "\n\nSTARTING AUTOMATED TESTS\n\n"; + +for (my $i = 0, my $repeat = 1; $i <= $opt{"NUM_TESTS"}; $i += $repeat) { + + if (!$i) { + doprint "DEFAULT OPTIONS:\n"; + } else { + doprint "\nTEST $i OPTIONS"; + if (defined($repeat_tests{$i})) { + $repeat = $repeat_tests{$i}; + doprint " ITERATE $repeat"; + } + doprint "\n"; + } + + foreach my $option (sort keys %opt) { + if ($option =~ /\[(\d+)\]$/) { + next if ($i != $1); + } else { + next if ($i); + } + + doprint "$option = $opt{$option}\n"; + } +} + $SIG{INT} = qw(cancel_test); # First we need to do is the builds From c7ceee6958770c447b86a8917a603a20d646b608 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 3 May 2021 18:51:54 -0400 Subject: [PATCH 0960/1379] ktest: Add KTEST section to MAINTAINERS file As I wanted to add John Hawley as a co-maintainer for ktest, I found that there never was a KTEST section in the MAINTAINERS file. Add one! Signed-off-by: Steven Rostedt (VMware) --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index d92f85ca831d..3539d76232de 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9976,6 +9976,12 @@ S: Maintained F: Documentation/devicetree/bindings/leds/backlight/kinetic,ktd253.yaml F: drivers/video/backlight/ktd253-backlight.c +KTEST +M: Steven Rostedt +M: John Hawley +S: Maintained +F: tools/testing/ktest + L3MDEV M: David Ahern L: netdev@vger.kernel.org From 2fa4928aed4c10bb9d1906b8bb606e6212d91dd2 Mon Sep 17 00:00:00 2001 From: Anatoly Pugachev Date: Wed, 28 Apr 2021 13:48:51 +0300 Subject: [PATCH 0961/1379] docs: correct URL to bios and kernel developer's guide correct URL to bios and kernel developer's guide on amd.com site Signed-off-by: Anatoly Pugachev Link: https://lore.kernel.org/r/20210428104851.GA10572@u164.east.ru [jc: fixed resulting sphinx warning] Signed-off-by: Jonathan Corbet --- Documentation/ABI/testing/sysfs-devices-system-cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 0eee30b27ab6..fe13baa53c59 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -285,7 +285,7 @@ Description: Disable L3 cache indices All AMD processors with L3 caches provide this functionality. For details, see BKDGs at - http://developer.amd.com/documentation/guides/Pages/default.aspx + https://www.amd.com/en/support/tech-docs?keyword=bios+kernel What: /sys/devices/system/cpu/cpufreq/boost From 0043f0b27a0406730caef61068703fcacd9c2166 Mon Sep 17 00:00:00 2001 From: Thorsten Leemhuis Date: Thu, 15 Apr 2021 12:29:14 +0200 Subject: [PATCH 0962/1379] docs: reporting-issues.rst: CC subsystem and maintainers on regressions When reporting a regression, users ideally should CC the subsystem and its maintainers, as that will ensure they get aware of the regression quickly. And if the culprit is known, they should also CC everyone who signed if off; the text mentioned the latter in once place already, but forgot to do so in two other areas where it's relevant. While fixing this also remind readers to check the mailing list archives for issues that need to be reported to a bug tracker, as someone might have reported it by mail only. All of this got triggered by a recent report where these changes would have made a difference. Signed-off-by: Thorsten Leemhuis Link: https://lore.kernel.org/lkml/dff6badf-58f5-98c8-871c-94d901ac6919@leemhuis.info/ Link: https://lore.kernel.org/lkml/CAJZ5v0hX2StQVttAciHYH-urUH+Hi92z9z2ZbcNgQPt0E2Jpwg@mail.gmail.com/ Link: https://lore.kernel.org/r/dd13f10c30e79e550215e53a8103406daec4e593.1618482489.git.linux@leemhuis.info Signed-off-by: Jonathan Corbet --- .../admin-guide/reporting-issues.rst | 49 ++++++++++++------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/Documentation/admin-guide/reporting-issues.rst b/Documentation/admin-guide/reporting-issues.rst index 48b4d0ef2b09..18d8e25ba9df 100644 --- a/Documentation/admin-guide/reporting-issues.rst +++ b/Documentation/admin-guide/reporting-issues.rst @@ -24,7 +24,8 @@ longterm series? One still supported? Then search the `LKML you don't find any, install `the latest release from that series `_. If it still shows the issue, report it to the stable mailing list (stable@vger.kernel.org) and CC the regressions list -(regressions@lists.linux.dev). +(regressions@lists.linux.dev); ideally also CC the maintainer and the mailing +list for the subsystem in question. In all other cases try your best guess which kernel part might be causing the issue. Check the :ref:`MAINTAINERS ` file for how its developers @@ -48,8 +49,9 @@ before the issue occurs. If you are facing multiple issues with the Linux kernel at once, report each separately. While writing your report, include all information relevant to the issue, like the kernel and the distro used. In case of a regression, CC the -regressions mailing list (regressions@lists.linux.dev) to your report; also try -to include the commit-id of the change causing it, which a bisection can find. +regressions mailing list (regressions@lists.linux.dev) to your report. Also try +to pin-point the culprit with a bisection; if you succeed, include its +commit-id and CC everyone in the sign-off-by chain. Once the report is out, answer any questions that come up and help where you can. That includes keeping the ball rolling by occasionally retesting with newer @@ -198,10 +200,11 @@ report them: * Send a short problem report to the Linux stable mailing list (stable@vger.kernel.org) and CC the Linux regressions mailing list - (regressions@lists.linux.dev). Roughly describe the issue and ideally - explain how to reproduce it. Mention the first version that shows the - problem and the last version that's working fine. Then wait for further - instructions. + (regressions@lists.linux.dev); if you suspect the cause in a particular + subsystem, CC its maintainer and its mailing list. Roughly describe the + issue and ideally explain how to reproduce it. Mention the first version + that shows the problem and the last version that's working fine. Then + wait for further instructions. The reference section below explains each of these steps in more detail. @@ -768,7 +771,9 @@ regular internet search engine and add something like the results to the archives at that URL. It's also wise to check the internet, LKML and maybe bugzilla.kernel.org again -at this point. +at this point. If your report needs to be filed in a bug tracker, you may want +to check the mailing list archives for the subsystem as well, as someone might +have reported it only there. For details how to search and what to do if you find matching reports see "Search for existing reports, first run" above. @@ -1249,9 +1254,10 @@ and the oldest where the issue occurs (say 5.8-rc1). When sending the report by mail, CC the Linux regressions mailing list (regressions@lists.linux.dev). In case the report needs to be filed to some web -tracker, proceed to do so; once filed, forward the report by mail to the -regressions list. Make sure to inline the forwarded report, hence do not attach -it. Also add a short note at the top where you mention the URL to the ticket. +tracker, proceed to do so. Once filed, forward the report by mail to the +regressions list; CC the maintainer and the mailing list for the subsystem in +question. Make sure to inline the forwarded report, hence do not attach it. +Also add a short note at the top where you mention the URL to the ticket. When mailing or forwarding the report, in case of a successful bisection add the author of the culprit to the recipients; also CC everyone in the signed-off-by @@ -1536,17 +1542,20 @@ Report the regression *Send a short problem report to the Linux stable mailing list (stable@vger.kernel.org) and CC the Linux regressions mailing list - (regressions@lists.linux.dev). Roughly describe the issue and ideally - explain how to reproduce it. Mention the first version that shows the - problem and the last version that's working fine. Then wait for further - instructions.* + (regressions@lists.linux.dev); if you suspect the cause in a particular + subsystem, CC its maintainer and its mailing list. Roughly describe the + issue and ideally explain how to reproduce it. Mention the first version + that shows the problem and the last version that's working fine. Then + wait for further instructions.* When reporting a regression that happens within a stable or longterm kernel line (say when updating from 5.10.4 to 5.10.5) a brief report is enough for -the start to get the issue reported quickly. Hence a rough description is all -it takes. +the start to get the issue reported quickly. Hence a rough description to the +stable and regressions mailing list is all it takes; but in case you suspect +the cause in a particular subsystem, CC its maintainers and its mailing list +as well, because that will speed things up. -But note, it helps developers a great deal if you can specify the exact version +And note, it helps developers a great deal if you can specify the exact version that introduced the problem. Hence if possible within a reasonable time frame, try to find that version using vanilla kernels. Lets assume something broke when your distributor released a update from Linux kernel 5.10.5 to 5.10.8. Then as @@ -1563,7 +1572,9 @@ pinpoint the exact change that causes the issue (which then can easily get reverted to fix the issue quickly). Hence consider to do a proper bisection right away if time permits. See the section 'Special care for regressions' and the document 'Documentation/admin-guide/bug-bisect.rst' for details how to -perform one. +perform one. In case of a successful bisection add the author of the culprit to +the recipients; also CC everyone in the signed-off-by chain, which you find at +the end of its commit message. Reference for "Reporting issues only occurring in older kernel version lines" From f5169f713e0c02333e770c9045a00fa54ac98220 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 23 Apr 2021 18:32:59 +0530 Subject: [PATCH 0963/1379] Removed the oprofiled version option Removed the oprofiled version option Signed-off-by: Bhaskar Chowdhury Link: https://lore.kernel.org/r/c98fa38b74bdd8ab16d35862895dac5f5a535f94.1619181632.git.unixbhaskar@gmail.com Signed-off-by: Jonathan Corbet --- scripts/ver_linux | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/ver_linux b/scripts/ver_linux index a92acc703f9b..1a8ee4ff0e32 100755 --- a/scripts/ver_linux +++ b/scripts/ver_linux @@ -47,7 +47,6 @@ BEGIN { printversion("Net-tools", version("ifconfig --version")) printversion("Kbd", version("loadkeys -V")) printversion("Console-tools", version("loadkeys -V")) - printversion("Oprofile", version("oprofiled --version")) printversion("Sh-utils", version("expr --v")) printversion("Udev", version("udevadm --version")) printversion("Wireless-tools", version("iwconfig --version")) From 8e9fa2f21151f48c0fc3c53876d4564752fd9fbd Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 23 Apr 2021 18:33:00 +0530 Subject: [PATCH 0964/1379] oprofiled version output line removed from the list Oprofiled version output line removed from the list. Signed-off-by: Bhaskar Chowdhury Link: https://lore.kernel.org/r/8d1928ff2fea29d67143d235839a5e845e4402c9.1619181632.git.unixbhaskar@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/process/changes.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index dac17711dc11..d3a8557b66a1 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -48,7 +48,6 @@ quota-tools 3.09 quota -V PPP 2.4.0 pppd --version nfs-utils 1.0.5 showmount --version procps 3.2.0 ps --version -oprofile 0.9 oprofiled --version udev 081 udevd --version grub 0.93 grub --version || grub-install --version mcelog 0.6 mcelog --version From 7fc4607899e87259bb751ccdbe53628aa467ec22 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Fri, 23 Apr 2021 18:33:01 +0530 Subject: [PATCH 0965/1379] Enlisted oprofile version line removed Enlisted oprofile version line removed. Signed-off-by: Bhaskar Chowdhury Link: https://lore.kernel.org/r/35c4436f0f1b3072d3016148ce1461905b6f782b.1619181632.git.unixbhaskar@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/translations/it_IT/process/changes.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/translations/it_IT/process/changes.rst b/Documentation/translations/it_IT/process/changes.rst index cc883f8d96c4..87d081889bfc 100644 --- a/Documentation/translations/it_IT/process/changes.rst +++ b/Documentation/translations/it_IT/process/changes.rst @@ -51,7 +51,6 @@ quota-tools 3.09 quota -V PPP 2.4.0 pppd --version nfs-utils 1.0.5 showmount --version procps 3.2.0 ps --version -oprofile 0.9 oprofiled --version udev 081 udevd --version grub 0.93 grub --version || grub-install --version mcelog 0.6 mcelog --version From 970aa72c4dd37645ceb7dd15515d9502c4c56aa1 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 29 Apr 2021 14:47:56 +0200 Subject: [PATCH 0966/1379] dt-bindings: display: renesas,du: Add missing power-domains property "make dtbs_check" complains: arch/arm/boot/dts/r8a7779-marzen.dt.yaml: display@fff80000: 'power-domains' does not match any of the regexes: 'pinctrl-[0-9]+' arch/arm64/boot/dts/renesas/r8a77970-v3msk.dt.yaml: display@feb00000: 'power-domains' does not match any of the regexes: 'pinctrl-[0-9]+' arch/arm64/boot/dts/renesas/r8a77970-eagle.dt.yaml: display@feb00000: 'power-domains' does not match any of the regexes: 'pinctrl-[0-9]+' arch/arm64/boot/dts/renesas/r8a77980-condor.dt.yaml: display@feb00000: 'power-domains' does not match any of the regexes: 'pinctrl-[0-9]+' arch/arm64/boot/dts/renesas/r8a77980-v3hsk.dt.yaml: display@feb00000: 'power-domains' does not match any of the regexes: 'pinctrl-[0-9]+' Fix this by documenting the power-domains property. Fixes: 99d66127fad25ebb ("dt-bindings: display: renesas,du: Convert binding to YAML") Signed-off-by: Geert Uytterhoeven Reviewed-by: Laurent Pinchart Link: https://lore.kernel.org/r/600d42256515f180bc84b72e8bdb5c5d9126ab62.1619700459.git.geert+renesas@glider.be Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/display/renesas,du.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/devicetree/bindings/display/renesas,du.yaml b/Documentation/devicetree/bindings/display/renesas,du.yaml index 552a99ce4f12..121596f106da 100644 --- a/Documentation/devicetree/bindings/display/renesas,du.yaml +++ b/Documentation/devicetree/bindings/display/renesas,du.yaml @@ -51,6 +51,9 @@ properties: resets: true reset-names: true + power-domains: + maxItems: 1 + ports: $ref: /schemas/graph.yaml#/properties/port description: | From 8abddd968a303db75e4debe77a3df484164f1f33 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 3 May 2021 19:17:55 +1000 Subject: [PATCH 0967/1379] powerpc/64s/radix: Enable huge vmalloc mappings This reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%, due to vfs hashes being allocated with 2MB pages. Signed-off-by: Nicholas Piggin Reviewed-by: Christophe Leroy Acked-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210503091755.613393-1-npiggin@gmail.com --- .../admin-guide/kernel-parameters.txt | 2 ++ arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/module.c | 18 +++++++++++++----- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 02470ba1fe6a..1fcec80b099e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3251,6 +3251,8 @@ nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings. + nohugevmalloc [PPC] Disable kernel huge vmalloc mappings. + nosmt [KNL,S390] Disable symmetric multithreading (SMT). Equivalent to smt=1. diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1e6230bea09d..c547a9d6a2dd 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -185,6 +185,7 @@ config PPC select GENERIC_VDSO_TIME_NS select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if PPC_BOOK3S_64 && PPC_RADIX_MMU + select HAVE_ARCH_HUGE_VMALLOC if HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if PPC32 && PPC_PAGE_SHIFT <= 14 diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index fab84024650c..3f35c8d20be7 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -88,17 +89,22 @@ int module_finalize(const Elf_Ehdr *hdr, return 0; } -#ifdef MODULES_VADDR static __always_inline void * __module_alloc(unsigned long size, unsigned long start, unsigned long end) { - return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL, - PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, NUMA_NO_NODE, - __builtin_return_address(0)); + /* + * Don't do huge page allocations for modules yet until more testing + * is done. STRICT_MODULE_RWX may require extra work to support this + * too. + */ + return __vmalloc_node_range(size, 1, start, end, GFP_KERNEL, PAGE_KERNEL_EXEC, + VM_FLUSH_RESET_PERMS | VM_NO_HUGE_VMAP, + NUMA_NO_NODE, __builtin_return_address(0)); } void *module_alloc(unsigned long size) { +#ifdef MODULES_VADDR unsigned long limit = (unsigned long)_etext - SZ_32M; void *ptr = NULL; @@ -112,5 +118,7 @@ void *module_alloc(unsigned long size) ptr = __module_alloc(size, MODULES_VADDR, MODULES_END); return ptr; -} +#else + return __module_alloc(size, VMALLOC_START, VMALLOC_END); #endif +} From 48145b62563a9ae1ad631d6b576c6b9a798fcbec Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Thu, 22 Apr 2021 17:04:07 +0900 Subject: [PATCH 0968/1379] nvme: fix controller ioctl through ns_head In multipath case, we should consider namespace attachment with controllers in a subsystem when we find out the live controller for the namespace. This patch manually reverted the commit 3557a4409701 ("nvme: don't bother to look up a namespace for controller ioctls") with few more updates to nvme_ns_head_chr_ioctl which has been newly updated. Fixes: 3557a4409701 ("nvme: don't bother to look up a namespace for controller ioctls") Cc: Christoph Hellwig Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 22 ------------- drivers/nvme/host/ioctl.c | 65 ++++++++++++++++++++++++--------------- drivers/nvme/host/nvme.h | 1 - 3 files changed, 41 insertions(+), 47 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b6f7815fa239..c1c196459d79 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1999,28 +1999,6 @@ static const struct block_device_operations nvme_bdev_ops = { .pr_ops = &nvme_pr_ops, }; -#ifdef CONFIG_NVME_MULTIPATH -struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys) -{ - struct nvme_ctrl *ctrl; - int ret; - - ret = mutex_lock_killable(&nvme_subsystems_lock); - if (ret) - return ERR_PTR(ret); - list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { - if (ctrl->state == NVME_CTRL_LIVE) - goto found; - } - mutex_unlock(&nvme_subsystems_lock); - return ERR_PTR(-EWOULDBLOCK); -found: - nvme_get_ctrl(ctrl); - mutex_unlock(&nvme_subsystems_lock); - return ctrl; -} -#endif /* CONFIG_NVME_MULTIPATH */ - static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) { unsigned long timeout = diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 502f8e4a2a1f..9557ead02de1 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -370,41 +370,45 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } #ifdef CONFIG_NVME_MULTIPATH -static int nvme_ns_head_ctrl_ioctl(struct nvme_ns_head *head, - unsigned int cmd, void __user *argp) +static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp, struct nvme_ns_head *head, int srcu_idx) { - struct nvme_ctrl *ctrl = nvme_find_get_live_ctrl(head->subsys); + struct nvme_ctrl *ctrl = ns->ctrl; int ret; - if (IS_ERR(ctrl)) - return PTR_ERR(ctrl); - ret = nvme_ctrl_ioctl(ctrl, cmd, argp); + nvme_get_ctrl(ns->ctrl); + nvme_put_ns_from_disk(head, srcu_idx); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp); + nvme_put_ctrl(ctrl); return ret; } -static int nvme_ns_head_ns_ioctl(struct nvme_ns_head *head, - unsigned int cmd, void __user *argp) -{ - int srcu_idx = srcu_read_lock(&head->srcu); - struct nvme_ns *ns = nvme_find_path(head); - int ret = -EWOULDBLOCK; - - if (ns) - ret = nvme_ns_ioctl(ns, cmd, argp); - srcu_read_unlock(&head->srcu, srcu_idx); - return ret; -} - int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nvme_ns_head *head = bdev->bd_disk->private_data; + struct nvme_ns_head *head = NULL; void __user *argp = (void __user *)arg; + struct nvme_ns *ns; + int srcu_idx, ret; + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + return -EWOULDBLOCK; + + /* + * Handle ioctls that apply to the controller instead of the namespace + * seperately and drop the ns SRCU reference early. This avoids a + * deadlock when deleting namespaces using the passthrough interface. + */ if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(head, cmd, argp); - return nvme_ns_head_ns_ioctl(head, cmd, argp); + ret = nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + else { + ret = nvme_ns_ioctl(ns, cmd, argp); + nvme_put_ns_from_disk(head, srcu_idx); + } + + return ret; } long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, @@ -414,10 +418,23 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); void __user *argp = (void __user *)arg; + struct nvme_ns *ns; + int srcu_idx, ret; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (!ns) { + srcu_read_unlock(&head->srcu, srcu_idx); + return -EWOULDBLOCK; + } if (is_ctrl_ioctl(cmd)) - return nvme_ns_head_ctrl_ioctl(head, cmd, argp); - return nvme_ns_head_ns_ioctl(head, cmd, argp); + return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); + + ret = nvme_ns_ioctl(ns, cmd, argp); + nvme_put_ns_from_disk(head, srcu_idx); + + return ret; } #endif /* CONFIG_NVME_MULTIPATH */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 773dde5b231d..c1e086a0bc3f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -664,7 +664,6 @@ struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); bool nvme_tryget_ns_head(struct nvme_ns_head *head); void nvme_put_ns_head(struct nvme_ns_head *head); -struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys); int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, const struct file_operations *fops, struct module *owner); void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device); From 4c74d1f80381996027bacc4f6c554948ef9bf374 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 27 Apr 2021 12:17:46 +0530 Subject: [PATCH 0969/1379] nvme: add nvme_get_ns helper Add a helper to avoid opencoding ns->kref increment. Decrement is already done via nvme_put_ns helper. Signed-off-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c1c196459d79..f2d1f2d699b8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -576,6 +576,11 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } +static inline bool nvme_get_ns(struct nvme_ns *ns) +{ + return kref_get_unless_zero(&ns->kref); +} + void nvme_put_ns(struct nvme_ns *ns) { kref_put(&ns->kref, nvme_free_ns); @@ -1494,7 +1499,7 @@ static int nvme_ns_open(struct nvme_ns *ns) /* should never be called due to GENHD_FL_HIDDEN */ if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head))) goto fail; - if (!kref_get_unless_zero(&ns->kref)) + if (!nvme_get_ns(ns)) goto fail; if (!try_module_get(ns->ctrl->ops->module)) goto fail_put_ns; @@ -3582,7 +3587,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { if (ns->head->ns_id == nsid) { - if (!kref_get_unless_zero(&ns->kref)) + if (!nvme_get_ns(ns)) continue; ret = ns; break; From 51ad06cd698cb9ff280a769ed8d57210a1d2266d Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Tue, 27 Apr 2021 12:17:47 +0530 Subject: [PATCH 0970/1379] nvme: avoid memset for passthrough requests nvme_clear_nvme_request() clears the nvme_command, which is unncessary for passthrough requests as nvme_command is overwritten immediately. Move clearing part from this helper to the caller, so that double memset for passthrough requests is avoided. Signed-off-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f2d1f2d699b8..61e122cecc2a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -589,9 +589,6 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { - struct nvme_command *cmd = nvme_req(req)->cmd; - - memset(cmd, 0, sizeof(*cmd)); nvme_req(req)->retries = 0; nvme_req(req)->flags = 0; req->rq_flags |= RQF_DONTPREP; @@ -903,8 +900,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) struct nvme_command *cmd = nvme_req(req)->cmd; blk_status_t ret = BLK_STS_OK; - if (!(req->rq_flags & RQF_DONTPREP)) + if (!(req->rq_flags & RQF_DONTPREP)) { nvme_clear_nvme_request(req); + memset(cmd, 0, sizeof(*cmd)); + } switch (req_op(req)) { case REQ_OP_DRV_IN: From a97157440e1e69c35d7804d3b72da0c626ef28e6 Mon Sep 17 00:00:00 2001 From: Tao Chiu Date: Mon, 26 Apr 2021 10:53:10 +0800 Subject: [PATCH 0971/1379] nvme: move the fabrics queue ready check routines to core queue_rq() in pci only checks if the dispatched queue (nvmeq) is ready, e.g. not being suspended. Since nvme_alloc_admin_tags() in reset flow restarts the admin queue, users are able to submit admin commands to a controller before reset_work() completes. Commands submitted under this condition may interfere with commands that performs identify, IO queue setup in reset_work(), and may result in a hang described in the following patch. As seen in the fabrics, user commands are prevented from being executed under inproper controller states. We may reuse this logic to maintain a clear admin queue during reset_work(). Signed-off-by: Tao Chiu Signed-off-by: Cody Wong Reviewed-by: Leon Chien Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 60 +++++++++++++++++++++++++++++++++++++ drivers/nvme/host/fabrics.c | 57 ----------------------------------- drivers/nvme/host/fabrics.h | 13 -------- drivers/nvme/host/fc.c | 4 +-- drivers/nvme/host/nvme.h | 15 ++++++++++ drivers/nvme/host/rdma.c | 4 +-- drivers/nvme/host/tcp.c | 4 +-- drivers/nvme/target/loop.c | 4 +-- 8 files changed, 83 insertions(+), 78 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 61e122cecc2a..522c9b229f80 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -639,6 +639,66 @@ static struct request *nvme_alloc_request_qid(struct request_queue *q, return req; } +/* + * For something we're not in a state to send to the device the default action + * is to busy it and retry it after the controller state is recovered. However, + * if the controller is deleting or if anything is marked for failfast or + * nvme multipath it is immediately failed. + * + * Note: commands used to initialize the controller will be marked for failfast. + * Note: nvme cli/ioctl commands are marked for failfast. + */ +blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, + struct request *rq) +{ + if (ctrl->state != NVME_CTRL_DELETING_NOIO && + ctrl->state != NVME_CTRL_DEAD && + !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && + !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) + return BLK_STS_RESOURCE; + return nvme_host_path_error(rq); +} +EXPORT_SYMBOL_GPL(nvme_fail_nonready_command); + +bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + struct nvme_request *req = nvme_req(rq); + + /* + * currently we have a problem sending passthru commands + * on the admin_q if the controller is not LIVE because we can't + * make sure that they are going out after the admin connect, + * controller enable and/or other commands in the initialization + * sequence. until the controller will be LIVE, fail with + * BLK_STS_RESOURCE so that they will be rescheduled. + */ + if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD)) + return false; + + if (ctrl->ops->flags & NVME_F_FABRICS) { + /* + * Only allow commands on a live queue, except for the connect + * command, which is require to set the queue live in the + * appropinquate states. + */ + switch (ctrl->state) { + case NVME_CTRL_CONNECTING: + if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) && + req->cmd->fabrics.fctype == nvme_fabrics_type_connect) + return true; + break; + default: + break; + case NVME_CTRL_DEAD: + return false; + } + } + + return queue_live; +} +EXPORT_SYMBOL_GPL(__nvme_check_ready); + static int nvme_toggle_streams(struct nvme_ctrl *ctrl, bool enable) { struct nvme_command c; diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 13c2747e3d00..a2bb7fc63a73 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -533,63 +533,6 @@ static struct nvmf_transport_ops *nvmf_lookup_transport( return NULL; } -/* - * For something we're not in a state to send to the device the default action - * is to busy it and retry it after the controller state is recovered. However, - * if the controller is deleting or if anything is marked for failfast or - * nvme multipath it is immediately failed. - * - * Note: commands used to initialize the controller will be marked for failfast. - * Note: nvme cli/ioctl commands are marked for failfast. - */ -blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, - struct request *rq) -{ - if (ctrl->state != NVME_CTRL_DELETING_NOIO && - ctrl->state != NVME_CTRL_DEAD && - !test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) && - !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) - return BLK_STS_RESOURCE; - return nvme_host_path_error(rq); -} -EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); - -bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live) -{ - struct nvme_request *req = nvme_req(rq); - - /* - * currently we have a problem sending passthru commands - * on the admin_q if the controller is not LIVE because we can't - * make sure that they are going out after the admin connect, - * controller enable and/or other commands in the initialization - * sequence. until the controller will be LIVE, fail with - * BLK_STS_RESOURCE so that they will be rescheduled. - */ - if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD)) - return false; - - /* - * Only allow commands on a live queue, except for the connect command, - * which is require to set the queue live in the appropinquate states. - */ - switch (ctrl->state) { - case NVME_CTRL_CONNECTING: - if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) && - req->cmd->fabrics.fctype == nvme_fabrics_type_connect) - return true; - break; - default: - break; - case NVME_CTRL_DEAD: - return false; - } - - return queue_live; -} -EXPORT_SYMBOL_GPL(__nvmf_check_ready); - static const match_table_t opt_tokens = { { NVMF_OPT_TRANSPORT, "transport=%s" }, { NVMF_OPT_TRADDR, "traddr=%s" }, diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 888b108d87a4..d7f7974dc208 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -184,20 +184,7 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops); void nvmf_free_options(struct nvmf_ctrl_options *opts); int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); -blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl, - struct request *rq); -bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live); bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, struct nvmf_ctrl_options *opts); -static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, - bool queue_live) -{ - if (likely(ctrl->state == NVME_CTRL_LIVE || - ctrl->state == NVME_CTRL_DELETING)) - return true; - return __nvmf_check_ready(ctrl, rq, queue_live); -} - #endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 9b9b7be0f412..d9ab9e7871d0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2766,8 +2766,8 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, blk_status_t ret; if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || - !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) - return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); + !nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); ret = nvme_setup_cmd(ns, rq); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c1e086a0bc3f..05f31a2c64bb 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -638,6 +638,21 @@ struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags); void nvme_cleanup_cmd(struct request *req); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req); +blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl, + struct request *req); +bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live); + +static inline bool nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, + bool queue_live) +{ + if (likely(ctrl->state == NVME_CTRL_LIVE)) + return true; + if (ctrl->ops->flags & NVME_F_FABRICS && + ctrl->state == NVME_CTRL_DELETING) + return true; + return __nvme_check_ready(ctrl, rq, queue_live); +} int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 660c774fa9e1..37943dc4c2c1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2050,8 +2050,8 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, WARN_ON_ONCE(rq->tag < 0); - if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) - return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); + if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); dev = queue->device->dev; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 75435cdb156c..0222e23f5936 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2338,8 +2338,8 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags); blk_status_t ret; - if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) - return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); + if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); ret = nvme_tcp_setup_cmd_pdu(ns, rq); if (unlikely(ret)) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 6665da3b634f..74b3b150e1a5 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -138,8 +138,8 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, bool queue_ready = test_bit(NVME_LOOP_Q_LIVE, &queue->flags); blk_status_t ret; - if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready)) - return nvmf_fail_nonready_command(&queue->ctrl->ctrl, req); + if (!nvme_check_ready(&queue->ctrl->ctrl, req, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, req); ret = nvme_setup_cmd(ns, req); if (ret) From d4060d2be1132596154f31f4d57976bd103e969d Mon Sep 17 00:00:00 2001 From: Tao Chiu Date: Mon, 26 Apr 2021 10:53:55 +0800 Subject: [PATCH 0972/1379] nvme-pci: fix controller reset hang when racing with nvme_timeout reset_work() in nvme-pci may hang forever in the following scenario: 1) A reset caused by a command timeout occurs due to a controller being temporarily irresponsive. 2) nvme_reset_work() restarts admin queue at nvme_alloc_admin_tags(). At the same time, a user-submitted admin command is queued and waiting for completion. Then, reset_work() changes its state to CONNECTING, and submits an identify command. 3) However, the controller does still not respond to any command, causing a timeout being fired at the user-submitted command. Unfortunately, nvme_timeout() does not see the completion on cq, and any timeout that takes place under CONNECTING state causes a controller shutdown. 4) Normally, the identify command in reset_work() would be canceled with SC_HOST_ABORTED by nvme_dev_disable(), then reset_work can tear down the controller accordingly. But the controller happens to return online and respond the identify command before nvme_dev_disable() should have been reaped it off. 5) reset_work() continues to setup_io_queues() as it observes no error in init_identify(). However, the admin queue has already been quiesced in dev_disable(). Thus, any following commands would be blocked forever in blk_execute_rq(). This can be fixed by restricting usercmd commands when controller is not in a LIVE state in nvme_queue_rq(), as what has been done previously in fabrics. ``` nvme_reset_work(): | nvme_alloc_admin_tags() | | nvme_submit_user_cmd(): nvme_init_identify(): | ... __nvme_submit_sync_cmd(): | ... | ... ---------------------------------------> nvme_timeout(): (Controller starts reponding commands) | nvme_dev_disable(, true): nvme_setup_io_queues(): | __nvme_submit_sync_cmd(): | (hung in blk_execute_rq | since run_hw_queue sees | queue quiesced) | ``` Signed-off-by: Tao Chiu Signed-off-by: Cody Wong Reviewed-by: Leon Chien Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 09d4c5f99fc3..a29b170701fc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -933,6 +933,9 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; + if (!nvme_check_ready(&dev->ctrl, req, true)) + return nvme_fail_nonready_command(&dev->ctrl, req); + ret = nvme_setup_cmd(ns, req); if (ret) return ret; From ce86dad222e9074d3ec174ec81cb463a770331b5 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 3 May 2021 19:03:03 +0200 Subject: [PATCH 0973/1379] nvme-multipath: reset bdev to ns head when failover When a request finally completes in end_io() after it has failed over, the bdev pointer can be stale and thus the system can crash. Set the bdev back to ns head, so the request is map to an active path when resubmitted. Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/multipath.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 0d0de3433f37..0551796517e6 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -70,6 +70,7 @@ void nvme_failover_req(struct request *req) struct nvme_ns *ns = req->q->queuedata; u16 status = nvme_req(req)->status & 0x7ff; unsigned long flags; + struct bio *bio; nvme_mpath_clear_current_path(ns); @@ -84,6 +85,8 @@ void nvme_failover_req(struct request *req) } spin_lock_irqsave(&ns->head->requeue_lock, flags); + for (bio = req->bio; bio; bio = bio->bi_next) + bio_set_dev(bio, ns->head->disk->part0); blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); From 4a20342572f66c5b20a1ee680f5ac0a13703748f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 28 Apr 2021 21:25:58 -0700 Subject: [PATCH 0974/1379] nvmet: remove unsupported command noise Nothing can stop a host from submitting invalid commands. The target just needs to respond with an appropriate status, but that's not a target error. Demote invalid command messages to the debug level so these events don't spam the kernel logs. Reported-by: Yi Zhang Signed-off-by: Keith Busch Reviewed-by: Klaus Jensen Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index d2a26ff3f7b3..e7a367cf6d36 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -307,7 +307,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) case NVME_LOG_ANA: return nvmet_execute_get_log_page_ana(req); } - pr_err("unhandled lid %d on qid %d\n", + pr_debug("unhandled lid %d on qid %d\n", req->cmd->get_log_page.lid, req->sq->qid); req->error_loc = offsetof(struct nvme_get_log_page_command, lid); nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); @@ -659,7 +659,7 @@ static void nvmet_execute_identify(struct nvmet_req *req) return nvmet_execute_identify_desclist(req); } - pr_err("unhandled identify cns %d on qid %d\n", + pr_debug("unhandled identify cns %d on qid %d\n", req->cmd->identify.cns, req->sq->qid); req->error_loc = offsetof(struct nvme_identify, cns); nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR); @@ -977,7 +977,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) return 0; } - pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, + pr_debug("unhandled cmd %d on qid %d\n", cmd->common.opcode, req->sq->qid); req->error_loc = offsetof(struct nvme_common_command, opcode); return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; From 0f8a0b0b095fd9b301523c0f78686f5ac6fda564 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 4 May 2021 04:12:10 -0400 Subject: [PATCH 0975/1379] virtio_pci_modern: __force cast the notify mapping When switching virtio_pci_modern to use a helper for mappings we lost an __iomem tag. We should restore it. However, virtio_pci_modern is playing tricks by hiding an iomem pointer in a regular vq->priv pointer. Which is okay as long as it's all contained within a single file, but we need to __force cast the value otherwise we'll get sparse warnings. Reported-by: kernel test robot Fixes: 7dca6c0ea96b ("virtio-pci library: switch to use vp_modern_map_vq_notify()") Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 722ea44e7579..30654d3a0b41 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -224,7 +224,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, virtqueue_get_avail_addr(vq), virtqueue_get_used_addr(vq)); - vq->priv = vp_modern_map_vq_notify(mdev, index, NULL); + vq->priv = (void __force *)vp_modern_map_vq_notify(mdev, index, NULL); if (!vq->priv) { err = -ENOMEM; goto err_map_notify; From d7bce85aa7b92b5de8f69b3bcedfe51d7b1aabe1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 4 May 2021 04:17:20 -0400 Subject: [PATCH 0976/1379] virtio_pci_modern: correct sparse tags for notify When switching virtio_pci_modern to use a helper for mappings we lost an __iomem tag. Restore it. Reported-by: kernel test robot Fixes: 9e3bb9b79a71 ("virtio_pci_modern: introduce helper to map vq notify area") Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern_dev.c | 9 ++++----- include/linux/virtio_pci_modern.h | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern_dev.c b/drivers/virtio/virtio_pci_modern_dev.c index ae87b3fa8858..54f297028586 100644 --- a/drivers/virtio/virtio_pci_modern_dev.c +++ b/drivers/virtio/virtio_pci_modern_dev.c @@ -605,8 +605,8 @@ static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, * * Returns the address of the notification area */ -void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, - u16 index, resource_size_t *pa) +void __iomem *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, + u16 index, resource_size_t *pa) { u16 off = vp_modern_get_queue_notify_off(mdev, index); @@ -624,10 +624,9 @@ void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, if (pa) *pa = mdev->notify_pa + off * mdev->notify_offset_multiplier; - return (void __force *)mdev->notify_base + - off * mdev->notify_offset_multiplier; + return mdev->notify_base + off * mdev->notify_offset_multiplier; } else { - return (void __force *)vp_modern_map_capability(mdev, + return vp_modern_map_capability(mdev, mdev->notify_map_cap, 2, 2, off * mdev->notify_offset_multiplier, 2, NULL, pa); diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index cdfabbefacdf..6a95b58fd0f4 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -101,8 +101,8 @@ void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, u16 idx); u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev); -void *vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, - u16 index, resource_size_t *pa); +void __iomem * vp_modern_map_vq_notify(struct virtio_pci_modern_device *mdev, + u16 index, resource_size_t *pa); int vp_modern_probe(struct virtio_pci_modern_device *mdev); void vp_modern_remove(struct virtio_pci_modern_device *mdev); #endif From 8eedd3a70a70f51fa963f3ad7fa97afd0c75bd44 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 4 May 2021 10:20:57 +0200 Subject: [PATCH 0977/1379] ALSA: hda/realtek: Fix silent headphone output on ASUS UX430UA It was reported that the headphone output on ASUS UX430UA (SSID 1043:1740) with ALC295 codec is silent while the speaker works. After the investigation, it turned out that the DAC assignment has to be fixed on this machine; unlike others, it expects DAC 0x02 to be assigned to the speaker pin 0x07 while DAC 0x03 to headphone pin 0x21. This patch provides a fixup for the fixed DAC/pin mapping for this device. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=212933 Cc: Link: https://lore.kernel.org/r/20210504082057.6913-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a0fa99a31c23..bbbe37f3e724 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5695,6 +5695,18 @@ static void alc_fixup_tpt470_dacs(struct hda_codec *codec, spec->gen.preferred_dacs = preferred_pairs; } +static void alc295_fixup_asus_dacs(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + static const hda_nid_t preferred_pairs[] = { + 0x17, 0x02, 0x21, 0x03, 0 + }; + struct alc_spec *spec = codec->spec; + + if (action == HDA_FIXUP_ACT_PRE_PROBE) + spec->gen.preferred_dacs = preferred_pairs; +} + static void alc_shutup_dell_xps13(struct hda_codec *codec) { struct alc_spec *spec = codec->spec; @@ -6463,6 +6475,7 @@ enum { ALC256_FIXUP_ACER_HEADSET_MIC, ALC285_FIXUP_IDEAPAD_S740_COEF, ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST, + ALC295_FIXUP_ASUS_DACS, }; static const struct hda_fixup alc269_fixups[] = { @@ -7963,6 +7976,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC285_FIXUP_HP_MUTE_LED, }, + [ALC295_FIXUP_ASUS_DACS] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc295_fixup_asus_dacs, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -8161,6 +8178,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK), SND_PCI_QUIRK(0x1043, 0x1517, "Asus Zenbook UX31A", ALC269VB_FIXUP_ASUS_ZENBOOK_UX31A), SND_PCI_QUIRK(0x1043, 0x16e3, "ASUS UX50", ALC269_FIXUP_STEREO_DMIC), + SND_PCI_QUIRK(0x1043, 0x1740, "ASUS UX430UA", ALC295_FIXUP_ASUS_DACS), SND_PCI_QUIRK(0x1043, 0x17d1, "ASUS UX431FL", ALC294_FIXUP_ASUS_DUAL_SPK), SND_PCI_QUIRK(0x1043, 0x1881, "ASUS Zephyrus S/M", ALC294_FIXUP_ASUS_GX502_PINS), SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC), From 622464c893142f7beac89f5ba8c9773bca5e5004 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 4 May 2021 11:18:02 +0200 Subject: [PATCH 0978/1379] ALSA: hda/realtek: Fix speaker amp on HP Envy AiO 32 HP Envy AiO 32-a12xxx has an external amp that is controlled via GPIO bit 0x04. However, unlike other devices, this amp seems to shut down itself after the certain period, hence the OS needs to up/down the bit dynamically only during the actual playback. This patch adds the control of the GPIO bit via the existing pcm_hook mechanism. Ideally it should be triggered at the actual stream start, but we have only the state change at prepare/cleanup, so use those for switching the GPIO bit on/off. This should be good enough for the purpose, and was actually confirmed to work fine. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=212873 Cc: Link: https://lore.kernel.org/r/20210504091802.13200-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index bbbe37f3e724..ee2841dd0c6d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4338,6 +4338,35 @@ static void alc245_fixup_hp_x360_amp(struct hda_codec *codec, } } +/* toggle GPIO2 at each time stream is started; we use PREPARE state instead */ +static void alc274_hp_envy_pcm_hook(struct hda_pcm_stream *hinfo, + struct hda_codec *codec, + struct snd_pcm_substream *substream, + int action) +{ + switch (action) { + case HDA_GEN_PCM_ACT_PREPARE: + alc_update_gpio_data(codec, 0x04, true); + break; + case HDA_GEN_PCM_ACT_CLEANUP: + alc_update_gpio_data(codec, 0x04, false); + break; + } +} + +static void alc274_fixup_hp_envy_gpio(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + struct alc_spec *spec = codec->spec; + + if (action == HDA_FIXUP_ACT_PROBE) { + spec->gpio_mask |= 0x04; + spec->gpio_dir |= 0x04; + spec->gen.pcm_playback_hook = alc274_hp_envy_pcm_hook; + } +} + static void alc_update_coef_led(struct hda_codec *codec, struct alc_coef_led *led, bool polarity, bool on) @@ -6465,6 +6494,7 @@ enum { ALC255_FIXUP_XIAOMI_HEADSET_MIC, ALC274_FIXUP_HP_MIC, ALC274_FIXUP_HP_HEADSET_MIC, + ALC274_FIXUP_HP_ENVY_GPIO, ALC256_FIXUP_ASUS_HPE, ALC285_FIXUP_THINKPAD_NO_BASS_SPK_HEADSET_JACK, ALC287_FIXUP_HP_GPIO_LED, @@ -7907,6 +7937,10 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC274_FIXUP_HP_MIC }, + [ALC274_FIXUP_HP_ENVY_GPIO] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc274_fixup_hp_envy_gpio, + }, [ALC256_FIXUP_ASUS_HPE] = { .type = HDA_FIXUP_VERBS, .v.verbs = (const struct hda_verb[]) { @@ -8140,6 +8174,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8497, "HP Envy x360", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x103c, 0x84e7, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x103c, 0x869d, "HP", ALC236_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x86c7, "HP Envy AiO 32", ALC274_FIXUP_HP_ENVY_GPIO), SND_PCI_QUIRK(0x103c, 0x8724, "HP EliteBook 850 G7", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8729, "HP", ALC285_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8730, "HP ProBook 445 G7", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), From c61287bf17836b67e0b649343778bb4a659bd70d Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Tue, 4 May 2021 18:59:35 +0800 Subject: [PATCH 0979/1379] clk: sifive: Add pcie_aux clock in prci driver for PCIe driver We add pcie_aux clock in this patch so that pcie driver can use clk_prepare_enable() and clk_disable_unprepare() to enable and disable pcie_aux clock. Link: https://lore.kernel.org/r/20210504105940.100004-2-greentime.hu@sifive.com Signed-off-by: Greentime Hu Signed-off-by: Lorenzo Pieralisi Acked-by: Stephen Boyd --- drivers/clk/sifive/fu740-prci.c | 11 +++++ drivers/clk/sifive/fu740-prci.h | 2 +- drivers/clk/sifive/sifive-prci.c | 41 +++++++++++++++++++ drivers/clk/sifive/sifive-prci.h | 9 ++++ include/dt-bindings/clock/sifive-fu740-prci.h | 1 + 5 files changed, 63 insertions(+), 1 deletion(-) diff --git a/drivers/clk/sifive/fu740-prci.c b/drivers/clk/sifive/fu740-prci.c index 764d1097aa51..53f6e00a03b9 100644 --- a/drivers/clk/sifive/fu740-prci.c +++ b/drivers/clk/sifive/fu740-prci.c @@ -72,6 +72,12 @@ static const struct clk_ops sifive_fu740_prci_hfpclkplldiv_clk_ops = { .recalc_rate = sifive_prci_hfpclkplldiv_recalc_rate, }; +static const struct clk_ops sifive_fu740_prci_pcie_aux_clk_ops = { + .enable = sifive_prci_pcie_aux_clock_enable, + .disable = sifive_prci_pcie_aux_clock_disable, + .is_enabled = sifive_prci_pcie_aux_clock_is_enabled, +}; + /* List of clock controls provided by the PRCI */ struct __prci_clock __prci_init_clocks_fu740[] = { [PRCI_CLK_COREPLL] = { @@ -120,4 +126,9 @@ struct __prci_clock __prci_init_clocks_fu740[] = { .parent_name = "hfpclkpll", .ops = &sifive_fu740_prci_hfpclkplldiv_clk_ops, }, + [PRCI_CLK_PCIE_AUX] = { + .name = "pcie_aux", + .parent_name = "hfclk", + .ops = &sifive_fu740_prci_pcie_aux_clk_ops, + }, }; diff --git a/drivers/clk/sifive/fu740-prci.h b/drivers/clk/sifive/fu740-prci.h index 13ef971f7764..511a0bf7ba2b 100644 --- a/drivers/clk/sifive/fu740-prci.h +++ b/drivers/clk/sifive/fu740-prci.h @@ -9,7 +9,7 @@ #include "sifive-prci.h" -#define NUM_CLOCK_FU740 8 +#define NUM_CLOCK_FU740 9 extern struct __prci_clock __prci_init_clocks_fu740[NUM_CLOCK_FU740]; diff --git a/drivers/clk/sifive/sifive-prci.c b/drivers/clk/sifive/sifive-prci.c index 1490b01ce629..9997a3fa4a38 100644 --- a/drivers/clk/sifive/sifive-prci.c +++ b/drivers/clk/sifive/sifive-prci.c @@ -453,6 +453,47 @@ void sifive_prci_hfpclkpllsel_use_hfpclkpll(struct __prci_data *pd) r = __prci_readl(pd, PRCI_HFPCLKPLLSEL_OFFSET); /* barrier */ } +/* PCIE AUX clock APIs for enable, disable. */ +int sifive_prci_pcie_aux_clock_is_enabled(struct clk_hw *hw) +{ + struct __prci_clock *pc = clk_hw_to_prci_clock(hw); + struct __prci_data *pd = pc->pd; + u32 r; + + r = __prci_readl(pd, PRCI_PCIE_AUX_OFFSET); + + if (r & PRCI_PCIE_AUX_EN_MASK) + return 1; + else + return 0; +} + +int sifive_prci_pcie_aux_clock_enable(struct clk_hw *hw) +{ + struct __prci_clock *pc = clk_hw_to_prci_clock(hw); + struct __prci_data *pd = pc->pd; + u32 r __maybe_unused; + + if (sifive_prci_pcie_aux_clock_is_enabled(hw)) + return 0; + + __prci_writel(1, PRCI_PCIE_AUX_OFFSET, pd); + r = __prci_readl(pd, PRCI_PCIE_AUX_OFFSET); /* barrier */ + + return 0; +} + +void sifive_prci_pcie_aux_clock_disable(struct clk_hw *hw) +{ + struct __prci_clock *pc = clk_hw_to_prci_clock(hw); + struct __prci_data *pd = pc->pd; + u32 r __maybe_unused; + + __prci_writel(0, PRCI_PCIE_AUX_OFFSET, pd); + r = __prci_readl(pd, PRCI_PCIE_AUX_OFFSET); /* barrier */ + +} + /** * __prci_register_clocks() - register clock controls in the PRCI * @dev: Linux struct device diff --git a/drivers/clk/sifive/sifive-prci.h b/drivers/clk/sifive/sifive-prci.h index dbdbd1722688..022c67cf053c 100644 --- a/drivers/clk/sifive/sifive-prci.h +++ b/drivers/clk/sifive/sifive-prci.h @@ -67,6 +67,11 @@ #define PRCI_DDRPLLCFG1_CKE_SHIFT 31 #define PRCI_DDRPLLCFG1_CKE_MASK (0x1 << PRCI_DDRPLLCFG1_CKE_SHIFT) +/* PCIEAUX */ +#define PRCI_PCIE_AUX_OFFSET 0x14 +#define PRCI_PCIE_AUX_EN_SHIFT 0 +#define PRCI_PCIE_AUX_EN_MASK (0x1 << PRCI_PCIE_AUX_EN_SHIFT) + /* GEMGXLPLLCFG0 */ #define PRCI_GEMGXLPLLCFG0_OFFSET 0x1c #define PRCI_GEMGXLPLLCFG0_DIVR_SHIFT 0 @@ -296,4 +301,8 @@ unsigned long sifive_prci_tlclksel_recalc_rate(struct clk_hw *hw, unsigned long sifive_prci_hfpclkplldiv_recalc_rate(struct clk_hw *hw, unsigned long parent_rate); +int sifive_prci_pcie_aux_clock_is_enabled(struct clk_hw *hw); +int sifive_prci_pcie_aux_clock_enable(struct clk_hw *hw); +void sifive_prci_pcie_aux_clock_disable(struct clk_hw *hw); + #endif /* __SIFIVE_CLK_SIFIVE_PRCI_H */ diff --git a/include/dt-bindings/clock/sifive-fu740-prci.h b/include/dt-bindings/clock/sifive-fu740-prci.h index cd7706ea5677..7899b7fee7db 100644 --- a/include/dt-bindings/clock/sifive-fu740-prci.h +++ b/include/dt-bindings/clock/sifive-fu740-prci.h @@ -19,5 +19,6 @@ #define PRCI_CLK_CLTXPLL 5 #define PRCI_CLK_TLCLK 6 #define PRCI_CLK_PCLK 7 +#define PRCI_CLK_PCIE_AUX 8 #endif /* __DT_BINDINGS_CLOCK_SIFIVE_FU740_PRCI_H */ From e4d368e0b632717e57d064ade6afdcf535e58068 Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Tue, 4 May 2021 18:59:36 +0800 Subject: [PATCH 0980/1379] clk: sifive: Use reset-simple in prci driver for PCIe driver We use reset-simple in this patch so that pcie driver can use devm_reset_control_get() to get this reset data structure and use reset_control_deassert() to deassert pcie_power_up_rst_n. Link: https://lore.kernel.org/r/20210504105940.100004-3-greentime.hu@sifive.com Signed-off-by: Greentime Hu Signed-off-by: Lorenzo Pieralisi Reviewed-by: Philipp Zabel Acked-by: Stephen Boyd --- drivers/clk/sifive/Kconfig | 2 ++ drivers/clk/sifive/sifive-prci.c | 13 +++++++++++++ drivers/clk/sifive/sifive-prci.h | 4 ++++ drivers/reset/Kconfig | 1 + 4 files changed, 20 insertions(+) diff --git a/drivers/clk/sifive/Kconfig b/drivers/clk/sifive/Kconfig index 1c14eb20c066..9132c3c4aa86 100644 --- a/drivers/clk/sifive/Kconfig +++ b/drivers/clk/sifive/Kconfig @@ -10,6 +10,8 @@ if CLK_SIFIVE config CLK_SIFIVE_PRCI bool "PRCI driver for SiFive SoCs" + select RESET_CONTROLLER + select RESET_SIMPLE select CLK_ANALOGBITS_WRPLL_CLN28HPC help Supports the Power Reset Clock interface (PRCI) IP block found in diff --git a/drivers/clk/sifive/sifive-prci.c b/drivers/clk/sifive/sifive-prci.c index 9997a3fa4a38..0d79ba31a793 100644 --- a/drivers/clk/sifive/sifive-prci.c +++ b/drivers/clk/sifive/sifive-prci.c @@ -588,6 +588,19 @@ static int sifive_prci_probe(struct platform_device *pdev) if (IS_ERR(pd->va)) return PTR_ERR(pd->va); + pd->reset.rcdev.owner = THIS_MODULE; + pd->reset.rcdev.nr_resets = PRCI_RST_NR; + pd->reset.rcdev.ops = &reset_simple_ops; + pd->reset.rcdev.of_node = pdev->dev.of_node; + pd->reset.active_low = true; + pd->reset.membase = pd->va + PRCI_DEVICESRESETREG_OFFSET; + spin_lock_init(&pd->reset.lock); + + r = devm_reset_controller_register(&pdev->dev, &pd->reset.rcdev); + if (r) { + dev_err(dev, "could not register reset controller: %d\n", r); + return r; + } r = __prci_register_clocks(dev, pd, desc); if (r) { dev_err(dev, "could not register clocks: %d\n", r); diff --git a/drivers/clk/sifive/sifive-prci.h b/drivers/clk/sifive/sifive-prci.h index 022c67cf053c..91658a88af4e 100644 --- a/drivers/clk/sifive/sifive-prci.h +++ b/drivers/clk/sifive/sifive-prci.h @@ -11,6 +11,7 @@ #include #include +#include #include /* @@ -121,6 +122,8 @@ #define PRCI_DEVICESRESETREG_CHIPLINK_RST_N_MASK \ (0x1 << PRCI_DEVICESRESETREG_CHIPLINK_RST_N_SHIFT) +#define PRCI_RST_NR 7 + /* CLKMUXSTATUSREG */ #define PRCI_CLKMUXSTATUSREG_OFFSET 0x2c #define PRCI_CLKMUXSTATUSREG_TLCLKSEL_STATUS_SHIFT 1 @@ -221,6 +224,7 @@ */ struct __prci_data { void __iomem *va; + struct reset_simple_data reset; struct clk_hw_onecell_data hw_clks; }; diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index 4171c6f76385..0f40dadf5705 100644 --- a/drivers/reset/Kconfig +++ b/drivers/reset/Kconfig @@ -197,6 +197,7 @@ config RESET_SIMPLE - RCC reset controller in STM32 MCUs - Allwinner SoCs - ZTE's zx2967 family + - SiFive FU740 SoCs config RESET_STM32MP157 bool "STM32MP157 Reset Driver" if COMPILE_TEST From 2da0dd5e30af22a125c38137ee980c5bce3da391 Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Tue, 4 May 2021 18:59:37 +0800 Subject: [PATCH 0981/1379] MAINTAINERS: Add maintainers for SiFive FU740 PCIe driver Here add maintainer information for SiFive FU740 PCIe driver. Link: https://lore.kernel.org/r/20210504105940.100004-4-greentime.hu@sifive.com Signed-off-by: Greentime Hu Signed-off-by: Lorenzo Pieralisi --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index d92f85ca831d..888171ead601 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13692,6 +13692,14 @@ S: Maintained F: Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt F: drivers/pci/controller/dwc/*imx6* +PCI DRIVER FOR FU740 +M: Paul Walmsley +M: Greentime Hu +L: linux-pci@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml +F: drivers/pci/controller/dwc/pcie-fu740.c + PCI DRIVER FOR INTEL VOLUME MANAGEMENT DEVICE (VMD) M: Jonathan Derrick L: linux-pci@vger.kernel.org From 43cea116be0b2e9636ce72bc8269b99344374a81 Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Tue, 4 May 2021 18:59:38 +0800 Subject: [PATCH 0982/1379] dt-bindings: PCI: Add SiFive FU740 PCIe host controller Add PCIe host controller DT bindings of SiFive FU740. Link: https://lore.kernel.org/r/20210504105940.100004-5-greentime.hu@sifive.com Signed-off-by: Greentime Hu Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring --- .../bindings/pci/sifive,fu740-pcie.yaml | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml diff --git a/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml b/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml new file mode 100644 index 000000000000..b03cbb9b6602 --- /dev/null +++ b/Documentation/devicetree/bindings/pci/sifive,fu740-pcie.yaml @@ -0,0 +1,113 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/pci/sifive,fu740-pcie.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: SiFive FU740 PCIe host controller + +description: |+ + SiFive FU740 PCIe host controller is based on the Synopsys DesignWare + PCI core. It shares common features with the PCIe DesignWare core and + inherits common properties defined in + Documentation/devicetree/bindings/pci/designware-pcie.txt. + +maintainers: + - Paul Walmsley + - Greentime Hu + +allOf: + - $ref: /schemas/pci/pci-bus.yaml# + +properties: + compatible: + const: sifive,fu740-pcie + + reg: + maxItems: 3 + + reg-names: + items: + - const: dbi + - const: config + - const: mgmt + + num-lanes: + const: 8 + + msi-parent: true + + interrupt-names: + items: + - const: msi + - const: inta + - const: intb + - const: intc + - const: intd + + resets: + description: A phandle to the PCIe power up reset line. + maxItems: 1 + + pwren-gpios: + description: Should specify the GPIO for controlling the PCI bus device power on. + maxItems: 1 + + reset-gpios: + maxItems: 1 + +required: + - dma-coherent + - num-lanes + - interrupts + - interrupt-names + - interrupt-parent + - interrupt-map-mask + - interrupt-map + - clock-names + - clocks + - resets + - pwren-gpios + - reset-gpios + +unevaluatedProperties: false + +examples: + - | + bus { + #address-cells = <2>; + #size-cells = <2>; + #include + + pcie@e00000000 { + compatible = "sifive,fu740-pcie"; + #address-cells = <3>; + #size-cells = <2>; + #interrupt-cells = <1>; + reg = <0xe 0x00000000 0x0 0x80000000>, + <0xd 0xf0000000 0x0 0x10000000>, + <0x0 0x100d0000 0x0 0x1000>; + reg-names = "dbi", "config", "mgmt"; + device_type = "pci"; + dma-coherent; + bus-range = <0x0 0xff>; + ranges = <0x81000000 0x0 0x60080000 0x0 0x60080000 0x0 0x10000>, /* I/O */ + <0x82000000 0x0 0x60090000 0x0 0x60090000 0x0 0xff70000>, /* mem */ + <0x82000000 0x0 0x70000000 0x0 0x70000000 0x0 0x1000000>, /* mem */ + <0xc3000000 0x20 0x00000000 0x20 0x00000000 0x20 0x00000000>; /* mem prefetchable */ + num-lanes = <0x8>; + interrupts = <56>, <57>, <58>, <59>, <60>, <61>, <62>, <63>, <64>; + interrupt-names = "msi", "inta", "intb", "intc", "intd"; + interrupt-parent = <&plic0>; + interrupt-map-mask = <0x0 0x0 0x0 0x7>; + interrupt-map = <0x0 0x0 0x0 0x1 &plic0 57>, + <0x0 0x0 0x0 0x2 &plic0 58>, + <0x0 0x0 0x0 0x3 &plic0 59>, + <0x0 0x0 0x0 0x4 &plic0 60>; + clock-names = "pcie_aux"; + clocks = <&prci PRCI_CLK_PCIE_AUX>; + resets = <&prci 4>; + pwren-gpios = <&gpio 5 0>; + reset-gpios = <&gpio 8 0>; + }; + }; From 5d84b5318d860c9d80ca5dfae0e971ede53b4921 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 4 May 2021 14:18:32 +0200 Subject: [PATCH 0983/1379] ALSA: hda/realtek: Add fixup for HP OMEN laptop HP OMEN dc0019-ur with codec SSID 103c:84da requires the pin config overrides and the existing mic/mute LED setup. This patch implements those in the fixup table. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=212733 Cc: Link: https://lore.kernel.org/r/20210504121832.4558-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index ee2841dd0c6d..6d58f24c9702 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6506,6 +6506,7 @@ enum { ALC285_FIXUP_IDEAPAD_S740_COEF, ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST, ALC295_FIXUP_ASUS_DACS, + ALC295_FIXUP_HP_OMEN, }; static const struct hda_fixup alc269_fixups[] = { @@ -8014,6 +8015,26 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc295_fixup_asus_dacs, }, + [ALC295_FIXUP_HP_OMEN] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x12, 0xb7a60130 }, + { 0x13, 0x40000000 }, + { 0x14, 0x411111f0 }, + { 0x16, 0x411111f0 }, + { 0x17, 0x90170110 }, + { 0x18, 0x411111f0 }, + { 0x19, 0x02a11030 }, + { 0x1a, 0x411111f0 }, + { 0x1b, 0x04a19030 }, + { 0x1d, 0x40600001 }, + { 0x1e, 0x411111f0 }, + { 0x21, 0x03211020 }, + {} + }, + .chained = true, + .chain_id = ALC269_FIXUP_HP_LINE1_MIC1_LED, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -8172,6 +8193,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x82c0, "HP G3 mini premium", ALC221_FIXUP_HP_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x83b9, "HP Spectre x360", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x103c, 0x8497, "HP Envy x360", ALC269_FIXUP_HP_MUTE_LED_MIC3), + SND_PCI_QUIRK(0x103c, 0x84da, "HP OMEN dc0019-ur", ALC295_FIXUP_HP_OMEN), SND_PCI_QUIRK(0x103c, 0x84e7, "HP Pavilion 15", ALC269_FIXUP_HP_MUTE_LED_MIC3), SND_PCI_QUIRK(0x103c, 0x869d, "HP", ALC236_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x86c7, "HP Envy AiO 32", ALC274_FIXUP_HP_ENVY_GPIO), @@ -8577,6 +8599,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC255_FIXUP_XIAOMI_HEADSET_MIC, .name = "alc255-xiaomi-headset"}, {.id = ALC274_FIXUP_HP_MIC, .name = "alc274-hp-mic-detect"}, {.id = ALC245_FIXUP_HP_X360_AMP, .name = "alc245-hp-x360-amp"}, + {.id = ALC295_FIXUP_HP_OMEN, .name = "alc295-hp-omen"}, {} }; #define ALC225_STANDARD_PINS \ From 40c753993e3aad51a12c21233486e2037417a4d6 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Thu, 29 Apr 2021 11:32:56 +0530 Subject: [PATCH 0984/1379] powerpc/kexec_file: Use current CPU info while setting up FDT kexec_file_load() uses initial_boot_params in setting up the device tree for the kernel to be loaded. Though initial_boot_params holds info about CPUs at the time of boot, it doesn't account for hot added CPUs. So, kexec'ing with kexec_file_load() syscall leaves the kexec'ed kernel with inaccurate CPU info. If kdump kernel is loaded with kexec_file_load() syscall and the system crashes on a hot added CPU, the capture kernel hangs failing to identify the boot CPU, with no output. To avoid this from happening, extract current CPU info from of_root device node and use it for setting up the fdt in kexec_file_load case. Fixes: 6ecd0163d360 ("powerpc/kexec_file: Add appropriate regions for memory reserve map") Cc: stable@vger.kernel.org # v5.9+ Signed-off-by: Sourabh Jain Reviewed-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210429060256.199714-1-sourabhjain@linux.ibm.com --- arch/powerpc/kexec/file_load_64.c | 92 +++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index f9eb49e23ec8..5056e175ca2c 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -950,6 +950,93 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) return (unsigned int)(usm_entries * sizeof(u64)); } +/** + * add_node_props - Reads node properties from device node structure and add + * them to fdt. + * @fdt: Flattened device tree of the kernel + * @node_offset: offset of the node to add a property at + * @dn: device node pointer + * + * Returns 0 on success, negative errno on error. + */ +static int add_node_props(void *fdt, int node_offset, const struct device_node *dn) +{ + int ret = 0; + struct property *pp; + + if (!dn) + return -EINVAL; + + for_each_property_of_node(dn, pp) { + ret = fdt_setprop(fdt, node_offset, pp->name, pp->value, pp->length); + if (ret < 0) { + pr_err("Unable to add %s property: %s\n", pp->name, fdt_strerror(ret)); + return ret; + } + } + return ret; +} + +/** + * update_cpus_node - Update cpus node of flattened device tree using of_root + * device node. + * @fdt: Flattened device tree of the kernel. + * + * Returns 0 on success, negative errno on error. + */ +static int update_cpus_node(void *fdt) +{ + struct device_node *cpus_node, *dn; + int cpus_offset, cpus_subnode_offset, ret = 0; + + cpus_offset = fdt_path_offset(fdt, "/cpus"); + if (cpus_offset < 0 && cpus_offset != -FDT_ERR_NOTFOUND) { + pr_err("Malformed device tree: error reading /cpus node: %s\n", + fdt_strerror(cpus_offset)); + return cpus_offset; + } + + if (cpus_offset > 0) { + ret = fdt_del_node(fdt, cpus_offset); + if (ret < 0) { + pr_err("Error deleting /cpus node: %s\n", fdt_strerror(ret)); + return -EINVAL; + } + } + + /* Add cpus node to fdt */ + cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"), "cpus"); + if (cpus_offset < 0) { + pr_err("Error creating /cpus node: %s\n", fdt_strerror(cpus_offset)); + return -EINVAL; + } + + /* Add cpus node properties */ + cpus_node = of_find_node_by_path("/cpus"); + ret = add_node_props(fdt, cpus_offset, cpus_node); + of_node_put(cpus_node); + if (ret < 0) + return ret; + + /* Loop through all subnodes of cpus and add them to fdt */ + for_each_node_by_type(dn, "cpu") { + cpus_subnode_offset = fdt_add_subnode(fdt, cpus_offset, dn->full_name); + if (cpus_subnode_offset < 0) { + pr_err("Unable to add %s subnode: %s\n", dn->full_name, + fdt_strerror(cpus_subnode_offset)); + ret = cpus_subnode_offset; + goto out; + } + + ret = add_node_props(fdt, cpus_subnode_offset, dn); + if (ret < 0) + goto out; + } +out: + of_node_put(dn); + return ret; +} + /** * setup_new_fdt_ppc64 - Update the flattend device-tree of the kernel * being loaded. @@ -1006,6 +1093,11 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, } } + /* Update cpus nodes information to account hotplug CPUs. */ + ret = update_cpus_node(fdt); + if (ret < 0) + goto out; + /* Update memory reserve map */ ret = get_reserved_memory_ranges(&rmem); if (ret) From b910fcbada9721c21f1d59ab59e07e8e354c23cc Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Sat, 1 May 2021 21:32:54 +0530 Subject: [PATCH 0985/1379] powerpc/powernv/memtrace: Fix dcache flushing Trace memory is cleared and the corresponding dcache lines are flushed after allocation. However, this should not be done using the PFN. This adds the missing conversion to virtual address. Fixes: 2ac02e5ecec0 ("powerpc/mm: Remove dcache flush from memory remove.") Signed-off-by: Sandipan Das Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210501160254.1179831-1-sandipan@linux.ibm.com --- arch/powerpc/platforms/powernv/memtrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 71c1262589fe..537a4daed614 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -104,8 +104,8 @@ static void memtrace_clear_range(unsigned long start_pfn, * Before we go ahead and use this range as cache inhibited range * flush the cache. */ - flush_dcache_range_chunked(PFN_PHYS(start_pfn), - PFN_PHYS(start_pfn + nr_pages), + flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn), + (unsigned long)pfn_to_kaddr(start_pfn + nr_pages), FLUSH_CHUNK_SIZE); } From f5668260b872e89b8d3942a8b7d4278aa9c2c981 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 29 Apr 2021 16:52:09 +0000 Subject: [PATCH 0986/1379] powerpc/32: Fix boot failure with CONFIG_STACKPROTECTOR Commit 7c95d8893fb5 ("powerpc: Change calling convention for create_branch() et. al.") complexified the frame of function do_feature_fixups(), leading to GCC setting up a stack guard when CONFIG_STACKPROTECTOR is selected. The problem is that do_feature_fixups() is called very early while 'current' in r2 is not set up yet and the code is still not at the final address used at link time. So, like other instrumentation, stack protection needs to be deactivated for feature-fixups.c and code-patching.c Fixes: 7c95d8893fb5 ("powerpc: Change calling convention for create_branch() et. al.") Cc: stable@vger.kernel.org # v5.8+ Reported-by: Jonathan Neuschaefer Signed-off-by: Christophe Leroy Tested-by: Jonathan Neuschaefer Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b688fe82927b330349d9e44553363fa451ea4d95.1619715114.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index f2c690ee75d1..cc1a8a0f311e 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -5,6 +5,9 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) +CFLAGS_code-patching.o += -fno-stack-protector +CFLAGS_feature-fixups.o += -fno-stack-protector + CFLAGS_REMOVE_code-patching.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) From c6b05f4e233cc666f003e9fe68b2f765952875a9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 21 Apr 2021 17:06:42 +0000 Subject: [PATCH 0987/1379] powerpc/kconfig: Restore alphabetic order of the selects under CONFIG_PPC Commit a7d2475af7ae ("powerpc: Sort the selects under CONFIG_PPC") sorted all selects under CONFIG_PPC. 4 years later, several items have been introduced at wrong place, a few other have been renamed without moving them to their correct place. Reorder them now. While we are at it, simplify the test for a couple of them: - PPC_64 && PPC_PSERIES is simplified in PPC_PSERIES - PPC_64 && PPC_BOOK3S is simplified in PPC_BOOK3S_64 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/361ee3fc5009c709ae0ca592249bb0702c6ef073.1619024780.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c547a9d6a2dd..ab17a56c3d10 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -118,28 +118,29 @@ config PPC # Please keep this list sorted alphabetically. # select ARCH_32BIT_OFF_T if PPC32 + select ARCH_HAS_COPY_MC if PPC64 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_DMA_MAP_DIRECT if PPC_PSERIES select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_KCOV select ARCH_HAS_HUGEPD if HUGETLB_PAGE - select ARCH_HAS_MEMREMAP_COMPAT_ALIGN - select ARCH_HAS_MMIOWB if PPC64 - select ARCH_HAS_PHYS_TO_DMA - select ARCH_HAS_PMEM_API - select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64 - select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_MEMREMAP_COMPAT_ALIGN + select ARCH_HAS_MMIOWB if PPC64 + select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + select ARCH_HAS_PHYS_TO_DMA + select ARCH_HAS_PMEM_API + select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64 select ARCH_HAS_STRICT_KERNEL_RWX if ((PPC_BOOK3S_64 || PPC32) && !HIBERNATION) select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE - select ARCH_HAS_COPY_MC if PPC64 select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK @@ -162,9 +163,8 @@ config PPC select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN - select DMA_OPS if PPC64 select DMA_OPS_BYPASS if PPC64 - select ARCH_HAS_DMA_MAP_DIRECT if PPC64 && PPC_PSERIES + select DMA_OPS if PPC64 select DYNAMIC_FTRACE if FUNCTION_TRACER select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT @@ -190,18 +190,16 @@ config PPC select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if PPC32 && PPC_PAGE_SHIFT <= 14 select HAVE_ARCH_KASAN_VMALLOC if PPC32 && PPC_PAGE_SHIFT <= 14 - select HAVE_ARCH_KGDB select HAVE_ARCH_KFENCE if PPC32 + select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_NVRAM_OPS select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ASM_MODVERSIONS - select HAVE_C_RECORDMCOUNT - select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13) - select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) select HAVE_CONTEXT_TRACKING if PPC64 + select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW select HAVE_DYNAMIC_FTRACE @@ -215,10 +213,13 @@ config PPC select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS if GCC_VERSION >= 50200 # plugin support on gcc <= 5.1 is buggy on PPC select HAVE_GENERIC_VDSO + select HAVE_HARDLOCKUP_DETECTOR_ARCH if PPC_BOOK3S_64 && SMP + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IDE select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK + select HAVE_IRQ_TIME_ACCOUNTING select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE select HAVE_KERNEL_LZO if DEFAULT_UIMAGE @@ -230,25 +231,24 @@ config PPC select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI if PERF_EVENTS || (PPC64 && PPC_BOOK3S) - select HAVE_HARDLOCKUP_DETECTOR_ARCH if PPC64 && PPC_BOOK3S && SMP select HAVE_OPTPROBES select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select MMU_GATHER_RCU_TABLE_FREE - select MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE + select HAVE_RSEQ select HAVE_SOFTIRQ_ON_OWN_STACK + select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) + select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13) select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING - select HAVE_IRQ_TIME_ACCOUNTING - select HAVE_RSEQ select IOMMU_HELPER if PPC64 select IRQ_DOMAIN select IRQ_FORCED_THREADING + select MMU_GATHER_PAGE_SIZE + select MMU_GATHER_RCU_TABLE_FREE select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE select NEED_SG_DMA_LENGTH From e7e21b3a339bd1b3c1d951b37be5e322c5c0dbf2 Mon Sep 17 00:00:00 2001 From: Paul Walmsley Date: Tue, 4 May 2021 18:59:39 +0800 Subject: [PATCH 0988/1379] PCI: fu740: Add SiFive FU740 PCIe host controller driver Add driver for the SiFive FU740 PCIe host controller. This controller is based on the DesignWare PCIe core. Co-developed-by: Henry Styles Co-developed-by: Erik Danie Co-developed-by: Greentime Hu Link: https://lore.kernel.org/r/20210504105940.100004-6-greentime.hu@sifive.com Signed-off-by: Paul Walmsley Signed-off-by: Henry Styles Signed-off-by: Erik Danie Signed-off-by: Greentime Hu Signed-off-by: Lorenzo Pieralisi --- drivers/pci/controller/dwc/Kconfig | 9 + drivers/pci/controller/dwc/Makefile | 1 + drivers/pci/controller/dwc/pcie-fu740.c | 309 ++++++++++++++++++++++++ 3 files changed, 319 insertions(+) create mode 100644 drivers/pci/controller/dwc/pcie-fu740.c diff --git a/drivers/pci/controller/dwc/Kconfig b/drivers/pci/controller/dwc/Kconfig index 22c5529e9a65..0a37d21ed64e 100644 --- a/drivers/pci/controller/dwc/Kconfig +++ b/drivers/pci/controller/dwc/Kconfig @@ -318,4 +318,13 @@ config PCIE_AL required only for DT-based platforms. ACPI platforms with the Annapurna Labs PCIe controller don't need to enable this. +config PCIE_FU740 + bool "SiFive FU740 PCIe host controller" + depends on PCI_MSI_IRQ_DOMAIN + depends on SOC_SIFIVE || COMPILE_TEST + select PCIE_DW_HOST + help + Say Y here if you want PCIe controller support for the SiFive + FU740. + endmenu diff --git a/drivers/pci/controller/dwc/Makefile b/drivers/pci/controller/dwc/Makefile index a751553fa0db..625f6aaeb5b8 100644 --- a/drivers/pci/controller/dwc/Makefile +++ b/drivers/pci/controller/dwc/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_PCIE_DW_EP) += pcie-designware-ep.o obj-$(CONFIG_PCIE_DW_PLAT) += pcie-designware-plat.o obj-$(CONFIG_PCI_DRA7XX) += pci-dra7xx.o obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o +obj-$(CONFIG_PCIE_FU740) += pcie-fu740.o obj-$(CONFIG_PCI_IMX6) += pci-imx6.o obj-$(CONFIG_PCIE_SPEAR13XX) += pcie-spear13xx.o obj-$(CONFIG_PCI_KEYSTONE) += pci-keystone.o diff --git a/drivers/pci/controller/dwc/pcie-fu740.c b/drivers/pci/controller/dwc/pcie-fu740.c new file mode 100644 index 000000000000..00cde9a248b5 --- /dev/null +++ b/drivers/pci/controller/dwc/pcie-fu740.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FU740 DesignWare PCIe Controller integration + * Copyright (C) 2019-2021 SiFive, Inc. + * Paul Walmsley + * Greentime Hu + * + * Based in part on the i.MX6 PCIe host controller shim which is: + * + * Copyright (C) 2013 Kosagi + * https://www.kosagi.com + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pcie-designware.h" + +#define to_fu740_pcie(x) dev_get_drvdata((x)->dev) + +struct fu740_pcie { + struct dw_pcie pci; + void __iomem *mgmt_base; + struct gpio_desc *reset; + struct gpio_desc *pwren; + struct clk *pcie_aux; + struct reset_control *rst; +}; + +#define SIFIVE_DEVICESRESETREG 0x28 + +#define PCIEX8MGMT_PERST_N 0x0 +#define PCIEX8MGMT_APP_LTSSM_ENABLE 0x10 +#define PCIEX8MGMT_APP_HOLD_PHY_RST 0x18 +#define PCIEX8MGMT_DEVICE_TYPE 0x708 +#define PCIEX8MGMT_PHY0_CR_PARA_ADDR 0x860 +#define PCIEX8MGMT_PHY0_CR_PARA_RD_EN 0x870 +#define PCIEX8MGMT_PHY0_CR_PARA_RD_DATA 0x878 +#define PCIEX8MGMT_PHY0_CR_PARA_SEL 0x880 +#define PCIEX8MGMT_PHY0_CR_PARA_WR_DATA 0x888 +#define PCIEX8MGMT_PHY0_CR_PARA_WR_EN 0x890 +#define PCIEX8MGMT_PHY0_CR_PARA_ACK 0x898 +#define PCIEX8MGMT_PHY1_CR_PARA_ADDR 0x8a0 +#define PCIEX8MGMT_PHY1_CR_PARA_RD_EN 0x8b0 +#define PCIEX8MGMT_PHY1_CR_PARA_RD_DATA 0x8b8 +#define PCIEX8MGMT_PHY1_CR_PARA_SEL 0x8c0 +#define PCIEX8MGMT_PHY1_CR_PARA_WR_DATA 0x8c8 +#define PCIEX8MGMT_PHY1_CR_PARA_WR_EN 0x8d0 +#define PCIEX8MGMT_PHY1_CR_PARA_ACK 0x8d8 + +#define PCIEX8MGMT_PHY_CDR_TRACK_EN BIT(0) +#define PCIEX8MGMT_PHY_LOS_THRSHLD BIT(5) +#define PCIEX8MGMT_PHY_TERM_EN BIT(9) +#define PCIEX8MGMT_PHY_TERM_ACDC BIT(10) +#define PCIEX8MGMT_PHY_EN BIT(11) +#define PCIEX8MGMT_PHY_INIT_VAL (PCIEX8MGMT_PHY_CDR_TRACK_EN|\ + PCIEX8MGMT_PHY_LOS_THRSHLD|\ + PCIEX8MGMT_PHY_TERM_EN|\ + PCIEX8MGMT_PHY_TERM_ACDC|\ + PCIEX8MGMT_PHY_EN) + +#define PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 0x1008 +#define PCIEX8MGMT_PHY_LANE_OFF 0x100 +#define PCIEX8MGMT_PHY_LANE0_BASE (PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 + 0x100 * 0) +#define PCIEX8MGMT_PHY_LANE1_BASE (PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 + 0x100 * 1) +#define PCIEX8MGMT_PHY_LANE2_BASE (PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 + 0x100 * 2) +#define PCIEX8MGMT_PHY_LANE3_BASE (PCIEX8MGMT_PHY_LANEN_DIG_ASIC_RX_OVRD_IN_3 + 0x100 * 3) + +static void fu740_pcie_assert_reset(struct fu740_pcie *afp) +{ + /* Assert PERST_N GPIO */ + gpiod_set_value_cansleep(afp->reset, 0); + /* Assert controller PERST_N */ + writel_relaxed(0x0, afp->mgmt_base + PCIEX8MGMT_PERST_N); +} + +static void fu740_pcie_deassert_reset(struct fu740_pcie *afp) +{ + /* Deassert controller PERST_N */ + writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_PERST_N); + /* Deassert PERST_N GPIO */ + gpiod_set_value_cansleep(afp->reset, 1); +} + +static void fu740_pcie_power_on(struct fu740_pcie *afp) +{ + gpiod_set_value_cansleep(afp->pwren, 1); + /* + * Ensure that PERST has been asserted for at least 100 ms. + * Section 2.2 of PCI Express Card Electromechanical Specification + * Revision 3.0 + */ + msleep(100); +} + +static void fu740_pcie_drive_reset(struct fu740_pcie *afp) +{ + fu740_pcie_assert_reset(afp); + fu740_pcie_power_on(afp); + fu740_pcie_deassert_reset(afp); +} + +static void fu740_phyregwrite(const uint8_t phy, const uint16_t addr, + const uint16_t wrdata, struct fu740_pcie *afp) +{ + struct device *dev = afp->pci.dev; + void __iomem *phy_cr_para_addr; + void __iomem *phy_cr_para_wr_data; + void __iomem *phy_cr_para_wr_en; + void __iomem *phy_cr_para_ack; + int ret, val; + + /* Setup */ + if (phy) { + phy_cr_para_addr = afp->mgmt_base + PCIEX8MGMT_PHY1_CR_PARA_ADDR; + phy_cr_para_wr_data = afp->mgmt_base + PCIEX8MGMT_PHY1_CR_PARA_WR_DATA; + phy_cr_para_wr_en = afp->mgmt_base + PCIEX8MGMT_PHY1_CR_PARA_WR_EN; + phy_cr_para_ack = afp->mgmt_base + PCIEX8MGMT_PHY1_CR_PARA_ACK; + } else { + phy_cr_para_addr = afp->mgmt_base + PCIEX8MGMT_PHY0_CR_PARA_ADDR; + phy_cr_para_wr_data = afp->mgmt_base + PCIEX8MGMT_PHY0_CR_PARA_WR_DATA; + phy_cr_para_wr_en = afp->mgmt_base + PCIEX8MGMT_PHY0_CR_PARA_WR_EN; + phy_cr_para_ack = afp->mgmt_base + PCIEX8MGMT_PHY0_CR_PARA_ACK; + } + + writel_relaxed(addr, phy_cr_para_addr); + writel_relaxed(wrdata, phy_cr_para_wr_data); + writel_relaxed(1, phy_cr_para_wr_en); + + /* Wait for wait_idle */ + ret = readl_poll_timeout(phy_cr_para_ack, val, val, 10, 5000); + if (ret) + dev_warn(dev, "Wait for wait_idle state failed!\n"); + + /* Clear */ + writel_relaxed(0, phy_cr_para_wr_en); + + /* Wait for ~wait_idle */ + ret = readl_poll_timeout(phy_cr_para_ack, val, !val, 10, 5000); + if (ret) + dev_warn(dev, "Wait for !wait_idle state failed!\n"); +} + +static void fu740_pcie_init_phy(struct fu740_pcie *afp) +{ + /* Enable phy cr_para_sel interfaces */ + writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_PHY0_CR_PARA_SEL); + writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_PHY1_CR_PARA_SEL); + + /* + * Wait 10 cr_para cycles to guarantee that the registers are ready + * to be edited. + */ + ndelay(10); + + /* Set PHY AC termination mode */ + fu740_phyregwrite(0, PCIEX8MGMT_PHY_LANE0_BASE, PCIEX8MGMT_PHY_INIT_VAL, afp); + fu740_phyregwrite(0, PCIEX8MGMT_PHY_LANE1_BASE, PCIEX8MGMT_PHY_INIT_VAL, afp); + fu740_phyregwrite(0, PCIEX8MGMT_PHY_LANE2_BASE, PCIEX8MGMT_PHY_INIT_VAL, afp); + fu740_phyregwrite(0, PCIEX8MGMT_PHY_LANE3_BASE, PCIEX8MGMT_PHY_INIT_VAL, afp); + fu740_phyregwrite(1, PCIEX8MGMT_PHY_LANE0_BASE, PCIEX8MGMT_PHY_INIT_VAL, afp); + fu740_phyregwrite(1, PCIEX8MGMT_PHY_LANE1_BASE, PCIEX8MGMT_PHY_INIT_VAL, afp); + fu740_phyregwrite(1, PCIEX8MGMT_PHY_LANE2_BASE, PCIEX8MGMT_PHY_INIT_VAL, afp); + fu740_phyregwrite(1, PCIEX8MGMT_PHY_LANE3_BASE, PCIEX8MGMT_PHY_INIT_VAL, afp); +} + +static int fu740_pcie_start_link(struct dw_pcie *pci) +{ + struct device *dev = pci->dev; + struct fu740_pcie *afp = dev_get_drvdata(dev); + + /* Enable LTSSM */ + writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_APP_LTSSM_ENABLE); + return 0; +} + +static int fu740_pcie_host_init(struct pcie_port *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct fu740_pcie *afp = to_fu740_pcie(pci); + struct device *dev = pci->dev; + int ret; + + /* Power on reset */ + fu740_pcie_drive_reset(afp); + + /* Enable pcieauxclk */ + ret = clk_prepare_enable(afp->pcie_aux); + if (ret) { + dev_err(dev, "unable to enable pcie_aux clock\n"); + return ret; + } + + /* + * Assert hold_phy_rst (hold the controller LTSSM in reset after + * power_up_rst_n for register programming with cr_para) + */ + writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_APP_HOLD_PHY_RST); + + /* Deassert power_up_rst_n */ + ret = reset_control_deassert(afp->rst); + if (ret) { + dev_err(dev, "unable to deassert pcie_power_up_rst_n\n"); + return ret; + } + + fu740_pcie_init_phy(afp); + + /* Disable pcieauxclk */ + clk_disable_unprepare(afp->pcie_aux); + /* Clear hold_phy_rst */ + writel_relaxed(0x0, afp->mgmt_base + PCIEX8MGMT_APP_HOLD_PHY_RST); + /* Enable pcieauxclk */ + ret = clk_prepare_enable(afp->pcie_aux); + /* Set RC mode */ + writel_relaxed(0x4, afp->mgmt_base + PCIEX8MGMT_DEVICE_TYPE); + + return 0; +} + +static const struct dw_pcie_host_ops fu740_pcie_host_ops = { + .host_init = fu740_pcie_host_init, +}; + +static const struct dw_pcie_ops dw_pcie_ops = { + .start_link = fu740_pcie_start_link, +}; + +static int fu740_pcie_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct dw_pcie *pci; + struct fu740_pcie *afp; + + afp = devm_kzalloc(dev, sizeof(*afp), GFP_KERNEL); + if (!afp) + return -ENOMEM; + pci = &afp->pci; + pci->dev = dev; + pci->ops = &dw_pcie_ops; + pci->pp.ops = &fu740_pcie_host_ops; + + /* SiFive specific region: mgmt */ + afp->mgmt_base = devm_platform_ioremap_resource_byname(pdev, "mgmt"); + if (IS_ERR(afp->mgmt_base)) + return PTR_ERR(afp->mgmt_base); + + /* Fetch GPIOs */ + afp->reset = devm_gpiod_get_optional(dev, "reset-gpios", GPIOD_OUT_LOW); + if (IS_ERR(afp->reset)) + return dev_err_probe(dev, PTR_ERR(afp->reset), "unable to get reset-gpios\n"); + + afp->pwren = devm_gpiod_get_optional(dev, "pwren-gpios", GPIOD_OUT_LOW); + if (IS_ERR(afp->pwren)) + return dev_err_probe(dev, PTR_ERR(afp->pwren), "unable to get pwren-gpios\n"); + + /* Fetch clocks */ + afp->pcie_aux = devm_clk_get(dev, "pcie_aux"); + if (IS_ERR(afp->pcie_aux)) + return dev_err_probe(dev, PTR_ERR(afp->pcie_aux), + "pcie_aux clock source missing or invalid\n"); + + /* Fetch reset */ + afp->rst = devm_reset_control_get_exclusive(dev, NULL); + if (IS_ERR(afp->rst)) + return dev_err_probe(dev, PTR_ERR(afp->rst), "unable to get reset\n"); + + platform_set_drvdata(pdev, afp); + + return dw_pcie_host_init(&pci->pp); +} + +static void fu740_pcie_shutdown(struct platform_device *pdev) +{ + struct fu740_pcie *afp = platform_get_drvdata(pdev); + + /* Bring down link, so bootloader gets clean state in case of reboot */ + fu740_pcie_assert_reset(afp); +} + +static const struct of_device_id fu740_pcie_of_match[] = { + { .compatible = "sifive,fu740-pcie", }, + {}, +}; + +static struct platform_driver fu740_pcie_driver = { + .driver = { + .name = "fu740-pcie", + .of_match_table = fu740_pcie_of_match, + .suppress_bind_attrs = true, + }, + .probe = fu740_pcie_probe, + .shutdown = fu740_pcie_shutdown, +}; + +builtin_platform_driver(fu740_pcie_driver); From ae80d514808557018e44190fdbab23564a51e9ef Mon Sep 17 00:00:00 2001 From: Greentime Hu Date: Tue, 4 May 2021 18:59:40 +0800 Subject: [PATCH 0989/1379] riscv: dts: Add PCIe support for the SiFive FU740-C000 SoC Link: https://lore.kernel.org/r/20210504105940.100004-7-greentime.hu@sifive.com Signed-off-by: Greentime Hu Signed-off-by: Lorenzo Pieralisi Acked-by: Palmer Dabbelt --- arch/riscv/boot/dts/sifive/fu740-c000.dtsi | 33 ++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi index eeb4f8c3e0e7..8eef82e4199f 100644 --- a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi @@ -159,6 +159,7 @@ reg = <0x0 0x10000000 0x0 0x1000>; clocks = <&hfclk>, <&rtcclk>; #clock-cells = <1>; + #reset-cells = <1>; }; uart0: serial@10010000 { compatible = "sifive,fu740-c000-uart", "sifive,uart0"; @@ -289,5 +290,37 @@ clocks = <&prci PRCI_CLK_PCLK>; status = "disabled"; }; + pcie@e00000000 { + compatible = "sifive,fu740-pcie"; + #address-cells = <3>; + #size-cells = <2>; + #interrupt-cells = <1>; + reg = <0xe 0x00000000 0x0 0x80000000>, + <0xd 0xf0000000 0x0 0x10000000>, + <0x0 0x100d0000 0x0 0x1000>; + reg-names = "dbi", "config", "mgmt"; + device_type = "pci"; + dma-coherent; + bus-range = <0x0 0xff>; + ranges = <0x81000000 0x0 0x60080000 0x0 0x60080000 0x0 0x10000>, /* I/O */ + <0x82000000 0x0 0x60090000 0x0 0x60090000 0x0 0xff70000>, /* mem */ + <0x82000000 0x0 0x70000000 0x0 0x70000000 0x0 0x1000000>, /* mem */ + <0xc3000000 0x20 0x00000000 0x20 0x00000000 0x20 0x00000000>; /* mem prefetchable */ + num-lanes = <0x8>; + interrupts = <56>, <57>, <58>, <59>, <60>, <61>, <62>, <63>, <64>; + interrupt-names = "msi", "inta", "intb", "intc", "intd"; + interrupt-parent = <&plic0>; + interrupt-map-mask = <0x0 0x0 0x0 0x7>; + interrupt-map = <0x0 0x0 0x0 0x1 &plic0 57>, + <0x0 0x0 0x0 0x2 &plic0 58>, + <0x0 0x0 0x0 0x3 &plic0 59>, + <0x0 0x0 0x0 0x4 &plic0 60>; + clock-names = "pcie_aux"; + clocks = <&prci PRCI_CLK_PCIE_AUX>; + pwren-gpios = <&gpio 5 0>; + reset-gpios = <&gpio 8 0>; + resets = <&prci 4>; + status = "okay"; + }; }; }; From 6799e3f281e962628be531e8331bacd05b866134 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 4 May 2021 11:03:00 +0200 Subject: [PATCH 0990/1379] dt-bindings: net: renesas,etheravb: Fix optional second clock name If the optional "clock-names" property is present, but the optional TXC reference clock is not, "make dtbs_check" complains: ethernet@e6800000: clock-names: ['fck'] is too short Fix this by declaring that a single clock name is valid. While at it, drop the superfluous upper limit on the number of clocks, as it is implied by the list of descriptions. Fixes: 6f43735b6da64bd4 ("dt-bindings: net: renesas,etheravb: Add additional clocks") Signed-off-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/b3d91c9f70a15792ad19c87e4ea35fc876600fae.1620118901.git.geert+renesas@glider.be Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/net/renesas,etheravb.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/renesas,etheravb.yaml b/Documentation/devicetree/bindings/net/renesas,etheravb.yaml index fe72a5598add..005868f703a6 100644 --- a/Documentation/devicetree/bindings/net/renesas,etheravb.yaml +++ b/Documentation/devicetree/bindings/net/renesas,etheravb.yaml @@ -51,12 +51,12 @@ properties: clocks: minItems: 1 - maxItems: 2 items: - description: AVB functional clock - description: Optional TXC reference clock clock-names: + minItems: 1 items: - const: fck - const: refclk From 3f1c6f2122fc780560f09735b6d1dbf39b44eb0f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 3 May 2021 17:09:01 +0200 Subject: [PATCH 0991/1379] libceph: allow addrvecs with a single NONE/blank address Normally, an unused OSD id/slot is represented by an empty addrvec. However, it also appears to be possible to generate an osdmap where an unused OSD id/slot has an addrvec with a single blank address of type NONE. Allow such addrvecs and make the end result be exactly the same as for the empty addrvec case -- leave addr intact. Cc: stable@vger.kernel.org # 5.11+ Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- net/ceph/decode.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/net/ceph/decode.c b/net/ceph/decode.c index b44f7651be04..bc109a1a4616 100644 --- a/net/ceph/decode.c +++ b/net/ceph/decode.c @@ -4,6 +4,7 @@ #include #include +#include /* for ceph_pr_addr() */ static int ceph_decode_entity_addr_versioned(void **p, void *end, @@ -110,6 +111,7 @@ int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, } ceph_decode_32_safe(p, end, addr_cnt, e_inval); + dout("%s addr_cnt %d\n", __func__, addr_cnt); found = false; for (i = 0; i < addr_cnt; i++) { @@ -117,6 +119,7 @@ int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, if (ret) return ret; + dout("%s i %d addr %s\n", __func__, i, ceph_pr_addr(&tmp_addr)); if (tmp_addr.type == my_type) { if (found) { pr_err("another match of type %d in addrvec\n", @@ -128,13 +131,18 @@ int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2, found = true; } } - if (!found && addr_cnt != 0) { - pr_err("no match of type %d in addrvec\n", - le32_to_cpu(my_type)); - return -ENOENT; - } - return 0; + if (found) + return 0; + + if (!addr_cnt) + return 0; /* normal -- e.g. unused OSD id/slot */ + + if (addr_cnt == 1 && !memchr_inv(&tmp_addr, 0, sizeof(tmp_addr))) + return 0; /* weird but effectively the same as !addr_cnt */ + + pr_err("no match of type %d in addrvec\n", le32_to_cpu(my_type)); + return -ENOENT; e_inval: return -EINVAL; From b9d79e4ca4ff23543d6b33c736ba07c1f0a9dcb1 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 4 May 2021 07:29:10 -0700 Subject: [PATCH 0992/1379] fbmem: Mark proc_fb_seq_ops as __maybe_unused With CONFIG_PROC_FS=n and -Werror, 0-day reports: drivers/video/fbdev/core/fbmem.c:736:36: error: 'proc_fb_seq_ops' defined but not used Mark it as __maybe_unused. Reported-by: kernel test robot Signed-off-by: Guenter Roeck Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20210504142910.2084722-1-linux@roeck-us.net --- drivers/video/fbdev/core/fbmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 372b52a2befa..52c606c0f8a2 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -733,7 +733,7 @@ static int fb_seq_show(struct seq_file *m, void *v) return 0; } -static const struct seq_operations proc_fb_seq_ops = { +static const struct __maybe_unused seq_operations proc_fb_seq_ops = { .start = fb_seq_start, .next = fb_seq_next, .stop = fb_seq_stop, From 8e9800f9f2b89e1efe2a5993361fae4d618a6c26 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 29 Apr 2021 14:39:33 -0700 Subject: [PATCH 0993/1379] xfs: don't allow log writes if the data device is readonly While running generic/050 with an external log, I observed this warning in dmesg: Trying to write to read-only block-device sda4 (partno 4) WARNING: CPU: 2 PID: 215677 at block/blk-core.c:704 submit_bio_checks+0x256/0x510 Call Trace: submit_bio_noacct+0x2c/0x430 _xfs_buf_ioapply+0x283/0x3c0 [xfs] __xfs_buf_submit+0x6a/0x210 [xfs] xfs_buf_delwri_submit_buffers+0xf8/0x270 [xfs] xfsaild+0x2db/0xc50 [xfs] kthread+0x14b/0x170 I think this happened because we tried to cover the log after a readonly mount, and the AIL tried to write the primary superblock to the data device. The test marks the data device readonly, but it doesn't do the same to the external log device. Therefore, XFS thinks that the log is writable, even though AIL writes whine to dmesg because the data device is read only. Fix this by amending xfs_log_writable to prevent writes when the AIL can't possible write anything into the filesystem. Note: As for the external log or the rt devices being readonly-- xfs_blkdev_get will complain about that if we aren't doing a norecovery mount. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_log.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 06041834daa3..c19a82adea1e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -355,13 +355,15 @@ xfs_log_writable( struct xfs_mount *mp) { /* - * Never write to the log on norecovery mounts, if the block device is - * read-only, or if the filesystem is shutdown. Read-only mounts still - * allow internal writes for log recovery and unmount purposes, so don't - * restrict that case here. + * Do not write to the log on norecovery mounts, if the data or log + * devices are read-only, or if the filesystem is shutdown. Read-only + * mounts allow internal writes for log recovery and unmount purposes, + * so don't restrict that case. */ if (mp->m_flags & XFS_MOUNT_NORECOVERY) return false; + if (xfs_readonly_buftarg(mp->m_ddev_targp)) + return false; if (xfs_readonly_buftarg(mp->m_log->l_targ)) return false; if (XFS_FORCED_SHUTDOWN(mp)) From 6e552494fb90acae005d74ce6a2ee102d965184b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 4 May 2021 08:54:29 -0700 Subject: [PATCH 0994/1379] iomap: remove unused private field from ioend The only remaining user of ->io_private is the generic ioend merging infrastructure. The only user of that is XFS, which no longer sets ->io_private or passes an associated merge callback. Remove the unused parameter and the ->io_private field. CC: linux-fsdevel@vger.kernel.org Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap/buffered-io.c | 7 +------ fs/xfs/xfs_aops.c | 2 +- include/linux/iomap.h | 5 +---- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 0129e6bab985..f2cd2034a87b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1134,9 +1134,7 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) } void -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends, - void (*merge_private)(struct iomap_ioend *ioend, - struct iomap_ioend *next)) +iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) { struct iomap_ioend *next; @@ -1148,8 +1146,6 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends, break; list_move_tail(&next->io_list, &ioend->io_list); ioend->io_size += next->io_size; - if (next->io_private && merge_private) - merge_private(ioend, next); } } EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); @@ -1236,7 +1232,6 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; - ioend->io_private = NULL; ioend->io_bio = bio; return ioend; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9b08db45ce85..826caa6b4a5a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -146,7 +146,7 @@ xfs_end_io( while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, io_list))) { list_del_init(&ioend->io_list); - iomap_ioend_try_merge(ioend, &tmp, NULL); + iomap_ioend_try_merge(ioend, &tmp); xfs_end_ioend(ioend); } } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index d202fd2d0f91..c87d0cb0de6d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -198,7 +198,6 @@ struct iomap_ioend { struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ - void *io_private; /* file system private data */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! */ }; @@ -234,9 +233,7 @@ struct iomap_writepage_ctx { void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, - struct list_head *more_ioends, - void (*merge_private)(struct iomap_ioend *ioend, - struct iomap_ioend *next)); + struct list_head *more_ioends); void iomap_sort_ioends(struct list_head *ioend_list); int iomap_writepage(struct page *page, struct writeback_control *wbc, struct iomap_writepage_ctx *wpc, From 698f99ed5e06946764c3be035ce9d62a2691e08c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 29 Apr 2021 12:53:27 +0300 Subject: [PATCH 0995/1379] vfio/mdev: remove unnecessary NULL check in mbochs_create() Originally "type" could be NULL and these checks were required, but we recently changed how "type" is assigned and that's no longer the case. Now "type" points to an element in the middle of a non-NULL array. Removing the checks does not affect runtime at all, but it makes the code a little bit simpler to read. Fixes: 3d3a360e570616 ("vfio/mbochs: Use mdev_get_type_group_id()") Signed-off-by: Dan Carpenter Reviewed-by: Jason Gunthorpe Message-Id: <20210429095327.GY1981@kadam> Signed-off-by: Alex Williamson --- samples/vfio-mdev/mbochs.c | 2 -- samples/vfio-mdev/mdpy.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 861c76914e76..881ef9a7296f 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -513,8 +513,6 @@ static int mbochs_create(struct mdev_device *mdev) struct device *dev = mdev_dev(mdev); struct mdev_state *mdev_state; - if (!type) - type = &mbochs_types[0]; if (type->mbytes + mbochs_used_mbytes > max_mbytes) return -ENOMEM; diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index f0c0e7209719..e889c1cf8fd1 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -667,8 +667,7 @@ static ssize_t description_show(struct mdev_type *mtype, &mdpy_types[mtype_get_type_group_id(mtype)]; return sprintf(buf, "virtual display, %dx%d framebuffer\n", - type ? type->width : 0, - type ? type->height : 0); + type->width, type->height); } static MDEV_TYPE_ATTR_RO(description); From 5c1acf3fe05ce443edba5e2110c9e581765f66a8 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 3 May 2021 11:55:26 -0300 Subject: [PATCH 0996/1379] cifs: fix regression when mounting shares with prefix paths The commit 315db9a05b7a ("cifs: fix leak in cifs_smb3_do_mount() ctx") revealed an existing bug when mounting shares that contain a prefix path or DFS links. cifs_setup_volume_info() requires the @devname to contain the full path (UNC + prefix) to update the fs context with the new UNC and prepath values, however we were passing only the UNC path (old_ctx->UNC) in @device thus discarding any prefix paths. Instead of concatenating both old_ctx->{UNC,prepath} and pass it in @devname, just keep the dup'ed values of UNC and prepath in cifs_sb->ctx after calling smb3_fs_context_dup(), and fix smb3_parse_devname() to correctly parse and not leak the new UNC and prefix paths. Cc: # v5.11+ Fixes: 315db9a05b7a ("cifs: fix leak in cifs_smb3_do_mount() ctx") Signed-off-by: Paulo Alcantara (SUSE) Acked-by: David Disseldorp Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 8 +------- fs/cifs/connect.c | 24 ++++++++++++++++++------ fs/cifs/fs_context.c | 4 ++++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8a6894577697..d7ea9c5fe0f8 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -863,13 +863,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, goto out; } - /* cifs_setup_volume_info->smb3_parse_devname() redups UNC & prepath */ - kfree(cifs_sb->ctx->UNC); - cifs_sb->ctx->UNC = NULL; - kfree(cifs_sb->ctx->prepath); - cifs_sb->ctx->prepath = NULL; - - rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC); + rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, NULL); if (rc) { root = ERR_PTR(rc); goto out; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d7efbcb2d5df..495c395f9def 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3149,17 +3149,29 @@ out: int cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname) { - int rc = 0; + int rc; - smb3_parse_devname(devname, ctx); + if (devname) { + cifs_dbg(FYI, "%s: devname=%s\n", __func__, devname); + rc = smb3_parse_devname(devname, ctx); + if (rc) { + cifs_dbg(VFS, "%s: failed to parse %s: %d\n", __func__, devname, rc); + return rc; + } + } if (mntopts) { char *ip; - cifs_dbg(FYI, "%s: mntopts=%s\n", __func__, mntopts); rc = smb3_parse_opt(mntopts, "ip", &ip); - if (!rc && !cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, - strlen(ip))) { + if (rc) { + cifs_dbg(VFS, "%s: failed to parse ip options: %d\n", __func__, rc); + return rc; + } + + rc = cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, strlen(ip)); + kfree(ip); + if (!rc) { cifs_dbg(VFS, "%s: failed to convert ip address\n", __func__); return -EINVAL; } @@ -3179,7 +3191,7 @@ cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const c return -EINVAL; } - return rc; + return 0; } static int diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 1d6e0e15b034..3bcf881c3ae9 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -476,6 +476,7 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) /* move "pos" up to delimiter or NULL */ pos += len; + kfree(ctx->UNC); ctx->UNC = kstrndup(devname, pos - devname, GFP_KERNEL); if (!ctx->UNC) return -ENOMEM; @@ -486,6 +487,9 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) if (*pos == '/' || *pos == '\\') pos++; + kfree(ctx->prepath); + ctx->prepath = NULL; + /* If pos is NULL then no prepath */ if (!*pos) return 0; From 78c09634f7dc061a3bd09704cdbebb3762a45cdf Mon Sep 17 00:00:00 2001 From: Rohith Surabattula Date: Mon, 19 Apr 2021 19:02:03 +0000 Subject: [PATCH 0997/1379] Cifs: Fix kernel oops caused by deferred close for files. Fix regression issue caused by deferred close for files. Signed-off-by: Rohith Surabattula Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 ++ fs/cifs/file.c | 16 ++++++++++++---- fs/cifs/inode.c | 3 ++- fs/cifs/misc.c | 17 +++++++++++++++++ 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c6dacce87d3a..3c6b97ef39d3 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -278,6 +278,8 @@ extern void cifs_del_deferred_close(struct cifsFileInfo *cfile); extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); +extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon); + extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb3_fs_context *ctx); extern void cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c95893351b6c..919c82d4713d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -878,6 +878,10 @@ void smb2_deferred_work_close(struct work_struct *work) struct cifsFileInfo, deferred.work); spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + if (!cfile->deferred_scheduled) { + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + return; + } cifs_del_deferred_close(cfile); cfile->deferred_scheduled = false; spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); @@ -1987,8 +1991,10 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, if (total_written > 0) { spin_lock(&d_inode(dentry)->i_lock); - if (*offset > d_inode(dentry)->i_size) + if (*offset > d_inode(dentry)->i_size) { i_size_write(d_inode(dentry), *offset); + d_inode(dentry)->i_blocks = (512 - 1 + *offset) >> 9; + } spin_unlock(&d_inode(dentry)->i_lock); } mark_inode_dirty_sync(d_inode(dentry)); @@ -2647,8 +2653,10 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, if (rc > 0) { spin_lock(&inode->i_lock); - if (pos > inode->i_size) + if (pos > inode->i_size) { i_size_write(inode, pos); + inode->i_blocks = (512 - 1 + pos) >> 9; + } spin_unlock(&inode->i_lock); } @@ -4864,7 +4872,6 @@ oplock_break_ack: cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } - _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); /* * When oplock break is received and there are no active * file handles but cached, then set the flag oplock_break_received. @@ -4872,11 +4879,12 @@ oplock_break_ack: */ spin_lock(&CIFS_I(inode)->deferred_lock); is_deferred = cifs_is_deferred_close(cfile, &dclose); - if (is_deferred) { + if (is_deferred && cfile->deferred_scheduled) { cfile->oplock_break_received = true; mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); } spin_unlock(&CIFS_I(inode)->deferred_lock); + _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 728ff45b6667..591f18e3e933 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1647,7 +1647,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto unlink_out; } - cifs_close_deferred_file(CIFS_I(inode)); + cifs_close_all_deferred_files(tcon); if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = CIFSPOSIXDelFile(xid, tcon, full_path, @@ -2125,6 +2125,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, goto cifs_rename_exit; } + cifs_close_all_deferred_files(tcon); rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index e63fbd4a6bfe..524dbdfb7184 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -734,6 +734,23 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) } } +void +cifs_close_all_deferred_files(struct cifs_tcon *tcon) +{ + struct cifsFileInfo *cfile; + struct cifsInodeInfo *cinode; + struct list_head *tmp; + + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + cfile = list_entry(tmp, struct cifsFileInfo, tlist); + cinode = CIFS_I(d_inode(cfile->dentry)); + if (delayed_work_pending(&cfile->deferred)) + mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); + } + spin_unlock(&tcon->open_file_lock); +} + /* parses DFS refferal V3 structure * caller is responsible for freeing target_nodes * returns: From bae4c0c1c2d576d32e37925ef972a5d45f34e36d Mon Sep 17 00:00:00 2001 From: Khaled ROMDHANI Date: Tue, 4 May 2021 16:38:55 +0100 Subject: [PATCH 0998/1379] fs/cifs: Fix resource leak The -EIO error return path is leaking memory allocated to page. Fix this by moving the allocation block after the check of cifs_forced_shutdown. Addresses-Coverity: ("Resource leak") Fixes: 087f757b0129 ("cifs: add shutdown support") Signed-off-by: Khaled ROMDHANI Reviewed-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/link.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 1cbe7ec73728..970fcf2adb08 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -683,12 +683,16 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, struct tcon_link *tlink; struct cifs_tcon *pTcon; const char *full_path; - void *page = alloc_dentry_path(); + void *page; struct inode *newinode = NULL; if (unlikely(cifs_forced_shutdown(cifs_sb))) return -EIO; + page = alloc_dentry_path(); + if (!page) + return -ENOMEM; + xid = get_xid(); tlink = cifs_sb_tlink(cifs_sb); From b208108638c4bd3215792415944467c36f5dfd97 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 May 2021 14:12:44 +0200 Subject: [PATCH 0999/1379] s390: fix detection of vector enhancements facility 1 vs. vector packed decimal facility The PoP documents: 134: The vector packed decimal facility is installed in the z/Architecture architectural mode. When bit 134 is one, bit 129 is also one. 135: The vector enhancements facility 1 is installed in the z/Architecture architectural mode. When bit 135 is one, bit 129 is also one. Looks like we confuse the vector enhancements facility 1 ("EXT") with the Vector packed decimal facility ("BCD"). Let's fix the facility checks. Detected while working on QEMU/tcg z14 support and only unlocking the vector enhancements facility 1, but not the vector packed decimal facility. Fixes: 2583b848cad0 ("s390: report new vector facilities") Cc: Vasily Gorbik Signed-off-by: David Hildenbrand Reviewed-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20210503121244.25232-1-david@redhat.com Signed-off-by: Heiko Carstens --- arch/s390/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 72134f9f6ff5..5aab59ad5688 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -937,9 +937,9 @@ static int __init setup_hwcaps(void) if (MACHINE_HAS_VX) { elf_hwcap |= HWCAP_S390_VXRS; if (test_facility(134)) - elf_hwcap |= HWCAP_S390_VXRS_EXT; - if (test_facility(135)) elf_hwcap |= HWCAP_S390_VXRS_BCD; + if (test_facility(135)) + elf_hwcap |= HWCAP_S390_VXRS_EXT; if (test_facility(148)) elf_hwcap |= HWCAP_S390_VXRS_EXT2; if (test_facility(152)) From 8d432592f30fcc34ef5a10aac4887b4897884493 Mon Sep 17 00:00:00 2001 From: Jonathon Reinhart Date: Sat, 1 May 2021 04:28:22 -0400 Subject: [PATCH 1000/1379] net: Only allow init netns to set default tcp cong to a restricted algo tcp_set_default_congestion_control() is netns-safe in that it writes to &net->ipv4.tcp_congestion_control, but it also sets ca->flags |= TCP_CONG_NON_RESTRICTED which is not namespaced. This has the unintended side-effect of changing the global net.ipv4.tcp_allowed_congestion_control sysctl, despite the fact that it is read-only: 97684f0970f6 ("net: Make tcp_allowed_congestion_control readonly in non-init netns") Resolve this netns "leak" by only allowing the init netns to set the default algorithm to one that is restricted. This restriction could be removed if tcp_allowed_congestion_control were namespace-ified in the future. This bug was uncovered with https://github.com/JonathonReinhart/linux-netns-sysctl-verify Fixes: 6670e1524477 ("tcp: Namespace-ify sysctl_tcp_default_congestion_control") Signed-off-by: Jonathon Reinhart Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 563d016e7478..db5831e6c136 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -230,6 +230,10 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) ret = -ENOENT; } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; + } else if (!net_eq(net, &init_net) && + !(ca->flags & TCP_CONG_NON_RESTRICTED)) { + /* Only init netns can set default to a restricted algorithm */ + ret = -EPERM; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) From c61760e6940dd4039a7f5e84a6afc9cdbf4d82b6 Mon Sep 17 00:00:00 2001 From: Or Cohen Date: Tue, 4 May 2021 10:16:46 +0300 Subject: [PATCH 1001/1379] net/nfc: fix use-after-free llcp_sock_bind/connect Commits 8a4cd82d ("nfc: fix refcount leak in llcp_sock_connect()") and c33b1cc62 ("nfc: fix refcount leak in llcp_sock_bind()") fixed a refcount leak bug in bind/connect but introduced a use-after-free if the same local is assigned to 2 different sockets. This can be triggered by the following simple program: int sock1 = socket( AF_NFC, SOCK_STREAM, NFC_SOCKPROTO_LLCP ); int sock2 = socket( AF_NFC, SOCK_STREAM, NFC_SOCKPROTO_LLCP ); memset( &addr, 0, sizeof(struct sockaddr_nfc_llcp) ); addr.sa_family = AF_NFC; addr.nfc_protocol = NFC_PROTO_NFC_DEP; bind( sock1, (struct sockaddr*) &addr, sizeof(struct sockaddr_nfc_llcp) ) bind( sock2, (struct sockaddr*) &addr, sizeof(struct sockaddr_nfc_llcp) ) close(sock1); close(sock2); Fix this by assigning NULL to llcp_sock->local after calling nfc_llcp_local_put. This addresses CVE-2021-23134. Reported-by: Or Cohen Reported-by: Nadav Markus Fixes: c33b1cc62 ("nfc: fix refcount leak in llcp_sock_bind()") Signed-off-by: Or Cohen Signed-off-by: David S. Miller --- net/nfc/llcp_sock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index a3b46f888803..53dbe733f998 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -109,12 +109,14 @@ static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) GFP_KERNEL); if (!llcp_sock->service_name) { nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; ret = -ENOMEM; goto put_dev; } llcp_sock->ssap = nfc_llcp_get_sdp_ssap(local, llcp_sock); if (llcp_sock->ssap == LLCP_SAP_MAX) { nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; kfree(llcp_sock->service_name); llcp_sock->service_name = NULL; ret = -EADDRINUSE; @@ -709,6 +711,7 @@ static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, llcp_sock->ssap = nfc_llcp_get_local_ssap(local); if (llcp_sock->ssap == LLCP_SAP_MAX) { nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; ret = -ENOMEM; goto put_dev; } @@ -756,6 +759,7 @@ sock_unlink: sock_llcp_release: nfc_llcp_put_ssap(local, llcp_sock->ssap); nfc_llcp_local_put(llcp_sock->local); + llcp_sock->local = NULL; put_dev: nfc_put_device(dev); From 4c7a94286ef7ac7301d633f17519fb1bb89d7550 Mon Sep 17 00:00:00 2001 From: Ramesh Babu B Date: Tue, 4 May 2021 21:12:41 +0530 Subject: [PATCH 1002/1379] net: stmmac: Clear receive all(RA) bit when promiscuous mode is off In promiscuous mode Receive All bit is set in GMAC packet filter register, but outside promiscuous mode Receive All bit is not cleared, which resulted in all network packets are received when toggle (ON/OFF) the promiscuous mode. Fixes: e0f9956a3862 ("net: stmmac: Add option for VLAN filter fail queue enable") Signed-off-by: Ramesh Babu B Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index 95864f014ffa..f35c03c9f91e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -642,6 +642,7 @@ static void dwmac4_set_filter(struct mac_device_info *hw, value &= ~GMAC_PACKET_FILTER_PCF; value &= ~GMAC_PACKET_FILTER_PM; value &= ~GMAC_PACKET_FILTER_PR; + value &= ~GMAC_PACKET_FILTER_RA; if (dev->flags & IFF_PROMISC) { /* VLAN Tag Filter Fail Packets Queuing */ if (hw->vlan_fail_q_en) { From c83c4e1912446db697a120eb30126cd80cbf6349 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 28 Apr 2021 12:00:20 +0800 Subject: [PATCH 1003/1379] drm/amdgpu: add new MC firmware for Polaris12 32bit ASIC Polaris12 32bit ASIC needs a special MC firmware. Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index c1bd190841f8..e4f27b3f28fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -59,6 +59,7 @@ MODULE_FIRMWARE("amdgpu/tonga_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris11_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris10_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris12_mc.bin"); +MODULE_FIRMWARE("amdgpu/polaris12_32_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris11_k_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris10_k_mc.bin"); MODULE_FIRMWARE("amdgpu/polaris12_k_mc.bin"); @@ -243,10 +244,16 @@ static int gmc_v8_0_init_microcode(struct amdgpu_device *adev) chip_name = "polaris10"; break; case CHIP_POLARIS12: - if (ASICID_IS_P23(adev->pdev->device, adev->pdev->revision)) + if (ASICID_IS_P23(adev->pdev->device, adev->pdev->revision)) { chip_name = "polaris12_k"; - else - chip_name = "polaris12"; + } else { + WREG32(mmMC_SEQ_IO_DEBUG_INDEX, ixMC_IO_DEBUG_UP_159); + /* Polaris12 32bit ASIC needs a special MC firmware */ + if (RREG32(mmMC_SEQ_IO_DEBUG_DATA) == 0x05b4dc40) + chip_name = "polaris12_32"; + else + chip_name = "polaris12"; + } break; case CHIP_FIJI: case CHIP_CARRIZO: From 16e9b3e58bc3fce7391539e0eb3fd167cbf9951f Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Tue, 13 Apr 2021 20:06:04 -0400 Subject: [PATCH 1004/1379] drm/amd/display: Fix two cursor duplication when using overlay Our driver supports overlay planes, and as expected, some userspace compositor takes advantage of these features. If the userspace is not enabling the cursor, they can use multiple planes as they please. Nevertheless, we start to have constraints when userspace tries to enable hardware cursor with various planes. Basically, we cannot draw the cursor at the same size and position on two separated pipes since it uses extra bandwidth and DML only run with one cursor. For those reasons, when we enable hardware cursor and multiple planes, our driver should accept variations like the ones described below: +-------------+ +--------------+ | +---------+ | | | | |Primary | | | Primary | | | | | | Overlay | | +---------+ | | | |Overlay | | | +-------------+ +--------------+ In this scenario, we can have the desktop UI in the overlay and some other framebuffer attached to the primary plane (e.g., video). However, userspace needs to obey some rules and avoid scenarios like the ones described below (when enabling hw cursor): +--------+ |Overlay | +-------------+ +-----+-------+ +-| |--+ | +--------+ | +--------+ | | +--------+ | | |Overlay | | |Overlay | | | | | | | | | | | | | | +--------+ | +--------+ | | | | Primary | | Primary | | Primary | +-------------+ +-------------+ +-------------+ +-------------+ +-------------+ | +--------+ | Primary | | |Overlay | | | | | | | | | +--------+ | +--------+ | | Primary | | |Overlay | | +-------------+ +-| |--+ +--------+ If the userspace violates some of the above scenarios, our driver needs to reject the commit; otherwise, we can have unexpected behavior. Since we don't have a proper driver validation for the above case, we can see some problems like a duplicate cursor in applications that use multiple planes. This commit fixes the cursor issue and others by adding adequate verification for multiple planes. Change since V1 (Harry and Sean): - Remove cursor verification from the equation. Cc: Louis Li Cc: Nicholas Kazlauskas Cc: Harry Wentland Cc: Hersen Wu Cc: Sean Paul Signed-off-by: Rodrigo Siqueira Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index deec7e0466da..1b71e0ca1a89 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -9870,6 +9870,53 @@ static int add_affected_mst_dsc_crtcs(struct drm_atomic_state *state, struct drm } #endif +static int validate_overlay(struct drm_atomic_state *state) +{ + int i; + struct drm_plane *plane; + struct drm_plane_state *old_plane_state, *new_plane_state; + struct drm_plane_state *primary_state, *overlay_state = NULL; + + /* Check if primary plane is contained inside overlay */ + for_each_oldnew_plane_in_state_reverse(state, plane, old_plane_state, new_plane_state, i) { + if (plane->type == DRM_PLANE_TYPE_OVERLAY) { + if (drm_atomic_plane_disabling(plane->state, new_plane_state)) + return 0; + + overlay_state = new_plane_state; + continue; + } + } + + /* check if we're making changes to the overlay plane */ + if (!overlay_state) + return 0; + + /* check if overlay plane is enabled */ + if (!overlay_state->crtc) + return 0; + + /* find the primary plane for the CRTC that the overlay is enabled on */ + primary_state = drm_atomic_get_plane_state(state, overlay_state->crtc->primary); + if (IS_ERR(primary_state)) + return PTR_ERR(primary_state); + + /* check if primary plane is enabled */ + if (!primary_state->crtc) + return 0; + + /* Perform the bounds check to ensure the overlay plane covers the primary */ + if (primary_state->crtc_x < overlay_state->crtc_x || + primary_state->crtc_y < overlay_state->crtc_y || + primary_state->crtc_x + primary_state->crtc_w > overlay_state->crtc_x + overlay_state->crtc_w || + primary_state->crtc_y + primary_state->crtc_h > overlay_state->crtc_y + overlay_state->crtc_h) { + DRM_DEBUG_ATOMIC("Overlay plane is enabled with hardware cursor but does not fully cover primary plane\n"); + return -EINVAL; + } + + return 0; +} + /** * amdgpu_dm_atomic_check() - Atomic check implementation for AMDgpu DM. * @dev: The DRM device @@ -10044,6 +10091,10 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev, goto fail; } + ret = validate_overlay(state); + if (ret) + goto fail; + /* Add new/modified planes */ for_each_oldnew_plane_in_state_reverse(state, plane, old_plane_state, new_plane_state, i) { ret = dm_update_plane_state(dc, state, plane, From 5bbf219328849e83878bddb7c226d8d42e84affc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 2 May 2021 22:06:07 -0700 Subject: [PATCH 1005/1379] drm/radeon: Fix off-by-one power_state index heap overwrite An out of bounds write happens when setting the default power state. KASAN sees this as: [drm] radeon: 512M of GTT memory ready. [drm] GART: num cpu pages 131072, num gpu pages 131072 ================================================================== BUG: KASAN: slab-out-of-bounds in radeon_atombios_parse_power_table_1_3+0x1837/0x1998 [radeon] Write of size 4 at addr ffff88810178d858 by task systemd-udevd/157 CPU: 0 PID: 157 Comm: systemd-udevd Not tainted 5.12.0-E620 #50 Hardware name: eMachines eMachines E620 /Nile , BIOS V1.03 09/30/2008 Call Trace: dump_stack+0xa5/0xe6 print_address_description.constprop.0+0x18/0x239 kasan_report+0x170/0x1a8 radeon_atombios_parse_power_table_1_3+0x1837/0x1998 [radeon] radeon_atombios_get_power_modes+0x144/0x1888 [radeon] radeon_pm_init+0x1019/0x1904 [radeon] rs690_init+0x76e/0x84a [radeon] radeon_device_init+0x1c1a/0x21e5 [radeon] radeon_driver_load_kms+0xf5/0x30b [radeon] drm_dev_register+0x255/0x4a0 [drm] radeon_pci_probe+0x246/0x2f6 [radeon] pci_device_probe+0x1aa/0x294 really_probe+0x30e/0x850 driver_probe_device+0xe6/0x135 device_driver_attach+0xc1/0xf8 __driver_attach+0x13f/0x146 bus_for_each_dev+0xfa/0x146 bus_add_driver+0x2b3/0x447 driver_register+0x242/0x2c1 do_one_initcall+0x149/0x2fd do_init_module+0x1ae/0x573 load_module+0x4dee/0x5cca __do_sys_finit_module+0xf1/0x140 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae Without KASAN, this will manifest later when the kernel attempts to allocate memory that was stomped, since it collides with the inline slab freelist pointer: invalid opcode: 0000 [#1] SMP NOPTI CPU: 0 PID: 781 Comm: openrc-run.sh Tainted: G W 5.10.12-gentoo-E620 #2 Hardware name: eMachines eMachines E620 /Nile , BIOS V1.03 09/30/2008 RIP: 0010:kfree+0x115/0x230 Code: 89 c5 e8 75 ea ff ff 48 8b 00 0f ba e0 09 72 63 e8 1f f4 ff ff 41 89 c4 48 8b 45 00 0f ba e0 10 72 0a 48 8b 45 08 a8 01 75 02 <0f> 0b 44 89 e1 48 c7 c2 00 f0 ff ff be 06 00 00 00 48 d3 e2 48 c7 RSP: 0018:ffffb42f40267e10 EFLAGS: 00010246 RAX: ffffd61280ee8d88 RBX: 0000000000000004 RCX: 000000008010000d RDX: 4000000000000000 RSI: ffffffffba1360b0 RDI: ffffd61280ee8d80 RBP: ffffd61280ee8d80 R08: ffffffffb91bebdf R09: 0000000000000000 R10: ffff8fe2c1047ac8 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000100 FS: 00007fe80eff6b68(0000) GS:ffff8fe339c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe80eec7bc0 CR3: 0000000038012000 CR4: 00000000000006f0 Call Trace: __free_fdtable+0x16/0x1f put_files_struct+0x81/0x9b do_exit+0x433/0x94d do_group_exit+0xa6/0xa6 __x64_sys_exit_group+0xf/0xf do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7fe80ef64bea Code: Unable to access opcode bytes at RIP 0x7fe80ef64bc0. RSP: 002b:00007ffdb1c47528 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007fe80ef64bea RDX: 00007fe80ef64f60 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 00007fe80ee2c620 R11: 0000000000000246 R12: 00007fe80eff41e0 R13: 00000000ffffffff R14: 0000000000000024 R15: 00007fe80edf9cd0 Modules linked in: radeon(+) ath5k(+) snd_hda_codec_realtek ... Use a valid power_state index when initializing the "flags" and "misc" and "misc2" fields. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=211537 Reported-by: Erhard F. Fixes: a48b9b4edb8b ("drm/radeon/kms/pm: add asic specific callbacks for getting power state (v2)") Fixes: 79daedc94281 ("drm/radeon/kms: minor pm cleanups") Signed-off-by: Kees Cook Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_atombios.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index 42301b4e56f5..f9f4efa1738c 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -2250,10 +2250,10 @@ static int radeon_atombios_parse_power_table_1_3(struct radeon_device *rdev) rdev->pm.default_power_state_index = state_index - 1; rdev->pm.power_state[state_index - 1].default_clock_mode = &rdev->pm.power_state[state_index - 1].clock_info[0]; - rdev->pm.power_state[state_index].flags &= + rdev->pm.power_state[state_index - 1].flags &= ~RADEON_PM_STATE_SINGLE_DISPLAY_ONLY; - rdev->pm.power_state[state_index].misc = 0; - rdev->pm.power_state[state_index].misc2 = 0; + rdev->pm.power_state[state_index - 1].misc = 0; + rdev->pm.power_state[state_index - 1].misc2 = 0; } return state_index; } From c69f27137a38d24301a6b659454a91ad85dff4aa Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 2 May 2021 22:06:08 -0700 Subject: [PATCH 1006/1379] drm/radeon: Avoid power table parsing memory leaks Avoid leaving a hanging pre-allocated clock_info if last mode is invalid, and avoid heap corruption if no valid modes are found. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=211537 Fixes: 6991b8f2a319 ("drm/radeon/kms: fix segfault in pm rework") Signed-off-by: Kees Cook Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_atombios.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c index f9f4efa1738c..28c4413f4dc8 100644 --- a/drivers/gpu/drm/radeon/radeon_atombios.c +++ b/drivers/gpu/drm/radeon/radeon_atombios.c @@ -2120,11 +2120,14 @@ static int radeon_atombios_parse_power_table_1_3(struct radeon_device *rdev) return state_index; /* last mode is usually default, array is low to high */ for (i = 0; i < num_modes; i++) { - rdev->pm.power_state[state_index].clock_info = - kcalloc(1, sizeof(struct radeon_pm_clock_info), - GFP_KERNEL); + /* avoid memory leaks from invalid modes or unknown frev. */ + if (!rdev->pm.power_state[state_index].clock_info) { + rdev->pm.power_state[state_index].clock_info = + kzalloc(sizeof(struct radeon_pm_clock_info), + GFP_KERNEL); + } if (!rdev->pm.power_state[state_index].clock_info) - return state_index; + goto out; rdev->pm.power_state[state_index].num_clock_modes = 1; rdev->pm.power_state[state_index].clock_info[0].voltage.type = VOLTAGE_NONE; switch (frev) { @@ -2243,8 +2246,15 @@ static int radeon_atombios_parse_power_table_1_3(struct radeon_device *rdev) break; } } +out: + /* free any unused clock_info allocation. */ + if (state_index && state_index < num_modes) { + kfree(rdev->pm.power_state[state_index].clock_info); + rdev->pm.power_state[state_index].clock_info = NULL; + } + /* last mode is usually default */ - if (rdev->pm.default_power_state_index == -1) { + if (state_index && rdev->pm.default_power_state_index == -1) { rdev->pm.power_state[state_index - 1].type = POWER_STATE_TYPE_DEFAULT; rdev->pm.default_power_state_index = state_index - 1; From 8651fcb9873be097bb6fe8542bfb6089020726ae Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 30 Apr 2021 10:16:54 -0700 Subject: [PATCH 1007/1379] drm/amd/pm: initialize variable Static analysis reports this problem amdgpu_pm.c:478:16: warning: The right operand of '<' is a garbage value for (i = 0; i < data.nums; i++) { ^ ~~~~~~~~~ In some cases data is not set. Initialize to 0 and flag not setting data as an error with the existing check. Signed-off-by: Tom Rix Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/amdgpu_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c index a8d6cc2525b2..e2ea42b1b4f0 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c @@ -451,7 +451,7 @@ static ssize_t amdgpu_get_pp_cur_state(struct device *dev, struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; - struct pp_states_info data; + struct pp_states_info data = {0}; enum amd_pm_state_type pm = 0; int i = 0, ret = 0; From 2af4f9b8596afbbd7667a18fa71d117bac227dea Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sat, 30 Jan 2021 23:43:10 -0500 Subject: [PATCH 1008/1379] tools/power turbostat: add built-in-counter for IPC -- Instructions per Cycle Use linux-perf to access the hardware instructions-retired counter. This is necessary because the counter is not enabled by default, and also the counter is prone to roll-over -- both of which perf manages. It is not necessary to use perf for the cycle counter, because turbostat already needs to collect delta-aperf to calcuate frequency. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 84 +++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a7c4f0772e53..b82295eaa744 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -33,10 +33,13 @@ #include #include #include +#include +#include char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; +int *fd_instr_count_percpu; struct timeval interval_tv = {5, 0}; struct timespec interval_ts = {5, 0}; unsigned int num_iterations; @@ -75,6 +78,7 @@ char *output_buffer, *outp; unsigned int do_rapl; unsigned int do_dts; unsigned int do_ptm; +unsigned int do_ipc; unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; @@ -173,6 +177,7 @@ struct thread_data { unsigned long long aperf; unsigned long long mperf; unsigned long long c1; + unsigned long long instr_count; unsigned long long irq_count; unsigned int smi_count; unsigned int cpu_id; @@ -490,6 +495,39 @@ int get_msr_fd(int cpu) return fd; } +static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) +{ + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); +} + +static int perf_instr_count_open(int cpu_num) +{ + struct perf_event_attr pea; + int fd; + + memset(&pea, 0, sizeof(struct perf_event_attr)); + pea.type = PERF_TYPE_HARDWARE; + pea.size = sizeof(struct perf_event_attr); + pea.config = PERF_COUNT_HW_INSTRUCTIONS; + + /* counter for cpu_num, including user + kernel and all processes */ + fd = perf_event_open(&pea, -1, cpu_num, -1, 0); + if (fd == -1) + err(-1, "cpu%d: perf instruction counter\n", cpu_num); + + return fd; +} + +int get_instr_count_fd(int cpu) +{ + if (fd_instr_count_percpu[cpu]) + return fd_instr_count_percpu[cpu]; + + fd_instr_count_percpu[cpu] = perf_instr_count_open(cpu); + + return fd_instr_count_percpu[cpu]; +} + int get_msr(int cpu, off_t offset, unsigned long long *msr) { ssize_t retval; @@ -561,6 +599,7 @@ struct msr_counter bic[] = { { 0x0, "X2APIC" }, { 0x0, "Die" }, { 0x0, "GFXAMHz" }, + { 0x0, "IPC" }, }; #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) @@ -616,6 +655,7 @@ struct msr_counter bic[] = { #define BIC_X2APIC (1ULL << 49) #define BIC_Die (1ULL << 50) #define BIC_GFXACTMHz (1ULL << 51) +#define BIC_IPC (1ULL << 52) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) @@ -627,6 +667,7 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME) #define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT) #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) +#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) #define MAX_DEFERRED 16 @@ -764,6 +805,9 @@ void print_header(char *delim) if (DO_BIC(BIC_TSC_MHz)) outp += sprintf(outp, "%sTSC_MHz", (printed++ ? delim : "")); + if (DO_BIC(BIC_IPC)) + outp += sprintf(outp, "%sIPC", (printed++ ? delim : "")); + if (DO_BIC(BIC_IRQ)) { if (sums_need_wide_columns) outp += sprintf(outp, "%s IRQ", (printed++ ? delim : "")); @@ -926,6 +970,9 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "mperf: %016llX\n", t->mperf); outp += sprintf(outp, "c1: %016llX\n", t->c1); + if (DO_BIC(BIC_IPC)) + outp += sprintf(outp, "IPC: %lld\n", t->instr_count); + if (DO_BIC(BIC_IRQ)) outp += sprintf(outp, "IRQ: %lld\n", t->irq_count); if (DO_BIC(BIC_SMI)) @@ -1105,6 +1152,9 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_TSC_MHz)) outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc/units/interval_float); + if (DO_BIC(BIC_IPC)) + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf); + /* IRQ */ if (DO_BIC(BIC_IRQ)) { if (sums_need_wide_columns) @@ -1482,6 +1532,9 @@ delta_thread(struct thread_data *new, struct thread_data *old, old->mperf = 1; /* divide by 0 protection */ } + if (DO_BIC(BIC_IPC)) + old->instr_count = new->instr_count - old->instr_count; + if (DO_BIC(BIC_IRQ)) old->irq_count = new->irq_count - old->irq_count; @@ -1536,6 +1589,8 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data t->mperf = 0; t->c1 = 0; + t->instr_count = 0; + t->irq_count = 0; t->smi_count = 0; @@ -1611,6 +1666,8 @@ int sum_counters(struct thread_data *t, struct core_data *c, average.threads.mperf += t->mperf; average.threads.c1 += t->c1; + average.threads.instr_count += t->instr_count; + average.threads.irq_count += t->irq_count; average.threads.smi_count += t->smi_count; @@ -1707,6 +1764,7 @@ void compute_average(struct thread_data *t, struct core_data *c, average.threads.tsc /= topo.num_cpus; average.threads.aperf /= topo.num_cpus; average.threads.mperf /= topo.num_cpus; + average.threads.instr_count /= topo.num_cpus; average.threads.c1 /= topo.num_cpus; if (average.threads.irq_count > 9999999) @@ -1989,6 +2047,10 @@ retry: t->mperf = t->mperf * aperf_mperf_multiplier; } + if (DO_BIC(BIC_IPC)) + if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long)) + return -4; + if (DO_BIC(BIC_IRQ)) t->irq_count = irqs_per_cpu[cpu]; if (DO_BIC(BIC_SMI)) { @@ -5031,6 +5093,26 @@ void print_dev_latency(void) close(fd); } + +/* + * Linux-perf manages the the HW instructions-retired counter + * by enabling when requested, and hiding rollover + */ +void linux_perf_init(void) +{ + if (!BIC_IS_ENABLED(BIC_IPC)) + return; + + if (access("/proc/sys/kernel/perf_event_paranoid", F_OK)) + return; + + fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); + if (fd_instr_count_percpu == NULL) + err(-1, "calloc fd_instr_count_percpu"); + + BIC_PRESENT(BIC_IPC); +} + void process_cpuid() { unsigned int eax, ebx, ecx, edx; @@ -5642,6 +5724,7 @@ void turbostat_init() check_dev_msr(); check_permissions(); process_cpuid(); + linux_perf_init(); if (!quiet) @@ -6087,6 +6170,7 @@ void cmdline(int argc, char **argv) {"debug", no_argument, 0, 'd'}, /* internal, not documented */ {"enable", required_argument, 0, 'e'}, {"interval", required_argument, 0, 'i'}, + {"IPC", no_argument, 0, 'I'}, {"num_iterations", required_argument, 0, 'n'}, {"help", no_argument, 0, 'h'}, {"hide", required_argument, 0, 'H'}, // meh, -h taken by --help From ed0757b83a00d1799c249073d688b018b82d0093 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 4 Feb 2021 14:44:12 -0500 Subject: [PATCH 1009/1379] tools/power turbostat: print microcode patch level (also available via "grep microcode /proc/cpuinfo") Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b82295eaa744..e1bc7937b1ec 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5118,6 +5118,7 @@ void process_cpuid() unsigned int eax, ebx, ecx, edx; unsigned int fms, family, model, stepping, ecx_flags, edx_flags; unsigned int has_turbo; + unsigned long long ucode_patch = 0; eax = ebx = ecx = edx = 0; @@ -5131,8 +5132,8 @@ void process_cpuid() hygon_genuine = 1; if (!quiet) - fprintf(outf, "CPUID(0): %.4s%.4s%.4s ", - (char *)&ebx, (char *)&edx, (char *)&ecx); + fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n", + (char *)&ebx, (char *)&edx, (char *)&ecx, max_level); __cpuid(1, fms, ebx, ecx, edx); family = (fms >> 8) & 0xf; @@ -5145,6 +5146,9 @@ void process_cpuid() ecx_flags = ecx; edx_flags = edx; + if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch)) + warnx("get_msr(UCODE)\n"); + /* * check max extended function levels of CPUID. * This is needed to check for invariant TSC. @@ -5154,8 +5158,9 @@ void process_cpuid() __cpuid(0x80000000, max_extended_level, ebx, ecx, edx); if (!quiet) { - fprintf(outf, "0x%x CPUID levels; 0x%x xlevels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n", - max_level, max_extended_level, family, model, stepping, family, model, stepping); + fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n", + family, model, stepping, family, model, stepping, (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx_flags & (1 << 0) ? "SSE3" : "-", ecx_flags & (1 << 3) ? "MONITOR" : "-", From 5683460b85a8a14c5eec10e363635ad4660eb961 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 3 Feb 2021 16:19:59 +0800 Subject: [PATCH 1010/1379] tools/power turbostat: Support Alder Lake Mobile Share the code between Alder Lake Mobile and Alder Lake Desktop. Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e1bc7937b1ec..a4745825047f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5056,6 +5056,7 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ROCKETLAKE: case INTEL_FAM6_LAKEFIELD: case INTEL_FAM6_ALDERLAKE: + case INTEL_FAM6_ALDERLAKE_L: return INTEL_FAM6_CANNONLAKE_L; case INTEL_FAM6_ATOM_TREMONT_L: From 6c5c656006cf314196faea7bd76eebbfa0941cd1 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 3 Feb 2021 16:26:32 +0800 Subject: [PATCH 1011/1379] tools/power turbostat: Support Ice Lake D Ice Lake D is low-end server version of Ice Lake X, reuse the code accordingly. Tested-by: Wendy Wang Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a4745825047f..d27e899328f9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5063,6 +5063,7 @@ unsigned int intel_model_duplicates(unsigned int model) return INTEL_FAM6_ATOM_TREMONT; case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_ICELAKE_D: case INTEL_FAM6_SAPPHIRERAPIDS_X: return INTEL_FAM6_SKYLAKE_X; } From b2b94be787bf47eedd5890a249f3318bf9f1f1d5 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 11 Mar 2021 18:36:35 -0500 Subject: [PATCH 1012/1379] Revert "tools/power turbostat: adjust for temperature offset" This reverts commit 6ff7cb371c4bea3dba03a56d774da925e78a5087. Apparently the TCC offset should not be used to adjust what temperature we show the user after all. (on most systems, TCC offset is 0, FWIW) Fixes: 6ff7cb371c4b Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 64 ++++++++++++++------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d27e899328f9..98a0a731da8a 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4879,33 +4879,12 @@ double discover_bclk(unsigned int family, unsigned int model) * below this value, including the Digital Thermal Sensor (DTS), * Package Thermal Management Sensor (PTM), and thermal event thresholds. */ -int read_tcc_activation_temp() -{ - unsigned long long msr; - unsigned int tcc, target_c, offset_c; - - /* Temperature Target MSR is Nehalem and newer only */ - if (!do_nhm_platform_info) - return 0; - - if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) - return 0; - - target_c = (msr >> 16) & 0xFF; - - offset_c = (msr >> 24) & 0xF; - - tcc = target_c - offset_c; - - if (!quiet) - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - base_cpu, msr, tcc, target_c, offset_c); - - return tcc; -} - int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) { + unsigned long long msr; + unsigned int target_c_local; + int cpu; + /* tcc_activation_temp is used only for dts or ptm */ if (!(do_dts || do_ptm)) return 0; @@ -4914,18 +4893,43 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE) || !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; + cpu = t->cpu_id; + if (cpu_migrate(cpu)) { + fprintf(outf, "Could not migrate to CPU %d\n", cpu); + return -1; + } + if (tcc_activation_temp_override != 0) { tcc_activation_temp = tcc_activation_temp_override; - fprintf(outf, "Using cmdline TCC Target (%d C)\n", tcc_activation_temp); + fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", + cpu, tcc_activation_temp); return 0; } - tcc_activation_temp = read_tcc_activation_temp(); - if (tcc_activation_temp) - return 0; + /* Temperature Target MSR is Nehalem and newer only */ + if (!do_nhm_platform_info) + goto guess; + if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) + goto guess; + + target_c_local = (msr >> 16) & 0xFF; + + if (!quiet) + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", + cpu, msr, target_c_local); + + if (!target_c_local) + goto guess; + + tcc_activation_temp = target_c_local; + + return 0; + +guess: tcc_activation_temp = TJMAX_DEFAULT; - fprintf(outf, "Guessing tjMax %d C, Please use -T to specify\n", tcc_activation_temp); + fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", + cpu, tcc_activation_temp); return 0; } From abdc75ab53b7fd2ef42c79e88cf0caf2d007c4f2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 11 Mar 2021 10:05:13 +0800 Subject: [PATCH 1013/1379] tools/power turbostat: Fix DRAM Energy Unit on SKX SKX uses fixed DRAM Energy Unit, just like HSX and BDX. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 98a0a731da8a..5afe85efca5c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4272,6 +4272,7 @@ rapl_dram_energy_units_probe(int model, double rapl_energy_units) switch (model) { case INTEL_FAM6_HASWELL_X: /* HSX */ case INTEL_FAM6_BROADWELL_X: /* BDX */ + case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ return (rapl_dram_energy_units = 15.3 / 1000000); default: From ba58ecde5eec898f647bba7cb07e6ec6ea1b875c Mon Sep 17 00:00:00 2001 From: Len Brown Date: Fri, 12 Mar 2021 17:30:30 -0500 Subject: [PATCH 1014/1379] tools/power turbostat: update version number --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 5afe85efca5c..ace100dd5a83 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5834,7 +5834,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(outf, "turbostat version 20.09.30" + fprintf(outf, "turbostat version 21.03.12" " - Len Brown \n"); } From 301b1d3a9104f4f3a8ab4171cf88d0f55d632b41 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Wed, 28 Apr 2021 17:09:03 +0800 Subject: [PATCH 1015/1379] tools/power/turbostat: Fix turbostat for AMD Zen CPUs It was reported that on Zen+ system turbostat started exiting, which was tracked down to the MSR_PKG_ENERGY_STAT read failing because offset_to_idx wasn't returning a non-negative index. This patch combined the modification from Bingsong Si and Bas Nieuwenhuizen and addd the MSR to the index system as alternative for MSR_PKG_ENERGY_STATUS. Fixes: 9972d5d84d76 ("tools/power turbostat: Enable accumulate RAPL display") Reported-by: youling257 Tested-by: youling257 Tested-by: Kurt Garloff Tested-by: Bingsong Si Tested-by: Artem S. Tashkinov Co-developed-by: Bingsong Si Co-developed-by: Terry Bowman Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ace100dd5a83..b5f4ec24fea9 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -302,7 +302,10 @@ int idx_to_offset(int idx) switch (idx) { case IDX_PKG_ENERGY: - offset = MSR_PKG_ENERGY_STATUS; + if (do_rapl & RAPL_AMD_F17H) + offset = MSR_PKG_ENERGY_STAT; + else + offset = MSR_PKG_ENERGY_STATUS; break; case IDX_DRAM_ENERGY: offset = MSR_DRAM_ENERGY_STATUS; @@ -331,6 +334,7 @@ int offset_to_idx(int offset) switch (offset) { case MSR_PKG_ENERGY_STATUS: + case MSR_PKG_ENERGY_STAT: idx = IDX_PKG_ENERGY; break; case MSR_DRAM_ENERGY_STATUS: @@ -358,7 +362,7 @@ int idx_valid(int idx) { switch (idx) { case IDX_PKG_ENERGY: - return do_rapl & RAPL_PKG; + return do_rapl & (RAPL_PKG | RAPL_AMD_F17H); case IDX_DRAM_ENERGY: return do_rapl & RAPL_DRAM; case IDX_PP0_ENERGY: From 13a779de4175df602366d129e41782ad7168cef0 Mon Sep 17 00:00:00 2001 From: Calvin Walton Date: Wed, 28 Apr 2021 17:09:16 +0800 Subject: [PATCH 1016/1379] tools/power turbostat: Fix offset overflow issue in index converting The idx_to_offset() function returns type int (32-bit signed), but MSR_PKG_ENERGY_STAT is u32 and would be interpreted as a negative number. The end result is that it hits the if (offset < 0) check in update_msr_sum() which prevents the timer callback from updating the stat in the background when long durations are used. The similar issue exists in offset_to_idx() and update_msr_sum(). Fix this issue by converting the 'int' to 'off_t' accordingly. Fixes: 9972d5d84d76 ("tools/power turbostat: Enable accumulate RAPL display") Signed-off-by: Calvin Walton Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b5f4ec24fea9..f3cb06a115b5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -296,9 +296,9 @@ struct msr_sum_array { /* The percpu MSR sum array.*/ struct msr_sum_array *per_cpu_msr_sum; -int idx_to_offset(int idx) +off_t idx_to_offset(int idx) { - int offset; + off_t offset; switch (idx) { case IDX_PKG_ENERGY: @@ -328,7 +328,7 @@ int idx_to_offset(int idx) return offset; } -int offset_to_idx(int offset) +int offset_to_idx(off_t offset) { int idx; @@ -3338,7 +3338,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) { unsigned long long msr_cur, msr_last; - int offset; + off_t offset; if (!idx_valid(i)) continue; @@ -3347,7 +3347,8 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg continue; ret = get_msr(cpu, offset, &msr_cur); if (ret) { - fprintf(outf, "Can not update msr(0x%x)\n", offset); + fprintf(outf, "Can not update msr(0x%llx)\n", + (unsigned long long)offset); continue; } From 25368d7cefcd87a94ccabcc6f9f31796607bbe4e Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 4 May 2021 17:52:34 +0300 Subject: [PATCH 1017/1379] tools/power/turbostat: Remove Package C6 Retention on Ice Lake Server Currently the turbostat treats ICX the same way as SKX and shares the code among them. But one difference is that ICX does not support Package C6 Retention, unlike SKX and CLX. So this patch: 1. Splitting SKX and ICX in turbostat. 2. Removing Package C6 Rentention for ICX. And after this split, it would be easier to cutomize Ice Lake Server in turbostat in the future. Suggested-by: Artem Bityutskiy Signed-off-by: Chen Yu Reviewed-by: Artem Bityutskiy Tested-by: Artem Bityutskiy Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 36 ++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f3cb06a115b5..b43816e4c1ff 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2263,7 +2263,7 @@ int amt_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, int phi_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; int glm_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; int skx_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; - +int icx_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; static void calculate_tsc_tweak() @@ -2378,6 +2378,7 @@ int has_turbo_ratio_group_limits(int family, int model) switch (model) { case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ATOM_GOLDMONT_D: case INTEL_FAM6_ATOM_TREMONT_D: return 1; @@ -3618,6 +3619,10 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) pkg_cstate_limits = skx_pkg_cstate_limits; has_misc_feature_control = 1; break; + case INTEL_FAM6_ICELAKE_X: /* ICX */ + pkg_cstate_limits = icx_pkg_cstate_limits; + has_misc_feature_control = 1; + break; case INTEL_FAM6_ATOM_SILVERMONT: /* BYT */ no_MSR_MISC_PWR_MGMT = 1; case INTEL_FAM6_ATOM_SILVERMONT_D: /* AVN */ @@ -3706,6 +3711,20 @@ int is_skx(unsigned int family, unsigned int model) } return 0; } + +int is_icx(unsigned int family, unsigned int model) +{ + + if (!genuine_intel) + return 0; + + switch (model) { + case INTEL_FAM6_ICELAKE_X: + return 1; + } + return 0; +} + int is_ehl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3808,6 +3827,7 @@ int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ICELAKE_X: return 1; default: return 0; @@ -3833,6 +3853,7 @@ int has_config_tdp(unsigned int family, unsigned int model) case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_XEON_PHI_KNL: /* Knights Landing */ return 1; @@ -4363,6 +4384,7 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_HASWELL_X: /* HSX */ case INTEL_FAM6_BROADWELL_X: /* BDX */ case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; BIC_PRESENT(BIC_PKG__); @@ -4519,7 +4541,8 @@ void perf_limit_reasons_probe(unsigned int family, unsigned int model) void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) { - if (is_skx(family, model) || is_bdx(family, model)) + if (is_skx(family, model) || is_bdx(family, model) || + is_icx(family, model)) has_automatic_cstate_conversion = 1; } @@ -4734,6 +4757,7 @@ int has_snb_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ @@ -5072,10 +5096,9 @@ unsigned int intel_model_duplicates(unsigned int model) case INTEL_FAM6_ATOM_TREMONT_L: return INTEL_FAM6_ATOM_TREMONT; - case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ICELAKE_D: case INTEL_FAM6_SAPPHIRERAPIDS_X: - return INTEL_FAM6_SKYLAKE_X; + return INTEL_FAM6_ICELAKE_X; } return model; } @@ -5185,8 +5208,9 @@ void process_cpuid() edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } - if (genuine_intel) + if (genuine_intel) { model = intel_model_duplicates(model); + } if (!(edx_flags & (1 << 5))) errx(1, "CPUID: no MSR"); @@ -5364,7 +5388,7 @@ void process_cpuid() BIC_NOT_PRESENT(BIC_Pkgpc7); use_c1_residency_msr = 1; } - if (is_skx(family, model)) { + if (is_skx(family, model) || is_icx(family, model)) { BIC_NOT_PRESENT(BIC_CPU_c3); BIC_NOT_PRESENT(BIC_Pkgpc3); BIC_NOT_PRESENT(BIC_CPU_c7); From 1e3ec5cdfb63bc2a1ff06145faa2be08d6ec9594 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 25 Mar 2021 13:13:33 -0700 Subject: [PATCH 1018/1379] tools/power turbostat: unmark non-kernel-doc comment Do not mark a comment as kernel-doc notation when it is not meant to be in kernel-doc notation. Signed-off-by: Randy Dunlap Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b43816e4c1ff..407e80e72546 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2516,7 +2516,7 @@ dump_knl_turbo_ratio_limits(void) fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); - /** + /* * Turbo encoding in KNL is as follows: * [0] -- Reserved * [7:1] -- Base value of number of active cores of bucket 1. From 8c69da293041352d15a2b6e8010c141822a416c5 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 28 Apr 2021 10:51:57 +0800 Subject: [PATCH 1019/1379] tools/power turbostat: Enable tsc_tweak for Elkhart Lake and Jasper Lake It was found that on Elkhart Lake the TSC frequency is driven by a separate crystal-clock domain, which is different from the BCLK domain which includes mperf. This has result in small different speed thus inconsistence between TSC and the mperf, which caused the Busy% to be higher than 100%. On this platform it seems that the mperf runs faster than tsc when the CPU is 100% utilized: delta tsc(18815473183) < delta mperf(18958403680) for 10 seconds. To align TSC with mperf, leverage the tsc_tweak mechanism introduced for cores newer than Skylake, so that TSC and mperf would be calculated in the same domain. Reported-by: Zhang Rui Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 407e80e72546..9ec13f06c0f3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5440,7 +5440,7 @@ void process_cpuid() if (!quiet) dump_sysfs_pstate_config(); - if (has_skl_msrs(family, model)) + if (has_skl_msrs(family, model) || is_ehl(family, model)) calculate_tsc_tweak(); if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK)) From aeb01e6d71ffaf3011ac755c3083cc200ed57cb4 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 28 Apr 2021 12:18:12 +0800 Subject: [PATCH 1020/1379] tools/power turbostat: Print the C-state Pre-wake settings C-state pre-wake setting[1] is an optimization for some Intel CPUs to be woken up from deep C-states in order to reduce latency. According to the spec, the BIT30 is the C-state Pre-wake Disable. Expose this setting accordingly. Sample output from turbostat: ... cpu51: MSR_IA32_POWER_CTL: 0x1a00a40059 (C1E auto-promotion: DISabled) C-state Pre-wake: ENabled cpu51: MSR_TURBO_RATIO_LIMIT: 0x2021212121212224 ... [1] https://intel.github.io/wult/#c-state-pre-wake Signed-off-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 9ec13f06c0f3..e1ed14c666db 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -91,6 +91,7 @@ double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; unsigned int do_core_perf_limit_reasons; unsigned int has_automatic_cstate_conversion; +unsigned int dis_cstate_prewake; unsigned int do_gfx_perf_limit_reasons; unsigned int do_ring_perf_limit_reasons; unsigned int crystal_hz; @@ -2271,6 +2272,8 @@ calculate_tsc_tweak() tsc_tweak = base_hz / tsc_hz; } +void prewake_cstate_probe(unsigned int family, unsigned int model); + static void dump_nhm_platform_info(void) { @@ -2293,6 +2296,11 @@ dump_nhm_platform_info(void) fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", base_cpu, msr, msr & 0x2 ? "EN" : "DIS"); + /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */ + if (dis_cstate_prewake) + fprintf(outf, "C-state Pre-wake: %sabled\n", + msr & 0x40000000 ? "DIS" : "EN"); + return; } @@ -4546,6 +4554,12 @@ void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) has_automatic_cstate_conversion = 1; } +void prewake_cstate_probe(unsigned int family, unsigned int model) +{ + if (is_icx(family, model)) + dis_cstate_prewake = 1; +} + int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; From 7ab5ff4937a338783d147ec2d8c8714f48a5de79 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 21 Apr 2021 22:22:47 +0800 Subject: [PATCH 1021/1379] tools/power turbostat: Fix Core C6 residency on Atom CPUs For Atom CPUs that have core cstate deeper than C6, MSR_CORE_C6_RESIDENCY actually returns the residency for both CC6 and deeper Core cstates. Thus, the real Core C6 residency should be the subtraction of MSR_CORE_C6_RESIDENCY return value and MSR_CORE_C6_RESIDENCY return value. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 39 ++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index e1ed14c666db..ee18966f65a4 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -35,6 +35,7 @@ #include #include #include +#include char *proc_stat = "/proc/stat"; FILE *outf; @@ -185,6 +186,7 @@ struct thread_data { unsigned int apic_id; unsigned int x2apic_id; unsigned int flags; + bool is_atom; #define CPU_IS_FIRST_THREAD_IN_CORE 0x2 #define CPU_IS_FIRST_CORE_IN_PACKAGE 0x4 unsigned long long counter[MAX_ADDED_THREAD_COUNTERS]; @@ -2090,9 +2092,19 @@ retry: return -7; } - if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) + if (DO_BIC(BIC_CPU_c7) || soft_c1_residency_display(BIC_CPU_c7)) { if (get_msr(cpu, MSR_CORE_C7_RESIDENCY, &c->c7)) return -8; + else if (t->is_atom) { + /* + * For Atom CPUs that has core cstate deeper than c6, + * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper. + * Minus CC7 (and deeper cstates) residency to get + * accturate cc6 residency. + */ + c->c6 -= c->c7; + } + } if (DO_BIC(BIC_Mod_c6)) if (get_msr(cpu, MSR_MODULE_C6_RES_MS, &c->mc6_us)) @@ -4911,6 +4923,28 @@ double discover_bclk(unsigned int family, unsigned int model) return 133.33; } +int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) +{ + unsigned int eax, ebx, ecx, edx; + + if (!genuine_intel) + return 0; + + if (cpu_migrate(t->cpu_id)) { + fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id); + return -1; + } + + if (max_level < 0x1a) + return 0; + + __cpuid(0x1a, eax, ebx, ecx, edx); + eax = (eax >> 24) & 0xFF; + if (eax == 0x20 ) + t->is_atom = true; + return 0; +} + /* * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where * the Thermal Control Circuit (TCC) activates. @@ -5796,6 +5830,9 @@ void turbostat_init() for_all_cpus(set_temperature_target, ODD_COUNTERS); + for_all_cpus(get_cpu_type, ODD_COUNTERS); + for_all_cpus(get_cpu_type, EVEN_COUNTERS); + if (!quiet) for_all_cpus(print_thermal, ODD_COUNTERS); From e9d3092f6d7c21031c8ac10ba2016ae0482a39fe Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 26 Apr 2021 10:05:27 +0800 Subject: [PATCH 1022/1379] tools/power turbostat: save original CPU model CPU model may get changed in intel_model_duplicates() for code reuse. But there are still some cases we need the original CPU model to handle minor differences between generations. Thus save the original CPU model. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index ee18966f65a4..d1ae6b248377 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -43,6 +43,10 @@ int *fd_percpu; int *fd_instr_count_percpu; struct timeval interval_tv = {5, 0}; struct timespec interval_ts = {5, 0}; + +/* Save original CPU model */ +unsigned int model_orig; + unsigned int num_iterations; unsigned int debug; unsigned int quiet; @@ -5257,6 +5261,7 @@ void process_cpuid() edx_flags & (1 << 29) ? "TM" : "-"); } if (genuine_intel) { + model_orig = model; model = intel_model_duplicates(model); } From 0b9a0b9be991656f125b58a240065cdf72077244 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 21 Apr 2021 23:22:14 +0800 Subject: [PATCH 1023/1379] tools/power turbostat: add TCC Offset support The length of TCC Offset bits varies on different platforms. Decode TCC Offset bits only for the platforms that we have verified. For the others, only show default TCC activation temperature. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 58 +++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d1ae6b248377..6326bee97c0b 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -91,6 +91,7 @@ unsigned int gfx_cur_mhz; unsigned int gfx_act_mhz; unsigned int tcc_activation_temp; unsigned int tcc_activation_temp_override; +int tcc_offset_bits; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; double rapl_joule_counter_range; @@ -3886,6 +3887,40 @@ int has_config_tdp(unsigned int family, unsigned int model) } } +/* + * tcc_offset_bits: + * 0: Tcc Offset not supported (Default) + * 6: Bit 29:24 of MSR_PLATFORM_INFO + * 4: Bit 27:24 of MSR_PLATFORM_INFO + */ +void check_tcc_offset(int model) +{ + unsigned long long msr; + + if (!genuine_intel) + return; + + switch (model) { + case INTEL_FAM6_SKYLAKE_L: + case INTEL_FAM6_SKYLAKE: + case INTEL_FAM6_KABYLAKE_L: + case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_ICELAKE_L: + case INTEL_FAM6_ICELAKE: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_COMETLAKE: + if (!get_msr(base_cpu, MSR_PLATFORM_INFO, &msr)) { + msr = (msr >> 30) & 1; + if (msr) + tcc_offset_bits = 6; + } + return; + default: + return; + } +} + static void remove_underbar(char *s) { @@ -4964,7 +4999,7 @@ int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; - unsigned int target_c_local; + unsigned int target_c_local, tcc_offset; int cpu; /* tcc_activation_temp is used only for dts or ptm */ @@ -4997,9 +5032,24 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk target_c_local = (msr >> 16) & 0xFF; - if (!quiet) - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", + if (!quiet) { + switch (tcc_offset_bits) { + case 4: + tcc_offset = (msr >> 24) & 0xF; + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", + cpu, msr, target_c_local - tcc_offset, target_c_local, tcc_offset); + break; + case 6: + tcc_offset = (msr >> 24) & 0x3F; + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", + cpu, msr, target_c_local - tcc_offset, target_c_local, tcc_offset); + break; + default: + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, target_c_local); + break; + } + } if (!target_c_local) goto guess; @@ -5483,6 +5533,8 @@ void process_cpuid() perf_limit_reasons_probe(family, model); automatic_cstate_conversion_probe(family, model); + check_tcc_offset(model_orig); + if (!quiet) dump_cstate_pstate_config_info(family, model); From 55279aef754c5eab170077ae4ba4ebd304dea64f Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 26 Apr 2021 18:49:26 +0800 Subject: [PATCH 1024/1379] tools/power turbostat: rename tcc variables There are two TCC activation temeprature. One is the default TCC activation temperature, also known as TJ_MAX. Another one is the effective TCC activation temperature, which is the subtraction of default TCC activation temperature and TCC offset. The name of variable tcc_activation_temp might be misleading here. Thus rename tcc_activation_temp to tj_max, and use tcc_default and tcc_offset to calculate the effective TCC activation temperature. No functional change in this patch. Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 44 +++++++++++++-------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 6326bee97c0b..8f0a1d8a0366 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -89,8 +89,8 @@ unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; unsigned int gfx_cur_mhz; unsigned int gfx_act_mhz; -unsigned int tcc_activation_temp; -unsigned int tcc_activation_temp_override; +unsigned int tj_max; +unsigned int tj_max_override; int tcc_offset_bits; double rapl_power_units, rapl_time_units; double rapl_dram_energy_units, rapl_energy_units; @@ -2118,7 +2118,7 @@ retry: if (DO_BIC(BIC_CoreTmp)) { if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr)) return -9; - c->core_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); + c->core_temp_c = tj_max - ((msr >> 16) & 0x7F); } if (do_rapl & RAPL_AMD_F17H) { @@ -2224,7 +2224,7 @@ retry: if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return -17; - p->pkg_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); + p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F); } if (DO_BIC(BIC_GFX_rc6)) @@ -4637,7 +4637,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", - cpu, msr, tcc_activation_temp - dts); + cpu, msr, tj_max - dts); if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) return 0; @@ -4645,7 +4645,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; dts2 = (msr >> 8) & 0x7F; fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); + cpu, msr, tj_max - dts, tj_max - dts2); } @@ -4658,7 +4658,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; resolution = (msr >> 27) & 0xF; fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n", - cpu, msr, tcc_activation_temp - dts, resolution); + cpu, msr, tj_max - dts, resolution); if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr)) return 0; @@ -4666,7 +4666,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p dts = (msr >> 16) & 0x7F; dts2 = (msr >> 8) & 0x7F; fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n", - cpu, msr, tcc_activation_temp - dts, tcc_activation_temp - dts2); + cpu, msr, tj_max - dts, tj_max - dts2); } return 0; @@ -4999,10 +4999,10 @@ int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; - unsigned int target_c_local, tcc_offset; + unsigned int tcc_default, tcc_offset; int cpu; - /* tcc_activation_temp is used only for dts or ptm */ + /* tj_max is used only for dts or ptm */ if (!(do_dts || do_ptm)) return 0; @@ -5016,10 +5016,10 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk return -1; } - if (tcc_activation_temp_override != 0) { - tcc_activation_temp = tcc_activation_temp_override; + if (tj_max_override != 0) { + tj_max = tj_max_override; fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", - cpu, tcc_activation_temp); + cpu, tj_max); return 0; } @@ -5030,38 +5030,38 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr)) goto guess; - target_c_local = (msr >> 16) & 0xFF; + tcc_default = (msr >> 16) & 0xFF; if (!quiet) { switch (tcc_offset_bits) { case 4: tcc_offset = (msr >> 24) & 0xF; fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, target_c_local - tcc_offset, target_c_local, tcc_offset); + cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); break; case 6: tcc_offset = (msr >> 24) & 0x3F; fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, target_c_local - tcc_offset, target_c_local, tcc_offset); + cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); break; default: fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", - cpu, msr, target_c_local); + cpu, msr, tcc_default); break; } } - if (!target_c_local) + if (!tcc_default) goto guess; - tcc_activation_temp = target_c_local; + tj_max = tcc_default; return 0; guess: - tcc_activation_temp = TJMAX_DEFAULT; + tj_max = TJMAX_DEFAULT; fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", - cpu, tcc_activation_temp); + cpu, tj_max); return 0; } @@ -6421,7 +6421,7 @@ void cmdline(int argc, char **argv) summary_only++; break; case 'T': - tcc_activation_temp_override = atoi(optarg); + tj_max_override = atoi(optarg); break; case 'v': print_version(); From 1b439f01b67c77a374adbbd97ad0c745b7abb09b Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 4 May 2021 19:21:34 -0400 Subject: [PATCH 1025/1379] tools/power turbostat: formatting Spring is here... run a long overdue Lendent on turbostat.c no functional change Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 941 ++++++++++++-------------- 1 file changed, 432 insertions(+), 509 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 8f0a1d8a0366..13805e460a4d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -41,8 +41,8 @@ char *proc_stat = "/proc/stat"; FILE *outf; int *fd_percpu; int *fd_instr_count_percpu; -struct timeval interval_tv = {5, 0}; -struct timespec interval_ts = {5, 0}; +struct timeval interval_tv = { 5, 0 }; +struct timespec interval_ts = { 5, 0 }; /* Save original CPU model */ unsigned int model_orig; @@ -84,7 +84,7 @@ unsigned int do_rapl; unsigned int do_dts; unsigned int do_ptm; unsigned int do_ipc; -unsigned long long gfx_cur_rc6_ms; +unsigned long long gfx_cur_rc6_ms; unsigned long long cpuidle_cur_cpu_lpi_us; unsigned long long cpuidle_cur_sys_lpi_us; unsigned int gfx_cur_mhz; @@ -104,12 +104,12 @@ unsigned int crystal_hz; unsigned long long tsc_hz; int base_cpu; double discover_bclk(unsigned int family, unsigned int model); -unsigned int has_hwp; /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */ +unsigned int has_hwp; /* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */ /* IA32_HWP_REQUEST, IA32_HWP_STATUS */ -unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ +unsigned int has_hwp_notify; /* IA32_HWP_INTERRUPT */ unsigned int has_hwp_activity_window; /* IA32_HWP_REQUEST[bits 41:32] */ -unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ -unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ +unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ +unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int has_misc_feature_control; unsigned int first_counter_read = 1; int ignore_stdin; @@ -185,7 +185,7 @@ struct thread_data { unsigned long long mperf; unsigned long long c1; unsigned long long instr_count; - unsigned long long irq_count; + unsigned long long irq_count; unsigned int smi_count; unsigned int cpu_id; unsigned int apic_id; @@ -253,12 +253,11 @@ struct pkg_data { ((node_no) * topo.cores_per_node) + \ (core_no)) - #define GET_PKG(pkg_base, pkg_no) (pkg_base + pkg_no) -enum counter_scope {SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE}; -enum counter_type {COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC}; -enum counter_format {FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT}; +enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE }; +enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC }; +enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT }; struct msr_counter { unsigned int msr_num; @@ -294,9 +293,9 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr); struct msr_sum_array { /* get_msr_sum() = sum + (get_msr() - last) */ struct { - /*The accumulated MSR value is updated by the timer*/ + /*The accumulated MSR value is updated by the timer */ unsigned long long sum; - /*The MSR footprint recorded in last timer*/ + /*The MSR footprint recorded in last timer */ unsigned long long last; } entries[IDX_COUNT]; }; @@ -385,6 +384,7 @@ int idx_valid(int idx) return 0; } } + struct sys_counters { unsigned int added_thread_counters; unsigned int added_core_counters; @@ -408,7 +408,7 @@ struct cpu_topology { int logical_node_id; /* 0-based count within the package */ int physical_core_id; int thread_id; - cpu_set_t *put_ids; /* Processing Unit/Thread IDs */ + cpu_set_t *put_ids; /* Processing Unit/Thread IDs */ } *cpus; struct topo_params { @@ -425,7 +425,7 @@ struct topo_params { struct timeval tv_even, tv_odd, tv_delta; -int *irq_column_2_cpu; /* /proc/interrupts column numbers */ +int *irq_column_2_cpu; /* /proc/interrupts column numbers */ int *irqs_per_cpu; /* indexed by cpu_num */ void setup_all_buffers(void); @@ -438,34 +438,31 @@ int cpu_is_not_present(int cpu) { return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set); } + /* * run func(thread, core, package) in topology order * skip non-present cpus */ -int for_all_cpus(int (func)(struct thread_data *, struct core_data *, struct pkg_data *), - struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base) +int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *), + struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base) { int retval, pkg_no, core_no, thread_no, node_no; for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) { for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) { for (core_no = 0; core_no < topo.cores_per_node; ++core_no) { - for (thread_no = 0; thread_no < - topo.threads_per_core; ++thread_no) { + for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) { struct thread_data *t; struct core_data *c; struct pkg_data *p; - t = GET_THREAD(thread_base, thread_no, - core_no, node_no, - pkg_no); + t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); if (cpu_is_not_present(t->cpu_id)) continue; - c = GET_CORE(core_base, core_no, - node_no, pkg_no); + c = GET_CORE(core_base, core_no, node_no, pkg_no); p = GET_PKG(pkg_base, pkg_no); retval = func(t, c, p); @@ -487,6 +484,7 @@ int cpu_migrate(int cpu) else return 0; } + int get_msr_fd(int cpu) { char pathname[32]; @@ -524,7 +522,7 @@ static int perf_instr_count_open(int cpu_num) /* counter for cpu_num, including user + kernel and all processes */ fd = perf_event_open(&pea, -1, cpu_num, -1, 0); - if (fd == -1) + if (fd == -1) err(-1, "cpu%d: perf instruction counter\n", cpu_num); return fd; @@ -568,7 +566,7 @@ struct msr_counter bic[] = { { 0x0, "Bzy_MHz" }, { 0x0, "TSC_MHz" }, { 0x0, "IRQ" }, - { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL}, + { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL }, { 0x0, "sysfs" }, { 0x0, "CPU%c1" }, { 0x0, "CPU%c3" }, @@ -681,7 +679,6 @@ unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC #define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT) #define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT) - #define MAX_DEFERRED 16 char *deferred_skip_names[MAX_DEFERRED]; int deferred_skip_index; @@ -695,42 +692,40 @@ enum show_hide_mode { SHOW_LIST, HIDE_LIST } global_show_hide_mode = HIDE_LIST; void help(void) { fprintf(outf, - "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n" - "\n" - "Turbostat forks the specified COMMAND and prints statistics\n" - "when COMMAND completes.\n" - "If no COMMAND is specified, turbostat wakes every 5-seconds\n" - "to print statistics, until interrupted.\n" - " -a, --add add a counter\n" - " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" - " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" - " {core | package | j,k,l..m,n-p }\n" - " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" - " -D, --Dump displays the raw counter values\n" - " -e, --enable [all | column]\n" - " shows all or the specified disabled column\n" - " -H, --hide [column|column,column,...]\n" - " hide the specified column(s)\n" - " -i, --interval sec.subsec\n" - " Override default 5-second measurement interval\n" - " -J, --Joules displays energy in Joules instead of Watts\n" - " -l, --list list column headers only\n" - " -n, --num_iterations num\n" - " number of the measurement iterations\n" - " -o, --out file\n" - " create or truncate \"file\" for all output\n" - " -q, --quiet skip decoding system configuration header\n" - " -s, --show [column|column,column,...]\n" - " show only the specified column(s)\n" - " -S, --Summary\n" - " limits output to 1-line system summary per interval\n" - " -T, --TCC temperature\n" - " sets the Thermal Control Circuit temperature in\n" - " degrees Celsius\n" - " -h, --help print this help message\n" - " -v, --version print version information\n" - "\n" - "For more help, run \"man turbostat\"\n"); + "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n" + "\n" + "Turbostat forks the specified COMMAND and prints statistics\n" + "when COMMAND completes.\n" + "If no COMMAND is specified, turbostat wakes every 5-seconds\n" + "to print statistics, until interrupted.\n" + " -a, --add add a counter\n" + " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" + " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" + " {core | package | j,k,l..m,n-p }\n" + " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" + " -D, --Dump displays the raw counter values\n" + " -e, --enable [all | column]\n" + " shows all or the specified disabled column\n" + " -H, --hide [column|column,column,...]\n" + " hide the specified column(s)\n" + " -i, --interval sec.subsec\n" + " Override default 5-second measurement interval\n" + " -J, --Joules displays energy in Joules instead of Watts\n" + " -l, --list list column headers only\n" + " -n, --num_iterations num\n" + " number of the measurement iterations\n" + " -o, --out file\n" + " create or truncate \"file\" for all output\n" + " -q, --quiet skip decoding system configuration header\n" + " -s, --show [column|column,column,...]\n" + " show only the specified column(s)\n" + " -S, --Summary\n" + " limits output to 1-line system summary per interval\n" + " -T, --TCC temperature\n" + " sets the Thermal Control Circuit temperature in\n" + " degrees Celsius\n" + " -h, --help print this help message\n" + " -v, --version print version information\n" "\n" "For more help, run \"man turbostat\"\n"); } /* @@ -784,7 +779,6 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) return retval; } - void print_header(char *delim) { struct msr_counter *mp; @@ -966,8 +960,7 @@ void print_header(char *delim) outp += sprintf(outp, "\n"); } -int dump_counters(struct thread_data *t, struct core_data *c, - struct pkg_data *p) +int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; @@ -975,8 +968,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p); if (t) { - outp += sprintf(outp, "CPU: %d flags 0x%x\n", - t->cpu_id, t->flags); + outp += sprintf(outp, "CPU: %d flags 0x%x\n", t->cpu_id, t->flags); outp += sprintf(outp, "TSC: %016llX\n", t->tsc); outp += sprintf(outp, "aperf: %016llX\n", t->aperf); outp += sprintf(outp, "mperf: %016llX\n", t->mperf); @@ -991,8 +983,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "SMI: %d\n", t->smi_count); for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", - i, mp->msr_num, t->counter[i]); + outp += sprintf(outp, "tADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, t->counter[i]); } } @@ -1005,8 +996,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "Joules: %0X\n", c->core_energy); for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", - i, mp->msr_num, c->counter[i]); + outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, c->counter[i]); } outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us); } @@ -1035,15 +1025,12 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores); outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx); outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram); - outp += sprintf(outp, "Throttle PKG: %0llX\n", - p->rapl_pkg_perf_status); - outp += sprintf(outp, "Throttle RAM: %0llX\n", - p->rapl_dram_perf_status); + outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status); + outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status); outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { - outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", - i, mp->msr_num, p->counter[i]); + outp += sprintf(outp, "pADDED [%d] msr0x%x: %08llX\n", i, mp->msr_num, p->counter[i]); } } @@ -1055,8 +1042,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, /* * column formatting convention & formats */ -int format_counters(struct thread_data *t, struct core_data *c, - struct pkg_data *p) +int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { double interval_float, tsc; char *fmt8; @@ -1065,17 +1051,16 @@ int format_counters(struct thread_data *t, struct core_data *c, char *delim = "\t"; int printed = 0; - /* if showing only 1st thread in core and this isn't one, bail out */ + /* if showing only 1st thread in core and this isn't one, bail out */ if (show_core_only && !(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) return 0; - /* if showing only 1st thread in pkg and this isn't one, bail out */ + /* if showing only 1st thread in pkg and this isn't one, bail out */ if (show_pkg_only && !(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) return 0; /*if not summary line and --cpu is used */ - if ((t != &average.threads) && - (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset))) + if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset))) return 0; if (DO_BIC(BIC_USEC)) { @@ -1090,7 +1075,7 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_TOD)) outp += sprintf(outp, "%10ld.%06ld\t", t->tv_end.tv_sec, t->tv_end.tv_usec); - interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec/1000000.0; + interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec / 1000000.0; tsc = t->tsc * tsc_tweak; @@ -1126,11 +1111,9 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_Node)) { if (t) outp += sprintf(outp, "%s%d", - (printed++ ? delim : ""), - cpus[t->cpu_id].physical_node_id); + (printed++ ? delim : ""), cpus[t->cpu_id].physical_node_id); else - outp += sprintf(outp, "%s-", - (printed++ ? delim : "")); + outp += sprintf(outp, "%s-", (printed++ ? delim : "")); } if (DO_BIC(BIC_Core)) { if (c) @@ -1147,22 +1130,22 @@ int format_counters(struct thread_data *t, struct core_data *c, } if (DO_BIC(BIC_Avg_MHz)) - outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), - 1.0 / units * t->aperf / interval_float); + outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float); if (DO_BIC(BIC_Busy)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf / tsc); if (DO_BIC(BIC_Bzy_MHz)) { if (has_base_hz) - outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf); + outp += + sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf); else outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), - tsc / units * t->aperf / t->mperf / interval_float); + tsc / units * t->aperf / t->mperf / interval_float); } if (DO_BIC(BIC_TSC_MHz)) - outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc/units/interval_float); + outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc / units / interval_float); if (DO_BIC(BIC_IPC)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf); @@ -1183,7 +1166,8 @@ int format_counters(struct thread_data *t, struct core_data *c, for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) { if (mp->width == 32) - outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int) t->counter[i]); + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]); else outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->counter[i]); } else if (mp->format == FORMAT_DELTA) { @@ -1193,27 +1177,28 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->counter[i]); } else if (mp->format == FORMAT_PERCENT) { if (mp->type == COUNTER_USEC) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), t->counter[i]/interval_float/10000); + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), + t->counter[i] / interval_float / 10000); else - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i]/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i] / tsc); } } /* C1 */ if (DO_BIC(BIC_CPU_c1)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1/tsc); - + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc); /* print per-core data only for 1st thread in core */ if (!(t->flags & CPU_IS_FIRST_THREAD_IN_CORE)) goto done; if (DO_BIC(BIC_CPU_c3)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3 / tsc); if (DO_BIC(BIC_CPU_c6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6 / tsc); if (DO_BIC(BIC_CPU_c7)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7 / tsc); /* Mod%c6 */ if (DO_BIC(BIC_Mod_c6)) @@ -1225,7 +1210,8 @@ int format_counters(struct thread_data *t, struct core_data *c, for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) { if (mp->width == 32) - outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int) c->counter[i]); + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]); else outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->counter[i]); } else if (mp->format == FORMAT_DELTA) { @@ -1234,14 +1220,15 @@ int format_counters(struct thread_data *t, struct core_data *c, else outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->counter[i]); } else if (mp->format == FORMAT_PERCENT) { - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i]/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i] / tsc); } } fmt8 = "%s%.2f"; if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); @@ -1259,7 +1246,7 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "%s**.**", (printed++ ? delim : "")); } else { outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), - p->gfx_rc6_ms / 10.0 / interval_float); + p->gfx_rc6_ms / 10.0 / interval_float); } } @@ -1273,42 +1260,49 @@ int format_counters(struct thread_data *t, struct core_data *c, /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */ if (DO_BIC(BIC_Totl_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc); if (DO_BIC(BIC_Any_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0 / tsc); if (DO_BIC(BIC_GFX_c0)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0 / tsc); if (DO_BIC(BIC_CPUGFX)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0 / tsc); if (DO_BIC(BIC_Pkgpc2)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2 / tsc); if (DO_BIC(BIC_Pkgpc3)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3 / tsc); if (DO_BIC(BIC_Pkgpc6)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6 / tsc); if (DO_BIC(BIC_Pkgpc7)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7 / tsc); if (DO_BIC(BIC_Pkgpc8)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8 / tsc); if (DO_BIC(BIC_Pkgpc9)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9 / tsc); if (DO_BIC(BIC_Pkgpc10)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc); if (DO_BIC(BIC_CPU_LPI)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float); + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->cpu_lpi / 1000000.0 / interval_float); if (DO_BIC(BIC_SYS_LPI)) - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float); + outp += + sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float); if (DO_BIC(BIC_PkgWatt)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); if (DO_BIC(BIC_GFXWatt)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); if (DO_BIC(BIC_RAMWatt)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), + p->energy_dram * rapl_dram_energy_units / interval_float); if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units); if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) @@ -1318,14 +1312,19 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_RAM_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units); if (DO_BIC(BIC_PKG__)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), + 100.0 * p->rapl_pkg_perf_status * rapl_time_units / interval_float); if (DO_BIC(BIC_RAM__)) - outp += sprintf(outp, fmt8, (printed++ ? delim : ""), 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); + outp += + sprintf(outp, fmt8, (printed++ ? delim : ""), + 100.0 * p->rapl_dram_perf_status * rapl_time_units / interval_float); for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) { if (mp->width == 32) - outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int) p->counter[i]); + outp += + sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]); else outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_DELTA) { @@ -1334,7 +1333,7 @@ int format_counters(struct thread_data *t, struct core_data *c, else outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]); } else if (mp->format == FORMAT_PERCENT) { - outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i]/tsc); + outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc); } } @@ -1359,12 +1358,14 @@ void flush_output_stdout(void) outp = output_buffer; } + void flush_output_stderr(void) { fputs(output_buffer, outf); fflush(outf); outp = output_buffer; } + void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { static int printed; @@ -1385,13 +1386,11 @@ void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_ #define DELTA_WRAP32(new, old) \ old = ((((unsigned long long)new << 32) - ((unsigned long long)old << 32)) >> 32); -int -delta_package(struct pkg_data *new, struct pkg_data *old) +int delta_package(struct pkg_data *new, struct pkg_data *old) { int i; struct msr_counter *mp; - if (DO_BIC(BIC_Totl_c0)) old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0; if (DO_BIC(BIC_Any_c0)) @@ -1416,7 +1415,7 @@ delta_package(struct pkg_data *new, struct pkg_data *old) old->pkg_temp_c = new->pkg_temp_c; /* flag an error when rc6 counter resets/wraps */ - if (old->gfx_rc6_ms > new->gfx_rc6_ms) + if (old->gfx_rc6_ms > new->gfx_rc6_ms) old->gfx_rc6_ms = -1; else old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms; @@ -1441,8 +1440,7 @@ delta_package(struct pkg_data *new, struct pkg_data *old) return 0; } -void -delta_core(struct core_data *new, struct core_data *old) +void delta_core(struct core_data *new, struct core_data *old) { int i; struct msr_counter *mp; @@ -1474,9 +1472,7 @@ int soft_c1_residency_display(int bic) /* * old = new - old */ -int -delta_thread(struct thread_data *new, struct thread_data *old, - struct core_data *core_delta) +int delta_thread(struct thread_data *new, struct thread_data *old, struct core_data *core_delta) { int i; struct msr_counter *mp; @@ -1507,8 +1503,7 @@ delta_thread(struct thread_data *new, struct thread_data *old, old->c1 = new->c1 - old->c1; - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || - soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) { old->aperf = new->aperf - old->aperf; old->mperf = new->mperf - old->mperf; @@ -1517,7 +1512,6 @@ delta_thread(struct thread_data *new, struct thread_data *old, } } - if (use_c1_residency_msr) { /* * Some models have a dedicated C1 residency MSR, @@ -1534,7 +1528,7 @@ delta_thread(struct thread_data *new, struct thread_data *old, else { /* normal case, derive c1 */ old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3 - - core_delta->c6 - core_delta->c7; + - core_delta->c6 - core_delta->c7; } } @@ -1563,8 +1557,7 @@ delta_thread(struct thread_data *new, struct thread_data *old, } int delta_cpu(struct thread_data *t, struct core_data *c, - struct pkg_data *p, struct thread_data *t2, - struct core_data *c2, struct pkg_data *p2) + struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2) { int retval = 0; @@ -1587,7 +1580,7 @@ int delta_cpu(struct thread_data *t, struct core_data *c, void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; - struct msr_counter *mp; + struct msr_counter *mp; t->tv_begin.tv_sec = 0; t->tv_begin.tv_usec = 0; @@ -1654,8 +1647,8 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) p->counter[i] = 0; } -int sum_counters(struct thread_data *t, struct core_data *c, - struct pkg_data *p) + +int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; @@ -1756,12 +1749,12 @@ int sum_counters(struct thread_data *t, struct core_data *c, } return 0; } + /* * sum the counters for all cpus in the system * compute the weighted average */ -void compute_average(struct thread_data *t, struct core_data *c, - struct pkg_data *p) +void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data *p) { int i; struct msr_counter *mp; @@ -1842,7 +1835,7 @@ static unsigned long long rdtsc(void) { unsigned int low, high; - asm volatile("rdtsc" : "=a" (low), "=d" (high)); + asm volatile ("rdtsc":"=a" (low), "=d"(high)); return low | ((unsigned long long)high) << 32; } @@ -1858,6 +1851,7 @@ FILE *fopen_or_die(const char *path, const char *mode) err(1, "%s: open failed", path); return filep; } + /* * snapshot_sysfs_counter() * @@ -1889,8 +1883,7 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) char path[128 + PATH_BYTES]; if (mp->flags & SYSFS_PERCPU) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", - cpu, mp->path); + sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->path); *counterp = snapshot_sysfs_counter(path); } else { @@ -1950,7 +1943,7 @@ void get_apic_id(struct thread_data *t) eax = ebx = ecx = edx = 0; __cpuid(0x80000001, eax, ebx, ecx, edx); - topology_extensions = ecx & (1 << 22); + topology_extensions = ecx & (1 << 22); if (topology_extensions == 0) return; @@ -1973,8 +1966,7 @@ void get_apic_id(struct thread_data *t) t->x2apic_id = edx; if (debug && (t->apic_id != (t->x2apic_id & 0xff))) - fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", - t->cpu_id, t->apic_id, t->x2apic_id); + fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id); } /* @@ -2002,8 +1994,7 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) retry: t->tsc = rdtsc(); /* we are running on local CPU of interest */ - if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || - soft_c1_residency_display(BIC_Avg_MHz)) { + if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || soft_c1_residency_display(BIC_Avg_MHz)) { unsigned long long tsc_before, tsc_between, tsc_after, aperf_time, mperf_time; /* @@ -2050,8 +2041,7 @@ retry: if (aperf_mperf_retry_count < 5) goto retry; else - warnx("cpu%d jitter %lld %lld", - cpu, aperf_time, mperf_time); + warnx("cpu%d jitter %lld %lld", cpu, aperf_time, mperf_time); } aperf_mperf_retry_count = 0; @@ -2252,47 +2242,64 @@ done: * (>= PCL__7) and to index pkg_cstate_limit_strings[]. */ -#define PCLUKN 0 /* Unknown */ -#define PCLRSV 1 /* Reserved */ -#define PCL__0 2 /* PC0 */ -#define PCL__1 3 /* PC1 */ -#define PCL__2 4 /* PC2 */ -#define PCL__3 5 /* PC3 */ -#define PCL__4 6 /* PC4 */ -#define PCL__6 7 /* PC6 */ -#define PCL_6N 8 /* PC6 No Retention */ -#define PCL_6R 9 /* PC6 Retention */ -#define PCL__7 10 /* PC7 */ -#define PCL_7S 11 /* PC7 Shrink */ -#define PCL__8 12 /* PC8 */ -#define PCL__9 13 /* PC9 */ -#define PCL_10 14 /* PC10 */ -#define PCLUNL 15 /* Unlimited */ +#define PCLUKN 0 /* Unknown */ +#define PCLRSV 1 /* Reserved */ +#define PCL__0 2 /* PC0 */ +#define PCL__1 3 /* PC1 */ +#define PCL__2 4 /* PC2 */ +#define PCL__3 5 /* PC3 */ +#define PCL__4 6 /* PC4 */ +#define PCL__6 7 /* PC6 */ +#define PCL_6N 8 /* PC6 No Retention */ +#define PCL_6R 9 /* PC6 Retention */ +#define PCL__7 10 /* PC7 */ +#define PCL_7S 11 /* PC7 Shrink */ +#define PCL__8 12 /* PC8 */ +#define PCL__9 13 /* PC9 */ +#define PCL_10 14 /* PC10 */ +#define PCLUNL 15 /* Unlimited */ int pkg_cstate_limit = PCLUKN; char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", - "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited"}; + "pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited" +}; -int nhm_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int snb_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int hsw_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int slv_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7}; -int amt_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int phi_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int glm_pkg_cstate_limits[16] = {PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int skx_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; -int icx_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV}; +int nhm_pkg_cstate_limits[16] = + { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int snb_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int hsw_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int slv_pkg_cstate_limits[16] = + { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCL__6, PCL__7 }; +int amt_pkg_cstate_limits[16] = + { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int phi_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int glm_pkg_cstate_limits[16] = + { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int skx_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; +int icx_pkg_cstate_limits[16] = + { PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, +PCLRSV, PCLRSV }; -static void -calculate_tsc_tweak() +static void calculate_tsc_tweak() { tsc_tweak = base_hz / tsc_hz; } void prewake_cstate_probe(unsigned int family, unsigned int model); -static void -dump_nhm_platform_info(void) +static void dump_nhm_platform_info(void) { unsigned long long msr; unsigned int ratio; @@ -2302,12 +2309,10 @@ dump_nhm_platform_info(void) fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr); ratio = (msr >> 40) & 0xFF; - fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0xFF; - fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk); get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr); fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n", @@ -2315,14 +2320,12 @@ dump_nhm_platform_info(void) /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */ if (dis_cstate_prewake) - fprintf(outf, "C-state Pre-wake: %sabled\n", - msr & 0x40000000 ? "DIS" : "EN"); + fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN"); return; } -static void -dump_hsw_turbo_ratio_limits(void) +static void dump_hsw_turbo_ratio_limits(void) { unsigned long long msr; unsigned int ratio; @@ -2333,18 +2336,15 @@ dump_hsw_turbo_ratio_limits(void) ratio = (msr >> 8) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 0) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", ratio, bclk, ratio * bclk); return; } -static void -dump_ivt_turbo_ratio_limits(void) +static void dump_ivt_turbo_ratio_limits(void) { unsigned long long msr; unsigned int ratio; @@ -2355,45 +2355,38 @@ dump_ivt_turbo_ratio_limits(void) ratio = (msr >> 56) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 48) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 40) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 32) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 24) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 16) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 0) & 0xFF; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", ratio, bclk, ratio * bclk); return; } + int has_turbo_ratio_group_limits(int family, int model) { @@ -2411,8 +2404,7 @@ int has_turbo_ratio_group_limits(int family, int model) return 0; } -static void -dump_turbo_ratio_limits(int family, int model) +static void dump_turbo_ratio_limits(int family, int model) { unsigned long long msr, core_counts; unsigned int ratio, group_size; @@ -2477,8 +2469,7 @@ dump_turbo_ratio_limits(int family, int model) return; } -static void -dump_atom_turbo_ratio_limits(void) +static void dump_atom_turbo_ratio_limits(void) { unsigned long long msr; unsigned int ratio; @@ -2488,45 +2479,37 @@ dump_atom_turbo_ratio_limits(void) ratio = (msr >> 0) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", ratio, bclk, ratio * bclk); ratio = (msr >> 16) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk); get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr); fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF); ratio = (msr >> 24) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 16) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 8) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", ratio, bclk, ratio * bclk); ratio = (msr >> 0) & 0x3F; if (ratio) - fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", - ratio, bclk, ratio * bclk); + fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", ratio, bclk, ratio * bclk); } -static void -dump_knl_turbo_ratio_limits(void) +static void dump_knl_turbo_ratio_limits(void) { const unsigned int buckets_no = 7; @@ -2538,8 +2521,7 @@ dump_knl_turbo_ratio_limits(void) get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr); - fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", - base_cpu, msr); + fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr); /* * Turbo encoding in KNL is as follows: @@ -2584,8 +2566,7 @@ dump_knl_turbo_ratio_limits(void) ratio[i], bclk, ratio[i] * bclk, cores[i]); } -static void -dump_nhm_cst_cfg(void) +static void dump_nhm_cst_cfg(void) { unsigned long long msr; @@ -2598,14 +2579,11 @@ dump_nhm_cst_cfg(void) (msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "", (msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "", (msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "", - (msr & (1 << 15)) ? "" : "UN", - (unsigned int)msr & 0xF, - pkg_cstate_limit_strings[pkg_cstate_limit]); + (msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]); #define AUTOMATIC_CSTATE_CONVERSION (1UL << 16) if (has_automatic_cstate_conversion) { - fprintf(outf, ", automatic c-state conversion=%s", - (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off"); + fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off"); } fprintf(outf, ")\n"); @@ -2613,8 +2591,7 @@ dump_nhm_cst_cfg(void) return; } -static void -dump_config_tdp(void) +static void dump_config_tdp(void) { unsigned long long msr; @@ -2656,7 +2633,7 @@ dump_config_tdp(void) fprintf(outf, ")\n"); } -unsigned int irtl_time_units[] = {1, 32, 1024, 32768, 1048576, 33554432, 0, 0 }; +unsigned int irtl_time_units[] = { 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 }; void print_irtl(void) { @@ -2696,6 +2673,7 @@ void print_irtl(void) (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]); } + void free_fd_percpu(void) { int i; @@ -2752,7 +2730,6 @@ void free_all_buffers(void) free(cpus); } - /* * Parse a file containing a single int. * Return 0 if file can not be opened @@ -2827,8 +2804,7 @@ void set_node_data(void) * the logical_node_id */ for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) { - if ((cpus[cpux].physical_package_id == pkg) && - (cpus[cpux].physical_node_id == node)) { + if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) { cpus[cpux].logical_node_id = lnode; cpu_count++; } @@ -2850,8 +2826,7 @@ int get_physical_node_id(struct cpu_topology *thiscpu) int cpu = thiscpu->logical_cpu_id; for (i = 0; i <= topo.max_cpu_num; i++) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", - cpu, i); + sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i); filep = fopen(path, "r"); if (!filep) continue; @@ -2881,8 +2856,7 @@ int get_thread_siblings(struct cpu_topology *thiscpu) size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1)); CPU_ZERO_S(size, thiscpu->put_ids); - sprintf(path, - "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu); + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu); filep = fopen(path, "r"); if (!filep) { @@ -2899,10 +2873,8 @@ int get_thread_siblings(struct cpu_topology *thiscpu) sib_core = get_core_id(so); if (sib_core == thiscpu->physical_core_id) { CPU_SET_S(so, size, thiscpu->put_ids); - if ((so != cpu) && - (cpus[so].thread_id < 0)) - cpus[so].thread_id = - thread_id++; + if ((so != cpu) && (cpus[so].thread_id < 0)) + cpus[so].thread_id = thread_id++; } } } @@ -2917,41 +2889,31 @@ int get_thread_siblings(struct cpu_topology *thiscpu) * skip non-present cpus */ -int for_all_cpus_2(int (func)(struct thread_data *, struct core_data *, - struct pkg_data *, struct thread_data *, struct core_data *, - struct pkg_data *), struct thread_data *thread_base, - struct core_data *core_base, struct pkg_data *pkg_base, - struct thread_data *thread_base2, struct core_data *core_base2, - struct pkg_data *pkg_base2) +int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *, + struct pkg_data *, struct thread_data *, struct core_data *, + struct pkg_data *), struct thread_data *thread_base, + struct core_data *core_base, struct pkg_data *pkg_base, + struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2) { int retval, pkg_no, node_no, core_no, thread_no; for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) { for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) { - for (core_no = 0; core_no < topo.cores_per_node; - ++core_no) { - for (thread_no = 0; thread_no < - topo.threads_per_core; ++thread_no) { + for (core_no = 0; core_no < topo.cores_per_node; ++core_no) { + for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) { struct thread_data *t, *t2; struct core_data *c, *c2; struct pkg_data *p, *p2; - t = GET_THREAD(thread_base, thread_no, - core_no, node_no, - pkg_no); + t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no); if (cpu_is_not_present(t->cpu_id)) continue; - t2 = GET_THREAD(thread_base2, thread_no, - core_no, node_no, - pkg_no); + t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no); - c = GET_CORE(core_base, core_no, - node_no, pkg_no); - c2 = GET_CORE(core_base2, core_no, - node_no, - pkg_no); + c = GET_CORE(core_base, core_no, node_no, pkg_no); + c2 = GET_CORE(core_base2, core_no, node_no, pkg_no); p = GET_PKG(pkg_base, pkg_no); p2 = GET_PKG(pkg_base2, pkg_no); @@ -2970,7 +2932,7 @@ int for_all_cpus_2(int (func)(struct thread_data *, struct core_data *, * run func(cpu) on every cpu in /proc/stat * return max_cpu number */ -int for_all_proc_cpus(int (func)(int)) +int for_all_proc_cpus(int (func) (int)) { FILE *fp; int cpu_num; @@ -2990,7 +2952,7 @@ int for_all_proc_cpus(int (func)(int)) retval = func(cpu_num); if (retval) { fclose(fp); - return(retval); + return (retval); } } fclose(fp); @@ -3014,16 +2976,14 @@ void set_max_cpu_num(void) base_cpu = sched_getcpu(); if (base_cpu < 0) err(1, "cannot find calling cpu ID"); - sprintf(pathname, - "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", - base_cpu); + sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu); filep = fopen_or_die(pathname, "r"); topo.max_cpu_num = 0; while (fscanf(filep, "%lx,", &dummy) == 1) topo.max_cpu_num += BITMASK_SIZE; fclose(filep); - topo.max_cpu_num--; /* 0 based */ + topo.max_cpu_num--; /* 0 based */ } /* @@ -3035,6 +2995,7 @@ int count_cpus(int cpu) topo.num_cpus++; return 0; } + int mark_cpu_present(int cpu) { CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set); @@ -3104,12 +3065,12 @@ int snapshot_proc_interrupts(void) } - while (getc(fp) != '\n') - ; /* flush interrupt description */ + while (getc(fp) != '\n') ; /* flush interrupt description */ } return 0; } + /* * snapshot_gfx_rc6_ms() * @@ -3133,6 +3094,7 @@ int snapshot_gfx_rc6_ms(void) return 0; } + /* * snapshot_gfx_mhz() * @@ -3212,6 +3174,7 @@ int snapshot_cpu_lpi_us(void) return 0; } + /* * snapshot_sys_lpi() * @@ -3235,6 +3198,7 @@ int snapshot_sys_lpi_us(void) return 0; } + /* * snapshot /proc and /sys files * @@ -3266,7 +3230,7 @@ int snapshot_proc_sysfs_files(void) int exit_requested; -static void signal_handler (int signal) +static void signal_handler(int signal) { switch (signal) { case SIGINT: @@ -3373,8 +3337,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg continue; ret = get_msr(cpu, offset, &msr_cur); if (ret) { - fprintf(outf, "Can not update msr(0x%llx)\n", - (unsigned long long)offset); + fprintf(outf, "Can not update msr(0x%llx)\n", (unsigned long long)offset); continue; } @@ -3387,8 +3350,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg return 0; } -static void -msr_record_handler(union sigval v) +static void msr_record_handler(union sigval v) { for_all_cpus(update_msr_sum, EVEN_COUNTERS); } @@ -3433,9 +3395,9 @@ void msr_sum_record(void) } return; - release_timer: +release_timer: timer_delete(timerid); - release_msr: +release_msr: free(per_cpu_msr_sum); } @@ -3527,7 +3489,7 @@ void check_dev_msr() sprintf(pathname, "/dev/cpu/%d/msr", base_cpu); if (stat(pathname, &sb)) - if (system("/sbin/modprobe msr > /dev/null 2>&1")) + if (system("/sbin/modprobe msr > /dev/null 2>&1")) err(-5, "no /dev/cpu/0/msr, Try \"# modprobe msr\" "); } @@ -3549,8 +3511,7 @@ int check_for_cap_sys_rawio(void) err(-6, "cap_get\n"); if (cap_flag_value != CAP_SET) { - warnx("capget(CAP_SYS_RAWIO) failed," - " try \"# setcap cap_sys_rawio=ep %s\"", progname); + warnx("capget(CAP_SYS_RAWIO) failed," " try \"# setcap cap_sys_rawio=ep %s\"", progname); return 1; } @@ -3559,6 +3520,7 @@ int check_for_cap_sys_rawio(void) return 0; } + void check_permissions(void) { int do_exit = 0; @@ -3664,7 +3626,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ pkg_cstate_limits = glm_pkg_cstate_limits; break; default: @@ -3680,6 +3642,7 @@ int probe_nhm_msrs(unsigned int family, unsigned int model) has_base_hz = 1; return 1; } + /* * SLV client has support for unique MSRs: * @@ -3700,6 +3663,7 @@ int has_slv_msrs(unsigned int family, unsigned int model) } return 0; } + int is_dnv(unsigned int family, unsigned int model) { @@ -3712,6 +3676,7 @@ int is_dnv(unsigned int family, unsigned int model) } return 0; } + int is_bdx(unsigned int family, unsigned int model) { @@ -3724,6 +3689,7 @@ int is_bdx(unsigned int family, unsigned int model) } return 0; } + int is_skx(unsigned int family, unsigned int model) { @@ -3761,6 +3727,7 @@ int is_ehl(unsigned int family, unsigned int model) } return 0; } + int is_jvl(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3779,7 +3746,7 @@ int has_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; switch (model) { - /* Nehalem compatible, but do not include turbo-ratio limit support */ + /* Nehalem compatible, but do not include turbo-ratio limit support */ case INTEL_FAM6_NEHALEM_EX: /* Nehalem-EX Xeon - Beckton */ case INTEL_FAM6_XEON_PHI_KNL: /* PHI - Knights Landing (different MSR definition) */ return 0; @@ -3787,6 +3754,7 @@ int has_turbo_ratio_limit(unsigned int family, unsigned int model) return 1; } } + int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model) { if (has_slv_msrs(family, model)) @@ -3794,6 +3762,7 @@ int has_atom_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; } + int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3810,6 +3779,7 @@ int has_ivt_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; } } + int has_hsw_turbo_ratio_limit(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3841,6 +3811,7 @@ int has_knl_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; } } + int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3858,6 +3829,7 @@ int has_glm_turbo_ratio_limit(unsigned int family, unsigned int model) return 0; } } + int has_config_tdp(unsigned int family, unsigned int model) { if (!genuine_intel) @@ -3921,8 +3893,7 @@ void check_tcc_offset(int model) } } -static void -remove_underbar(char *s) +static void remove_underbar(char *s) { char *to = s; @@ -3935,8 +3906,7 @@ remove_underbar(char *s) *to = 0; } -static void -dump_cstate_pstate_config_info(unsigned int family, unsigned int model) +static void dump_cstate_pstate_config_info(unsigned int family, unsigned int model) { if (!do_nhm_platform_info) return; @@ -3981,8 +3951,8 @@ static void dump_sysfs_file(char *path) fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf); } -static void -dump_sysfs_cstate_config(void) + +static void dump_sysfs_cstate_config(void) { char path[64]; char name_buf[16]; @@ -4002,15 +3972,14 @@ dump_sysfs_cstate_config(void) for (state = 0; state < 10; ++state) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", - base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; if (!fgets(name_buf, sizeof(name_buf), input)) err(1, "%s: failed to read file", path); - /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ + /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ sp = strchr(name_buf, '-'); if (!sp) sp = strchrnul(name_buf, '\n'); @@ -4019,8 +3988,7 @@ dump_sysfs_cstate_config(void) remove_underbar(name_buf); - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", - base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; @@ -4031,8 +3999,8 @@ dump_sysfs_cstate_config(void) fclose(input); } } -static void -dump_sysfs_pstate_config(void) + +static void dump_sysfs_pstate_config(void) { char path[64]; char driver_buf[64]; @@ -4040,8 +4008,7 @@ dump_sysfs_pstate_config(void) FILE *input; int turbo; - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", - base_cpu); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu); input = fopen(path, "r"); if (input == NULL) { fprintf(outf, "NSFOD %s\n", path); @@ -4051,8 +4018,7 @@ dump_sysfs_pstate_config(void) err(1, "%s: failed to read file", path); fclose(input); - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", - base_cpu); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu); input = fopen(path, "r"); if (input == NULL) { fprintf(outf, "NSFOD %s\n", path); @@ -4084,7 +4050,6 @@ dump_sysfs_pstate_config(void) } } - /* * print_epb() * Decode the ENERGY_PERF_BIAS MSR @@ -4130,6 +4095,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; } + /* * print_hwp() * Decode the MSR_HWP_CAPABILITIES @@ -4156,8 +4122,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PM_ENABLE, &msr)) return 0; - fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", - cpu, msr, (msr & (1 << 0)) ? "" : "No-"); + fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", cpu, msr, (msr & (1 << 0)) ? "" : "No-"); /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */ if ((msr & (1 << 0)) == 0) @@ -4167,25 +4132,23 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) return 0; fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx " - "(high %d guar %d eff %d low %d)\n", - cpu, msr, - (unsigned int)HWP_HIGHEST_PERF(msr), - (unsigned int)HWP_GUARANTEED_PERF(msr), - (unsigned int)HWP_MOSTEFFICIENT_PERF(msr), - (unsigned int)HWP_LOWEST_PERF(msr)); + "(high %d guar %d eff %d low %d)\n", + cpu, msr, + (unsigned int)HWP_HIGHEST_PERF(msr), + (unsigned int)HWP_GUARANTEED_PERF(msr), + (unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr)); if (get_msr(cpu, MSR_HWP_REQUEST, &msr)) return 0; fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx " - "(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n", - cpu, msr, - (unsigned int)(((msr) >> 0) & 0xff), - (unsigned int)(((msr) >> 8) & 0xff), - (unsigned int)(((msr) >> 16) & 0xff), - (unsigned int)(((msr) >> 24) & 0xff), - (unsigned int)(((msr) >> 32) & 0xff3), - (unsigned int)(((msr) >> 42) & 0x1)); + "(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n", + cpu, msr, + (unsigned int)(((msr) >> 0) & 0xff), + (unsigned int)(((msr) >> 8) & 0xff), + (unsigned int)(((msr) >> 16) & 0xff), + (unsigned int)(((msr) >> 24) & 0xff), + (unsigned int)(((msr) >> 32) & 0xff3), (unsigned int)(((msr) >> 42) & 0x1)); if (has_hwp_pkg) { if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr)) @@ -4197,8 +4160,7 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) (unsigned int)(((msr) >> 0) & 0xff), (unsigned int)(((msr) >> 8) & 0xff), (unsigned int)(((msr) >> 16) & 0xff), - (unsigned int)(((msr) >> 24) & 0xff), - (unsigned int)(((msr) >> 32) & 0xff3)); + (unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3)); } if (has_hwp_notify) { if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr)) @@ -4206,18 +4168,14 @@ int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p) fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx " "(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n", - cpu, msr, - ((msr) & 0x1) ? "EN" : "Dis", - ((msr) & 0x2) ? "EN" : "Dis"); + cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis"); } if (get_msr(cpu, MSR_HWP_STATUS, &msr)) return 0; fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx " - "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n", - cpu, msr, - ((msr) & 0x1) ? "" : "No-", - ((msr) & 0x2) ? "" : "No-"); + "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n", + cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x2) ? "" : "No-"); return 0; } @@ -4257,8 +4215,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 5) ? "Auto-HWP, " : "", (msr & 1 << 4) ? "Graphics, " : "", (msr & 1 << 2) ? "bit2, " : "", - (msr & 1 << 1) ? "ThermStatus, " : "", - (msr & 1 << 0) ? "PROCHOT, " : ""); + (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : ""); fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n", (msr & 1 << 31) ? "bit31, " : "", (msr & 1 << 30) ? "bit30, " : "", @@ -4272,8 +4229,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 21) ? "Auto-HWP, " : "", (msr & 1 << 20) ? "Graphics, " : "", (msr & 1 << 18) ? "bit18, " : "", - (msr & 1 << 17) ? "ThermStatus, " : "", - (msr & 1 << 16) ? "PROCHOT, " : ""); + (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : ""); } if (do_gfx_perf_limit_reasons) { @@ -4286,8 +4242,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 6) ? "VR-Therm, " : "", (msr & 1 << 8) ? "Amps, " : "", (msr & 1 << 9) ? "GFXPwr, " : "", - (msr & 1 << 10) ? "PkgPwrL1, " : "", - (msr & 1 << 11) ? "PkgPwrL2, " : ""); + (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n", (msr & 1 << 16) ? "PROCHOT, " : "", (msr & 1 << 17) ? "ThermStatus, " : "", @@ -4295,8 +4250,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 22) ? "VR-Therm, " : "", (msr & 1 << 24) ? "Amps, " : "", (msr & 1 << 25) ? "GFXPwr, " : "", - (msr & 1 << 26) ? "PkgPwrL1, " : "", - (msr & 1 << 27) ? "PkgPwrL2, " : ""); + (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); } if (do_ring_perf_limit_reasons) { get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr); @@ -4306,21 +4260,19 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 6) ? "VR-Therm, " : "", (msr & 1 << 8) ? "Amps, " : "", - (msr & 1 << 10) ? "PkgPwrL1, " : "", - (msr & 1 << 11) ? "PkgPwrL2, " : ""); + (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : ""); fprintf(outf, " (Logged: %s%s%s%s%s%s)\n", (msr & 1 << 16) ? "PROCHOT, " : "", (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 22) ? "VR-Therm, " : "", (msr & 1 << 24) ? "Amps, " : "", - (msr & 1 << 26) ? "PkgPwrL1, " : "", - (msr & 1 << 27) ? "PkgPwrL2, " : ""); + (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : ""); } return 0; } #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ -#define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ +#define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */ double get_tdp_intel(unsigned int model) { @@ -4349,8 +4301,7 @@ double get_tdp_amd(unsigned int family) * rapl_dram_energy_units_probe() * Energy units are either hard-coded, or come from RAPL Energy Unit MSR. */ -static double -rapl_dram_energy_units_probe(int model, double rapl_energy_units) +static double rapl_dram_energy_units_probe(int model, double rapl_energy_units) { /* only called for genuine_intel, family 6 */ @@ -4402,7 +4353,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) BIC_PRESENT(BIC_PkgWatt); break; case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = + RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS + | RAPL_GFX | RAPL_PKG_POWER_INFO; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); @@ -4425,7 +4378,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SKYLAKE_L: /* SKL */ case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_GFX | RAPL_PKG_POWER_INFO; + do_rapl = + RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS + | RAPL_GFX | RAPL_PKG_POWER_INFO; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4445,7 +4400,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) case INTEL_FAM6_SKYLAKE_X: /* SKX */ case INTEL_FAM6_ICELAKE_X: /* ICX */ case INTEL_FAM6_XEON_PHI_KNL: /* KNL */ - do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = + RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | + RAPL_PKG_POWER_INFO; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4458,7 +4415,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) break; case INTEL_FAM6_SANDYBRIDGE_X: case INTEL_FAM6_IVYBRIDGE_X: - do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO; + do_rapl = + RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS | + RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4483,7 +4442,9 @@ void rapl_probe_intel(unsigned int family, unsigned int model) } break; case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS; + do_rapl = + RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | + RAPL_PKG_POWER_INFO | RAPL_CORES_ENERGY_STATUS; BIC_PRESENT(BIC_PKG__); BIC_PRESENT(BIC_RAM__); if (rapl_joules) { @@ -4600,8 +4561,7 @@ void perf_limit_reasons_probe(unsigned int family, unsigned int model) void automatic_cstate_conversion_probe(unsigned int family, unsigned int model) { - if (is_skx(family, model) || is_bdx(family, model) || - is_icx(family, model)) + if (is_skx(family, model) || is_bdx(family, model) || is_icx(family, model)) has_automatic_cstate_conversion = 1; } @@ -4636,8 +4596,7 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p return 0; dts = (msr >> 16) & 0x7F; - fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", - cpu, msr, tj_max - dts); + fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts); if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr)) return 0; @@ -4648,7 +4607,6 @@ int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p cpu, msr, tj_max - dts, tj_max - dts2); } - if (do_dts && debug) { unsigned int resolution; @@ -4678,7 +4636,7 @@ void print_power_limit_msr(int cpu, unsigned long long msr, char *label) cpu, label, ((msr >> 15) & 1) ? "EN" : "DIS", ((msr >> 0) & 0x7FFF) * rapl_power_units, - (1.0 + (((msr >> 22) & 0x3)/4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units, + (1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units, (((msr >> 16) & 1) ? "EN" : "DIS")); return; @@ -4719,12 +4677,11 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (do_rapl & RAPL_PKG_POWER_INFO) { if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr)) - return -5; - + return -5; fprintf(outf, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", cpu, msr, - ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); @@ -4743,17 +4700,17 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) cpu, ((msr >> 47) & 1) ? "EN" : "DIS", ((msr >> 32) & 0x7FFF) * rapl_power_units, - (1.0 + (((msr >> 54) & 0x3)/4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units, + (1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units, ((msr >> 48) & 1) ? "EN" : "DIS"); } if (do_rapl & RAPL_DRAM_POWER_INFO) { if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr)) - return -6; + return -6; fprintf(outf, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n", cpu, msr, - ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, + ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units, ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units); @@ -4762,7 +4719,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 31) & 1 ? "" : "UN"); + cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "DRAM Limit"); } @@ -4776,7 +4733,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 31) & 1 ? "" : "UN"); + cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "Cores Limit"); } if (do_rapl & RAPL_GFX) { @@ -4788,7 +4745,7 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr)) return -9; fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n", - cpu, msr, (msr >> 31) & 1 ? "" : "UN"); + cpu, msr, (msr >> 31) & 1 ? "" : "UN"); print_power_limit_msr(cpu, msr, "GFX Limit"); } return 0; @@ -4810,24 +4767,24 @@ int has_snb_msrs(unsigned int family, unsigned int model) switch (model) { case INTEL_FAM6_SANDYBRIDGE: case INTEL_FAM6_SANDYBRIDGE_X: - case INTEL_FAM6_IVYBRIDGE: /* IVB */ - case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ - case INTEL_FAM6_HASWELL: /* HSW */ - case INTEL_FAM6_HASWELL_X: /* HSW */ - case INTEL_FAM6_HASWELL_L: /* HSW */ - case INTEL_FAM6_HASWELL_G: /* HSW */ - case INTEL_FAM6_BROADWELL: /* BDW */ - case INTEL_FAM6_BROADWELL_G: /* BDW */ - case INTEL_FAM6_BROADWELL_X: /* BDX */ - case INTEL_FAM6_SKYLAKE_L: /* SKL */ - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ - case INTEL_FAM6_SKYLAKE_X: /* SKX */ - case INTEL_FAM6_ICELAKE_X: /* ICX */ - case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ + case INTEL_FAM6_IVYBRIDGE: /* IVB */ + case INTEL_FAM6_IVYBRIDGE_X: /* IVB Xeon */ + case INTEL_FAM6_HASWELL: /* HSW */ + case INTEL_FAM6_HASWELL_X: /* HSW */ + case INTEL_FAM6_HASWELL_L: /* HSW */ + case INTEL_FAM6_HASWELL_G: /* HSW */ + case INTEL_FAM6_BROADWELL: /* BDW */ + case INTEL_FAM6_BROADWELL_G: /* BDW */ + case INTEL_FAM6_BROADWELL_X: /* BDX */ + case INTEL_FAM6_SKYLAKE_L: /* SKL */ + case INTEL_FAM6_CANNONLAKE_L: /* CNL */ + case INTEL_FAM6_SKYLAKE_X: /* SKX */ + case INTEL_FAM6_ICELAKE_X: /* ICX */ + case INTEL_FAM6_ATOM_GOLDMONT: /* BXT */ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_GOLDMONT_D: /* DNV */ - case INTEL_FAM6_ATOM_TREMONT: /* EHL */ - case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ + case INTEL_FAM6_ATOM_TREMONT: /* EHL */ + case INTEL_FAM6_ATOM_TREMONT_D: /* JVL */ return 1; } return 0; @@ -4913,7 +4870,7 @@ int is_cnl(unsigned int family, unsigned int model) return 0; switch (model) { - case INTEL_FAM6_CANNONLAKE_L: /* CNL */ + case INTEL_FAM6_CANNONLAKE_L: /* CNL */ return 1; } @@ -4928,7 +4885,7 @@ unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model) } #define SLM_BCLK_FREQS 5 -double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0}; +double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 }; double slm_bclk(void) { @@ -4979,7 +4936,7 @@ int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p) __cpuid(0x1a, eax, ebx, ecx, edx); eax = (eax >> 24) & 0xFF; - if (eax == 0x20 ) + if (eax == 0x20) t->is_atom = true; return 0; } @@ -5018,8 +4975,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk if (tj_max_override != 0) { tj_max = tj_max_override; - fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", - cpu, tj_max); + fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", cpu, tj_max); return 0; } @@ -5037,16 +4993,15 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk case 4: tcc_offset = (msr >> 24) & 0xF; fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); + cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); break; case 6: tcc_offset = (msr >> 24) & 0x3F; fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n", - cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); + cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset); break; default: - fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", - cpu, msr, tcc_default); + fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default); break; } } @@ -5060,8 +5015,7 @@ int set_temperature_target(struct thread_data *t, struct core_data *c, struct pk guess: tj_max = TJMAX_DEFAULT; - fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", - cpu, tj_max); + fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max); return 0; } @@ -5072,9 +5026,7 @@ void decode_feature_control_msr(void) if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr)) fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n", - base_cpu, msr, - msr & FEAT_CTL_LOCKED ? "" : "UN-", - msr & (1 << 18) ? "SGX" : ""); + base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : ""); } void decode_misc_enable_msr(void) @@ -5102,13 +5054,12 @@ void decode_misc_feature_control(void) return; if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr)) - fprintf(outf, "cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n", - base_cpu, msr, - msr & (0 << 0) ? "No-" : "", - msr & (1 << 0) ? "No-" : "", - msr & (2 << 0) ? "No-" : "", - msr & (3 << 0) ? "No-" : ""); + fprintf(outf, + "cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n", + base_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "", + msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : ""); } + /* * Decode MSR_MISC_PWR_MGMT * @@ -5129,10 +5080,9 @@ void decode_misc_pwr_mgmt_msr(void) if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr)) fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n", base_cpu, msr, - msr & (1 << 0) ? "DIS" : "EN", - msr & (1 << 1) ? "EN" : "DIS", - msr & (1 << 8) ? "EN" : "DIS"); + msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS"); } + /* * Decode MSR_CC6_DEMOTION_POLICY_CONFIG, MSR_MC6_DEMOTION_POLICY_CONFIG * @@ -5158,10 +5108,10 @@ void decode_c6_demotion_policy_msr(void) unsigned int intel_model_duplicates(unsigned int model) { - switch(model) { + switch (model) { case INTEL_FAM6_NEHALEM_EP: /* Core i7, Xeon 5500 series - Bloomfield, Gainstown NHM-EP */ case INTEL_FAM6_NEHALEM: /* Core i7 and i5 Processor - Clarksfield, Lynnfield, Jasper Forest */ - case 0x1F: /* Core i7 and i5 Processor - Nehalem */ + case 0x1F: /* Core i7 and i5 Processor - Nehalem */ case INTEL_FAM6_WESTMERE: /* Westmere Client - Clarkdale, Arrandale */ case INTEL_FAM6_WESTMERE_EP: /* Westmere EP - Gulftown */ return INTEL_FAM6_NEHALEM; @@ -5224,13 +5174,11 @@ void print_dev_latency(void) close(fd); return; } - fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", - value, value == 2000000000 ? "default" : "constrained"); + fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", value, value == 2000000000 ? "default" : "constrained"); close(fd); } - /* * Linux-perf manages the the HW instructions-retired counter * by enabling when requested, and hiding rollover @@ -5296,7 +5244,8 @@ void process_cpuid() if (!quiet) { fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d) microcode 0x%x\n", - family, model, stepping, family, model, stepping, (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); + family, model, stepping, family, model, stepping, + (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF)); fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level); fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx_flags & (1 << 0) ? "SSE3" : "-", @@ -5307,8 +5256,7 @@ void process_cpuid() edx_flags & (1 << 4) ? "TSC" : "-", edx_flags & (1 << 5) ? "MSR" : "-", edx_flags & (1 << 22) ? "ACPI-TM" : "-", - edx_flags & (1 << 28) ? "HT" : "-", - edx_flags & (1 << 29) ? "TM" : "-"); + edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-"); } if (genuine_intel) { model_orig = model; @@ -5364,14 +5312,11 @@ void process_cpuid() has_hwp ? "" : "No-", has_hwp_notify ? "" : "No-", has_hwp_activity_window ? "" : "No-", - has_hwp_epp ? "" : "No-", - has_hwp_pkg ? "" : "No-", - has_epb ? "" : "No-"); + has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-"); if (!quiet) decode_misc_enable_msr(); - if (max_level >= 0x7 && !quiet) { int has_sgx; @@ -5403,7 +5348,7 @@ void process_cpuid() eax_crystal, ebx_tsc, crystal_hz); if (crystal_hz == 0) - switch(model) { + switch (model) { case INTEL_FAM6_SKYLAKE_L: /* SKL */ crystal_hz = 24000000; /* 24.0 MHz */ break; @@ -5416,13 +5361,13 @@ void process_cpuid() break; default: crystal_hz = 0; - } + } if (crystal_hz) { - tsc_hz = (unsigned long long) crystal_hz * ebx_tsc / eax_crystal; + tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal; if (!quiet) fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n", - tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal); + tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal); } } } @@ -5517,10 +5462,9 @@ void process_cpuid() BIC_PRESENT(BIC_CPUGFX); } do_slm_cstates = is_slm(family, model); - do_knl_cstates = is_knl(family, model); + do_knl_cstates = is_knl(family, model); - if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || - is_ehl(family, model)) + if (do_slm_cstates || do_knl_cstates || is_cnl(family, model) || is_ehl(family, model)) BIC_NOT_PRESENT(BIC_CPU_c3); if (!quiet) @@ -5614,7 +5558,7 @@ void topology_probe() if (debug > 1) fprintf(outf, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num); - cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology)); + cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology)); if (cpus == NULL) err(1, "calloc cpus"); @@ -5693,22 +5637,19 @@ void topology_probe() topo.cores_per_node = max_core_id + 1; if (debug > 1) - fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", - max_core_id, topo.cores_per_node); + fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.cores_per_node); if (!summary_only && topo.cores_per_node > 1) BIC_PRESENT(BIC_Core); topo.num_die = max_die_id + 1; if (debug > 1) - fprintf(outf, "max_die_id %d, sizing for %d die\n", - max_die_id, topo.num_die); + fprintf(outf, "max_die_id %d, sizing for %d die\n", max_die_id, topo.num_die); if (!summary_only && topo.num_die > 1) BIC_PRESENT(BIC_Die); topo.num_packages = max_package_id + 1; if (debug > 1) - fprintf(outf, "max_package_id %d, sizing for %d packages\n", - max_package_id, topo.num_packages); + fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages); if (!summary_only && topo.num_packages > 1) BIC_PRESENT(BIC_Package); @@ -5731,21 +5672,15 @@ void topology_probe() fprintf(outf, "cpu %d pkg %d die %d node %d lnode %d core %d thread %d\n", i, cpus[i].physical_package_id, cpus[i].die_id, - cpus[i].physical_node_id, - cpus[i].logical_node_id, - cpus[i].physical_core_id, - cpus[i].thread_id); + cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id); } } -void -allocate_counters(struct thread_data **t, struct core_data **c, - struct pkg_data **p) +void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p) { int i; - int num_cores = topo.cores_per_node * topo.nodes_per_pkg * - topo.num_packages; + int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages; int num_threads = topo.threads_per_core * num_cores; *t = calloc(num_threads, sizeof(struct thread_data)); @@ -5773,13 +5708,13 @@ allocate_counters(struct thread_data **t, struct core_data **c, error: err(1, "calloc counters"); } + /* * init_counter() * * set FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE */ -void init_counter(struct thread_data *thread_base, struct core_data *core_base, - struct pkg_data *pkg_base, int cpu_id) +void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id) { int pkg_id = cpus[cpu_id].physical_package_id; int node_id = cpus[cpu_id].logical_node_id; @@ -5789,7 +5724,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct core_data *c; struct pkg_data *p; - /* Workaround for systems where physical_node_id==-1 * and logical_node_id==(-1 - topo.num_cpus) */ @@ -5811,7 +5745,6 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, p->package_id = pkg_id; } - int initialize_counters(int cpu_id) { init_counter(EVEN_COUNTERS, cpu_id); @@ -5826,12 +5759,14 @@ void allocate_output_buffer() if (outp == NULL) err(-1, "calloc output buffer"); } + void allocate_fd_percpu(void) { fd_percpu = calloc(topo.max_cpu_num + 1, sizeof(int)); if (fd_percpu == NULL) err(-1, "calloc fd_percpu"); } + void allocate_irq_buffers(void) { irq_column_2_cpu = calloc(topo.num_cpus, sizeof(int)); @@ -5842,6 +5777,7 @@ void allocate_irq_buffers(void) if (irqs_per_cpu == NULL) err(-1, "calloc %d", topo.max_cpu_num + 1); } + void setup_all_buffers(void) { topology_probe(); @@ -5872,7 +5808,6 @@ void turbostat_init() process_cpuid(); linux_perf_init(); - if (!quiet) for_all_cpus(print_hwp, ODD_COUNTERS); @@ -5945,7 +5880,7 @@ int fork_it(char **argv) format_all_counters(EVEN_COUNTERS); } - fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec/1000000.0); + fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec / 1000000.0); flush_output_stderr(); @@ -5970,14 +5905,14 @@ int get_and_dump_counters(void) return status; } -void print_version() { - fprintf(outf, "turbostat version 21.03.12" - " - Len Brown \n"); +void print_version() +{ + fprintf(outf, "turbostat version 21.03.12" " - Len Brown \n"); } int add_counter(unsigned int msr_num, char *path, char *name, - unsigned int width, enum counter_scope scope, - enum counter_type type, enum counter_format format, int flags) + unsigned int width, enum counter_scope scope, + enum counter_type type, enum counter_format format, int flags) { struct msr_counter *msrp; @@ -6003,8 +5938,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, sys.tp = msrp; sys.added_thread_counters++; if (sys.added_thread_counters > MAX_ADDED_THREAD_COUNTERS) { - fprintf(stderr, "exceeded max %d added thread counters\n", - MAX_ADDED_COUNTERS); + fprintf(stderr, "exceeded max %d added thread counters\n", MAX_ADDED_COUNTERS); exit(-1); } break; @@ -6014,8 +5948,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, sys.cp = msrp; sys.added_core_counters++; if (sys.added_core_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added core counters\n", - MAX_ADDED_COUNTERS); + fprintf(stderr, "exceeded max %d added core counters\n", MAX_ADDED_COUNTERS); exit(-1); } break; @@ -6025,8 +5958,7 @@ int add_counter(unsigned int msr_num, char *path, char *name, sys.pp = msrp; sys.added_package_counters++; if (sys.added_package_counters > MAX_ADDED_COUNTERS) { - fprintf(stderr, "exceeded max %d added package counters\n", - MAX_ADDED_COUNTERS); + fprintf(stderr, "exceeded max %d added package counters\n", MAX_ADDED_COUNTERS); exit(-1); } break; @@ -6163,15 +6095,14 @@ void probe_sysfs(void) for (state = 10; state >= 0; --state) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", - base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; if (!fgets(name_buf, sizeof(name_buf), input)) err(1, "%s: failed to read file", path); - /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ + /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ sp = strchr(name_buf, '-'); if (!sp) sp = strchrnul(name_buf, '\n'); @@ -6187,20 +6118,18 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, - FORMAT_PERCENT, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU); } for (state = 10; state >= 0; --state) { - sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", - base_cpu, state); + sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state); input = fopen(path, "r"); if (input == NULL) continue; if (!fgets(name_buf, sizeof(name_buf), input)) err(1, "%s: failed to read file", path); - /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ + /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */ sp = strchr(name_buf, '-'); if (!sp) sp = strchrnul(name_buf, '\n'); @@ -6214,13 +6143,11 @@ void probe_sysfs(void) if (is_deferred_skip(name_buf)) continue; - add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, - FORMAT_DELTA, SYSFS_PERCPU); + add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU); } } - /* * parse cpuset with following syntax * 1,2,4..6,8-10 and set bits in cpu_subset @@ -6307,37 +6234,35 @@ error: exit(-1); } - void cmdline(int argc, char **argv) { int opt; int option_index = 0; static struct option long_options[] = { - {"add", required_argument, 0, 'a'}, - {"cpu", required_argument, 0, 'c'}, - {"Dump", no_argument, 0, 'D'}, - {"debug", no_argument, 0, 'd'}, /* internal, not documented */ - {"enable", required_argument, 0, 'e'}, - {"interval", required_argument, 0, 'i'}, - {"IPC", no_argument, 0, 'I'}, - {"num_iterations", required_argument, 0, 'n'}, - {"help", no_argument, 0, 'h'}, - {"hide", required_argument, 0, 'H'}, // meh, -h taken by --help - {"Joules", no_argument, 0, 'J'}, - {"list", no_argument, 0, 'l'}, - {"out", required_argument, 0, 'o'}, - {"quiet", no_argument, 0, 'q'}, - {"show", required_argument, 0, 's'}, - {"Summary", no_argument, 0, 'S'}, - {"TCC", required_argument, 0, 'T'}, - {"version", no_argument, 0, 'v' }, - {0, 0, 0, 0 } + { "add", required_argument, 0, 'a' }, + { "cpu", required_argument, 0, 'c' }, + { "Dump", no_argument, 0, 'D' }, + { "debug", no_argument, 0, 'd' }, /* internal, not documented */ + { "enable", required_argument, 0, 'e' }, + { "interval", required_argument, 0, 'i' }, + { "IPC", no_argument, 0, 'I' }, + { "num_iterations", required_argument, 0, 'n' }, + { "help", no_argument, 0, 'h' }, + { "hide", required_argument, 0, 'H' }, // meh, -h taken by --help + { "Joules", no_argument, 0, 'J' }, + { "list", no_argument, 0, 'l' }, + { "out", required_argument, 0, 'o' }, + { "quiet", no_argument, 0, 'q' }, + { "show", required_argument, 0, 's' }, + { "Summary", no_argument, 0, 'S' }, + { "TCC", required_argument, 0, 'T' }, + { "version", no_argument, 0, 'v' }, + { 0, 0, 0, 0 } }; progname = argv[0]; - while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", - long_options, &option_index)) != -1) { + while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qST:v", long_options, &option_index)) != -1) { switch (opt) { case 'a': parse_add_command(optarg); @@ -6372,8 +6297,7 @@ void cmdline(int argc, char **argv) double interval = strtod(optarg, NULL); if (interval < 0.001) { - fprintf(outf, "interval %f seconds is too small\n", - interval); + fprintf(outf, "interval %f seconds is too small\n", interval); exit(2); } @@ -6400,8 +6324,7 @@ void cmdline(int argc, char **argv) num_iterations = strtod(optarg, NULL); if (num_iterations <= 0) { - fprintf(outf, "iterations %d should be positive number\n", - num_iterations); + fprintf(outf, "iterations %d should be positive number\n", num_iterations); exit(2); } break; From 38c6663a68903cf1187003129cd1873551979865 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 4 May 2021 19:27:34 -0400 Subject: [PATCH 1026/1379] tools/power turbostat: elevate priority of interval mode This makes interval mode less likely to see delayed results on a heavily loaded system. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 66 +++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 13805e460a4d..a0dc39d682cd 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2266,31 +2266,48 @@ char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2", int nhm_pkg_cstate_limits[16] = { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int snb_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int hsw_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int slv_pkg_cstate_limits[16] = { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCL__6, PCL__7 }; + PCL__6, PCL__7 +}; + int amt_pkg_cstate_limits[16] = { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int phi_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int glm_pkg_cstate_limits[16] = { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int skx_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; + int icx_pkg_cstate_limits[16] = { PCL__0, PCL__2, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, -PCLRSV, PCLRSV }; + PCLRSV, PCLRSV +}; static void calculate_tsc_tweak() { @@ -3401,6 +3418,32 @@ release_msr: free(per_cpu_msr_sum); } +/* + * set_my_sched_priority(pri) + * return previous + */ +int set_my_sched_priority(int priority) +{ + int retval; + int original_priority; + + errno = 0; + original_priority = getpriority(PRIO_PROCESS, 0); + if (errno && (original_priority == -1)) + err(errno, "getpriority"); + + retval = setpriority(PRIO_PROCESS, 0, priority); + if (retval) + err(retval, "setpriority(%d)", priority); + + errno = 0; + retval = getpriority(PRIO_PROCESS, 0); + if (retval != priority) + err(-1, "getpriority(%d) != setpriority(%d)", retval, priority); + + return original_priority; +} + void turbostat_loop() { int retval; @@ -3409,6 +3452,11 @@ void turbostat_loop() setup_signal_handler(); + /* + * elevate own priority for interval mode + */ + set_my_sched_priority(-20); + restart: restarted++; From b60c573dc241ab3a8719e990d86a0011b79eebcb Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 4 May 2021 19:56:17 -0400 Subject: [PATCH 1027/1379] tools/power turbostat: Support "turbostat --hide idle" As idle, in particular, can have many columns on some machines... Make it easy to ignore them all at once. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 6 ++++-- tools/power/x86/turbostat/turbostat.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index f6b7e85b121c..9b17097bc3d7 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -54,12 +54,14 @@ name as necessary to disambiguate it from others is necessary. Note that option .PP \fB--cpu cpu-set\fP limit output to system summary plus the specified cpu-set. If cpu-set is the string "core", then the system summary plus the first CPU in each core are printed -- eg. subsequent HT siblings are not printed. Or if cpu-set is the string "package", then the system summary plus the first CPU in each package is printed. Otherwise, the system summary plus the specified set of CPUs are printed. The cpu-set is ordered from low to high, comma delimited with ".." and "-" permitted to denote a range. eg. 1,2,8,14..17,21-44 .PP -\fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. Use "--hide sysfs" to hide the sysfs statistics columns as a group. +\fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. .PP \fB--enable column\fP show the specified built-in columns, which are otherwise disabled, by default. Currently the only built-in counters disabled by default are "usec", "Time_Of_Day_Seconds", "APIC" and "X2APIC". The column name "all" can be used to enable all disabled-by-default built-in counters. .PP -\fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. Use "--show sysfs" to show the sysfs statistics columns as a group. +\fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. +.PP +\fB--show CATEGORY --hide CATEGORY\fP Show and hide also accept a single CATEGORY of columns: "all", "topology", "idle", "frequency", "power", "sysfs", "other". .PP \fB--Dump\fP displays the raw counter values. .PP diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index a0dc39d682cd..4bb08de4cb71 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -667,6 +667,12 @@ struct msr_counter bic[] = { #define BIC_GFXACTMHz (1ULL << 51) #define BIC_IPC (1ULL << 52) +#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die ) +#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__) +#define BIC_FREQUENCY ( BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz ) +#define BIC_IDLE ( BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX) +#define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC) + #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); @@ -748,6 +754,18 @@ unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode) if (!strcmp(name_list, "all")) return ~0; + if (!strcmp(name_list, "topology")) + return BIC_TOPOLOGY; + if (!strcmp(name_list, "power")) + return BIC_THERMAL_PWR; + if (!strcmp(name_list, "idle")) + return BIC_IDLE; + if (!strcmp(name_list, "frequency")) + return BIC_FREQUENCY; + if (!strcmp(name_list, "other")) + return BIC_OTHER; + if (!strcmp(name_list, "all")) + return 0; for (i = 0; i < MAX_BIC; ++i) { if (!strcmp(name_list, bic[i].name)) { From 3c070b2abf85b92455c2721d0a9edc68893ab6c1 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Tue, 4 May 2021 19:58:08 -0400 Subject: [PATCH 1028/1379] tools/power turbostat: version 2021.05.04 Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 4bb08de4cb71..47d3ba895d6d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -3,7 +3,7 @@ * turbostat -- show CPU frequency and C-state residency * on modern Intel and AMD processors. * - * Copyright (c) 2013 Intel Corporation. + * Copyright (c) 2021 Intel Corporation. * Len Brown */ @@ -5973,7 +5973,7 @@ int get_and_dump_counters(void) void print_version() { - fprintf(outf, "turbostat version 21.03.12" " - Len Brown \n"); + fprintf(outf, "turbostat version 21.05.04" " - Len Brown \n"); } int add_counter(unsigned int msr_num, char *path, char *name, From 025768a966a3dde8455de46d1f121a51bacb6a77 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 4 May 2021 14:07:53 -0700 Subject: [PATCH 1029/1379] x86/cpu: Use alternative to generate the TASK_SIZE_MAX constant We used to generate this constant with static jumps, which certainly works, but generates some quite unreadable and horrid code, and extra jumps. It's actually much simpler to just use our alternative_asm() infrastructure to generate a simple alternative constant, making the generated code much more obvious (and straight-line rather than "jump around to load the right constant"). Acked-by: Borislav Petkov Signed-off-by: Linus Torvalds Signed-off-by: Ingo Molnar Cc: Thomas Gleixner Cc: Ingo Molnar --- arch/x86/include/asm/page_64.h | 33 ++++++++++++++++++++++++++++ arch/x86/include/asm/page_64_types.h | 23 +++---------------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 939b1cff4a7b..ca840fec7776 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -56,6 +56,39 @@ static inline void clear_page(void *page) void copy_page(void *to, void *from); +#ifdef CONFIG_X86_5LEVEL +/* + * User space process size. This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. + * They'll speculate right off the end of the canonical space, and + * bad things happen. This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... [stay tuned] + */ +static inline unsigned long task_size_max(void) +{ + unsigned long ret; + + alternative_io("movq %[small],%0","movq %[large],%0", + X86_FEATURE_LA57, + "=r" (ret), + [small] "i" ((1ul << 47)-PAGE_SIZE), + [large] "i" ((1ul << 56)-PAGE_SIZE)); + + return ret; +} +#endif /* CONFIG_X86_5LEVEL */ + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 64297eabad63..a8d4ad856568 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -55,30 +55,13 @@ #ifdef CONFIG_X86_5LEVEL #define __VIRTUAL_MASK_SHIFT (pgtable_l5_enabled() ? 56 : 47) +/* See task_size_max() in */ #else #define __VIRTUAL_MASK_SHIFT 47 +#define task_size_max() ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) #endif -/* - * User space process size. This is the first address outside the user range. - * There are a few constraints that determine this: - * - * On Intel CPUs, if a SYSCALL instruction is at the highest canonical - * address, then that syscall will enter the kernel with a - * non-canonical return address, and SYSRET will explode dangerously. - * We avoid this particular problem by preventing anything - * from being mapped at the maximum canonical address. - * - * On AMD CPUs in the Ryzen family, there's a nasty bug in which the - * CPUs malfunction if they execute code from the highest canonical page. - * They'll speculate right off the end of the canonical space, and - * bad things happen. This is worked around in the same way as the - * Intel problem. - * - * With page table isolation enabled, we map the LDT in ... [stay tuned] - */ -#define TASK_SIZE_MAX ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) - +#define TASK_SIZE_MAX task_size_max() #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) /* This decides where the kernel will search for a free chunk of vm From 98635b29a73f1a49ab6882ae58d56c9cd5ecb902 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Mar 2021 10:13:54 +0100 Subject: [PATCH 1030/1379] lib: bitmap: remove the 'extern' keyword from function declarations The 'extern' keyword doesn't have any benefits for functions in header files. Remove it. Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko --- include/linux/bitmap.h | 113 ++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 57 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 70a932470b2d..6939a8983026 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -118,54 +118,53 @@ * Allocation and deallocation of bitmap. * Provided in lib/bitmap.c to avoid circular dependency. */ -extern unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); -extern unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); -extern void bitmap_free(const unsigned long *bitmap); +unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); +unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); +void bitmap_free(const unsigned long *bitmap); /* * lib/bitmap.c provides these functions: */ -extern int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern bool __pure __bitmap_or_equal(const unsigned long *src1, - const unsigned long *src2, - const unsigned long *src3, - unsigned int nbits); -extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, - unsigned int nbits); -extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits); -extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, - unsigned int shift, unsigned int nbits); -extern void bitmap_cut(unsigned long *dst, const unsigned long *src, - unsigned int first, unsigned int cut, - unsigned int nbits); -extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, +int __bitmap_equal(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +bool __pure __bitmap_or_equal(const unsigned long *src1, + const unsigned long *src2, + const unsigned long *src3, + unsigned int nbits); +void __bitmap_complement(unsigned long *dst, const unsigned long *src, + unsigned int nbits); +void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits); +void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, + unsigned int shift, unsigned int nbits); +void bitmap_cut(unsigned long *dst, const unsigned long *src, + unsigned int first, unsigned int cut, unsigned int nbits); +int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +void __bitmap_replace(unsigned long *dst, + const unsigned long *old, const unsigned long *new, + const unsigned long *mask, unsigned int nbits); +int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); -extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern void __bitmap_replace(unsigned long *dst, - const unsigned long *old, const unsigned long *new, - const unsigned long *mask, unsigned int nbits); -extern int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, unsigned int nbits); -extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); -extern void __bitmap_set(unsigned long *map, unsigned int start, int len); -extern void __bitmap_clear(unsigned long *map, unsigned int start, int len); +int __bitmap_subset(const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); +int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); +void __bitmap_set(unsigned long *map, unsigned int start, int len); +void __bitmap_clear(unsigned long *map, unsigned int start, int len); -extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map, - unsigned long size, - unsigned long start, - unsigned int nr, - unsigned long align_mask, - unsigned long align_offset); +unsigned long bitmap_find_next_zero_area_off(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned int nr, + unsigned long align_mask, + unsigned long align_offset); /** * bitmap_find_next_zero_area - find a contiguous aligned zero area @@ -190,33 +189,33 @@ bitmap_find_next_zero_area(unsigned long *map, align_mask, 0); } -extern int bitmap_parse(const char *buf, unsigned int buflen, +int bitmap_parse(const char *buf, unsigned int buflen, unsigned long *dst, int nbits); -extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, +int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); -extern int bitmap_parselist(const char *buf, unsigned long *maskp, +int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits); -extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, +int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); -extern void bitmap_remap(unsigned long *dst, const unsigned long *src, +void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, unsigned int nbits); -extern int bitmap_bitremap(int oldbit, +int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); -extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, +void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, unsigned int bits); -extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, +void bitmap_fold(unsigned long *dst, const unsigned long *orig, unsigned int sz, unsigned int nbits); -extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); -extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); -extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); +int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); +void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); +int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); #ifdef __BIG_ENDIAN -extern void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); +void bitmap_copy_le(unsigned long *dst, const unsigned long *src, unsigned int nbits); #else #define bitmap_copy_le bitmap_copy #endif -extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); -extern int bitmap_print_to_pagebuf(bool list, char *buf, +unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); +int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) @@ -265,9 +264,9 @@ static inline void bitmap_copy_clear_tail(unsigned long *dst, * therefore conversion is not needed when copying data from/to arrays of u32. */ #if BITS_PER_LONG == 64 -extern void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, +void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits); -extern void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, +void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits); #else #define bitmap_from_arr32(bitmap, buf, nbits) \ From c13656b904b6173aad723d9680a81c60de2f5edc Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Mar 2021 10:13:55 +0100 Subject: [PATCH 1031/1379] lib: bitmap: order includes alphabetically For better readability and maintenance: order the includes in bitmap source files alphabetically. Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko --- include/linux/bitmap.h | 4 ++-- lib/bitmap.c | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 6939a8983026..3282db97e06c 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -4,10 +4,10 @@ #ifndef __ASSEMBLY__ -#include #include -#include #include +#include +#include /* * bitmaps provide bit arrays that consume one or more unsigned diff --git a/lib/bitmap.c b/lib/bitmap.c index 75006c4036e9..78f70d9007ad 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -3,17 +3,18 @@ * lib/bitmap.c * Helper functions for bitmap.h. */ -#include -#include -#include -#include + #include #include #include +#include +#include +#include #include #include #include #include +#include #include #include From e829c2e4744850bab4d8f8ffebd00df10b4c6c2b Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 15 Mar 2021 10:13:56 +0100 Subject: [PATCH 1032/1379] lib: bitmap: provide devm_bitmap_alloc() and devm_bitmap_zalloc() Provide managed variants of bitmap_alloc() and bitmap_zalloc(). Signed-off-by: Bartosz Golaszewski Reviewed-by: Andy Shevchenko --- include/linux/bitmap.h | 8 ++++++++ lib/bitmap.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 3282db97e06c..73d039476fa4 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -9,6 +9,8 @@ #include #include +struct device; + /* * bitmaps provide bit arrays that consume one or more unsigned * longs. The bitmap interface and available operations are listed @@ -122,6 +124,12 @@ unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags); unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags); void bitmap_free(const unsigned long *bitmap); +/* Managed variants of the above. */ +unsigned long *devm_bitmap_alloc(struct device *dev, + unsigned int nbits, gfp_t flags); +unsigned long *devm_bitmap_zalloc(struct device *dev, + unsigned int nbits, gfp_t flags); + /* * lib/bitmap.c provides these functions: */ diff --git a/lib/bitmap.c b/lib/bitmap.c index 78f70d9007ad..27e08c0e547e 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -1263,6 +1264,38 @@ void bitmap_free(const unsigned long *bitmap) } EXPORT_SYMBOL(bitmap_free); +static void devm_bitmap_free(void *data) +{ + unsigned long *bitmap = data; + + bitmap_free(bitmap); +} + +unsigned long *devm_bitmap_alloc(struct device *dev, + unsigned int nbits, gfp_t flags) +{ + unsigned long *bitmap; + int ret; + + bitmap = bitmap_alloc(nbits, flags); + if (!bitmap) + return NULL; + + ret = devm_add_action_or_reset(dev, devm_bitmap_free, bitmap); + if (ret) + return NULL; + + return bitmap; +} +EXPORT_SYMBOL_GPL(devm_bitmap_alloc); + +unsigned long *devm_bitmap_zalloc(struct device *dev, + unsigned int nbits, gfp_t flags) +{ + return devm_bitmap_alloc(dev, nbits, flags | __GFP_ZERO); +} +EXPORT_SYMBOL_GPL(devm_bitmap_zalloc); + #if BITS_PER_LONG == 64 /** * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap From 3eb52226de6f14d9409fd5485e7bdb8430bf8449 Mon Sep 17 00:00:00 2001 From: Alexander Dahl Date: Mon, 29 Mar 2021 13:16:47 +0200 Subject: [PATCH 1033/1379] docs: kernel-parameters: Move gpio-mockup for alphabetic order All other sections are ordered alphabetically so do the same for gpio-mockup. Fixes: 0f98dd1b27d2 ("gpio/mockup: add virtual gpio device") Signed-off-by: Alexander Dahl Signed-off-by: Bartosz Golaszewski --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 04545725f187..782dc6d9b7fb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1461,6 +1461,10 @@ Don't use this when you are not running on the android emulator + gpio-mockup.gpio_mockup_ranges + [HW] Sets the ranges of gpiochip of for this device. + Format: ,,,... + gpt [EFI] Forces disk with valid GPT signature but invalid Protective MBR to be treated as GPT. If the primary GPT is corrupted, it enables the backup/alternate @@ -1484,10 +1488,6 @@ Format: such that (rxsize & ~0x1fffc0) == 0. Default: 1024 - gpio-mockup.gpio_mockup_ranges - [HW] Sets the ranges of gpiochip of for this device. - Format: ,,,... - hardlockup_all_cpu_backtrace= [KNL] Should the hard-lockup detector generate backtraces on all cpus. From 6984a320349d61e6bcf3aa03d750a78d70ca98ad Mon Sep 17 00:00:00 2001 From: Alexander Dahl Date: Mon, 29 Mar 2021 13:16:48 +0200 Subject: [PATCH 1034/1379] docs: kernel-parameters: Add gpio_mockup_named_lines Missing since introduced in the driver. Fixes: 8a68ea00a62e ("gpio: mockup: implement naming the lines") Signed-off-by: Alexander Dahl Signed-off-by: Bartosz Golaszewski --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 782dc6d9b7fb..4b12f944ca44 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1464,6 +1464,8 @@ gpio-mockup.gpio_mockup_ranges [HW] Sets the ranges of gpiochip of for this device. Format: ,,,... + gpio-mockup.gpio_mockup_named_lines + [HW] Let the driver know GPIO lines should be named. gpt [EFI] Forces disk with valid GPT signature but invalid Protective MBR to be treated as GPT. If the From 951f7da9f60bf62d26dd0f8b71d5671ab3929ba2 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Tue, 30 Mar 2021 19:48:42 +0200 Subject: [PATCH 1035/1379] dt-bindings: gpio: Binding for Realtek Otto GPIO Add a binding description for Realtek's GPIO controller found on several of their MIPS-based SoCs (codenamed Otto), such as the RTL838x and RTL839x series of switch SoCs. A fallback binding 'realtek,otto-gpio' is provided for cases where the actual port ordering is not known yet, and enabling the interrupt controller may result in uncaught interrupts. Signed-off-by: Sander Vanheule Reviewed-by: Linus Walleij Reviewed-by: Rob Herring Signed-off-by: Bartosz Golaszewski --- .../bindings/gpio/realtek,otto-gpio.yaml | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml diff --git a/Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml b/Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml new file mode 100644 index 000000000000..100f20cebd76 --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/realtek,otto-gpio.yaml @@ -0,0 +1,78 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/gpio/realtek,otto-gpio.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Realtek Otto GPIO controller + +maintainers: + - Sander Vanheule + - Bert Vermeulen + +description: | + Realtek's GPIO controller on their MIPS switch SoCs (Otto platform) consists + of two banks of 32 GPIOs. These GPIOs can generate edge-triggered interrupts. + Each bank's interrupts are cascased into one interrupt line on the parent + interrupt controller, if provided. + This binding allows defining a single bank in the devicetree. The interrupt + controller is not supported on the fallback compatible name, which only + allows for GPIO port use. + +properties: + $nodename: + pattern: "^gpio@[0-9a-f]+$" + + compatible: + items: + - enum: + - realtek,rtl8380-gpio + - realtek,rtl8390-gpio + - const: realtek,otto-gpio + + reg: + maxItems: 1 + + "#gpio-cells": + const: 2 + + gpio-controller: true + + ngpios: + minimum: 1 + maximum: 32 + + interrupt-controller: true + + "#interrupt-cells": + const: 2 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - "#gpio-cells" + - gpio-controller + +additionalProperties: false + +dependencies: + interrupt-controller: [ interrupts ] + +examples: + - | + gpio@3500 { + compatible = "realtek,rtl8380-gpio", "realtek,otto-gpio"; + reg = <0x3500 0x1c>; + gpio-controller; + #gpio-cells = <2>; + ngpios = <24>; + interrupt-controller; + #interrupt-cells = <2>; + interrupt-parent = <&rtlintc>; + interrupts = <23>; + }; + +... From 0d82fb1127fb7cc8287614eb0992acb0583bc323 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Tue, 30 Mar 2021 19:48:43 +0200 Subject: [PATCH 1036/1379] gpio: Add Realtek Otto GPIO support Realtek MIPS SoCs (platform name Otto) have GPIO controllers with up to 64 GPIOs, divided over two banks. Each bank has a set of registers for 32 GPIOs, with support for edge-triggered interrupts. Each GPIO bank consists of four 8-bit GPIO ports (ABCD and EFGH). Most registers pack one bit per GPIO, except for the IMR register, which packs two bits per GPIO (AB-CD). Although the byte order is currently assumed to have port A..D at offset 0x0..0x3, this has been observed to be reversed on other, Lexra-based, SoCs (e.g. RTL8196E/97D/97F). Interrupt support is disabled for the fallback devicetree-compatible 'realtek,otto-gpio'. This allows for quick support of GPIO banks in which the byte order would be unknown. In this case, the port ordering in the IMR registers may not match the reversed order in the other registers (DCBA, and BA-DC or DC-BA). Signed-off-by: Sander Vanheule Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 13 ++ drivers/gpio/Makefile | 1 + drivers/gpio/gpio-realtek-otto.c | 325 +++++++++++++++++++++++++++++++ 3 files changed, 339 insertions(+) create mode 100644 drivers/gpio/gpio-realtek-otto.c diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index e3607ec4c2e8..6fb13d6507db 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -502,6 +502,19 @@ config GPIO_RDA help Say Y here to support RDA Micro GPIO controller. +config GPIO_REALTEK_OTTO + tristate "Realtek Otto GPIO support" + depends on MACH_REALTEK_RTL + default MACH_REALTEK_RTL + select GPIO_GENERIC + select GPIOLIB_IRQCHIP + help + The GPIO controller on the Otto MIPS platform supports up to two + banks of 32 GPIOs, with edge triggered interrupts. The 32 GPIOs + are grouped in four 8-bit wide ports. + + When built as a module, the module will be called realtek_otto_gpio. + config GPIO_REG bool help diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index c58a90a3c3b1..8ace5934e3c3 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -124,6 +124,7 @@ obj-$(CONFIG_GPIO_RC5T583) += gpio-rc5t583.o obj-$(CONFIG_GPIO_RCAR) += gpio-rcar.o obj-$(CONFIG_GPIO_RDA) += gpio-rda.o obj-$(CONFIG_GPIO_RDC321X) += gpio-rdc321x.o +obj-$(CONFIG_GPIO_REALTEK_OTTO) += gpio-realtek-otto.o obj-$(CONFIG_GPIO_REG) += gpio-reg.o obj-$(CONFIG_ARCH_SA1100) += gpio-sa1100.o obj-$(CONFIG_GPIO_SAMA5D2_PIOBU) += gpio-sama5d2-piobu.o diff --git a/drivers/gpio/gpio-realtek-otto.c b/drivers/gpio/gpio-realtek-otto.c new file mode 100644 index 000000000000..cb64fb5a51aa --- /dev/null +++ b/drivers/gpio/gpio-realtek-otto.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include + +/* + * Total register block size is 0x1C for one bank of four ports (A, B, C, D). + * An optional second bank, with ports E, F, G, and H, may be present, starting + * at register offset 0x1C. + */ + +/* + * Pin select: (0) "normal", (1) "dedicate peripheral" + * Not used on RTL8380/RTL8390, peripheral selection is managed by control bits + * in the peripheral registers. + */ +#define REALTEK_GPIO_REG_CNR 0x00 +/* Clear bit (0) for input, set bit (1) for output */ +#define REALTEK_GPIO_REG_DIR 0x08 +#define REALTEK_GPIO_REG_DATA 0x0C +/* Read bit for IRQ status, write 1 to clear IRQ */ +#define REALTEK_GPIO_REG_ISR 0x10 +/* Two bits per GPIO in IMR registers */ +#define REALTEK_GPIO_REG_IMR 0x14 +#define REALTEK_GPIO_REG_IMR_AB 0x14 +#define REALTEK_GPIO_REG_IMR_CD 0x18 +#define REALTEK_GPIO_IMR_LINE_MASK GENMASK(1, 0) +#define REALTEK_GPIO_IRQ_EDGE_FALLING 1 +#define REALTEK_GPIO_IRQ_EDGE_RISING 2 +#define REALTEK_GPIO_IRQ_EDGE_BOTH 3 + +#define REALTEK_GPIO_MAX 32 +#define REALTEK_GPIO_PORTS_PER_BANK 4 + +/** + * realtek_gpio_ctrl - Realtek Otto GPIO driver data + * + * @gc: Associated gpio_chip instance + * @base: Base address of the register block for a GPIO bank + * @lock: Lock for accessing the IRQ registers and values + * @intr_mask: Mask for interrupts lines + * @intr_type: Interrupt type selection + * + * Because the interrupt mask register (IMR) combines the function of IRQ type + * selection and masking, two extra values are stored. @intr_mask is used to + * mask/unmask the interrupts for a GPIO port, and @intr_type is used to store + * the selected interrupt types. The logical AND of these values is written to + * IMR on changes. + */ +struct realtek_gpio_ctrl { + struct gpio_chip gc; + void __iomem *base; + raw_spinlock_t lock; + u16 intr_mask[REALTEK_GPIO_PORTS_PER_BANK]; + u16 intr_type[REALTEK_GPIO_PORTS_PER_BANK]; +}; + +/* Expand with more flags as devices with other quirks are added */ +enum realtek_gpio_flags { + /* + * Allow disabling interrupts, for cases where the port order is + * unknown. This may result in a port mismatch between ISR and IMR. + * An interrupt would appear to come from a different line than the + * line the IRQ handler was assigned to, causing uncaught interrupts. + */ + GPIO_INTERRUPTS_DISABLED = BIT(0), +}; + +static struct realtek_gpio_ctrl *irq_data_to_ctrl(struct irq_data *data) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(data); + + return container_of(gc, struct realtek_gpio_ctrl, gc); +} + +/* + * Normal port order register access + * + * Port information is stored with the first port at offset 0, followed by the + * second, etc. Most registers store one bit per GPIO and use a u8 value per + * port. The two interrupt mask registers store two bits per GPIO, so use u16 + * values. + */ +static void realtek_gpio_write_imr(struct realtek_gpio_ctrl *ctrl, + unsigned int port, u16 irq_type, u16 irq_mask) +{ + iowrite16(irq_type & irq_mask, ctrl->base + REALTEK_GPIO_REG_IMR + 2 * port); +} + +static void realtek_gpio_clear_isr(struct realtek_gpio_ctrl *ctrl, + unsigned int port, u8 mask) +{ + iowrite8(mask, ctrl->base + REALTEK_GPIO_REG_ISR + port); +} + +static u8 realtek_gpio_read_isr(struct realtek_gpio_ctrl *ctrl, unsigned int port) +{ + return ioread8(ctrl->base + REALTEK_GPIO_REG_ISR + port); +} + +/* Set the rising and falling edge mask bits for a GPIO port pin */ +static u16 realtek_gpio_imr_bits(unsigned int pin, u16 value) +{ + return (value & REALTEK_GPIO_IMR_LINE_MASK) << 2 * pin; +} + +static void realtek_gpio_irq_ack(struct irq_data *data) +{ + struct realtek_gpio_ctrl *ctrl = irq_data_to_ctrl(data); + irq_hw_number_t line = irqd_to_hwirq(data); + unsigned int port = line / 8; + unsigned int port_pin = line % 8; + + realtek_gpio_clear_isr(ctrl, port, BIT(port_pin)); +} + +static void realtek_gpio_irq_unmask(struct irq_data *data) +{ + struct realtek_gpio_ctrl *ctrl = irq_data_to_ctrl(data); + unsigned int line = irqd_to_hwirq(data); + unsigned int port = line / 8; + unsigned int port_pin = line % 8; + unsigned long flags; + u16 m; + + raw_spin_lock_irqsave(&ctrl->lock, flags); + m = ctrl->intr_mask[port]; + m |= realtek_gpio_imr_bits(port_pin, REALTEK_GPIO_IMR_LINE_MASK); + ctrl->intr_mask[port] = m; + realtek_gpio_write_imr(ctrl, port, ctrl->intr_type[port], m); + raw_spin_unlock_irqrestore(&ctrl->lock, flags); +} + +static void realtek_gpio_irq_mask(struct irq_data *data) +{ + struct realtek_gpio_ctrl *ctrl = irq_data_to_ctrl(data); + unsigned int line = irqd_to_hwirq(data); + unsigned int port = line / 8; + unsigned int port_pin = line % 8; + unsigned long flags; + u16 m; + + raw_spin_lock_irqsave(&ctrl->lock, flags); + m = ctrl->intr_mask[port]; + m &= ~realtek_gpio_imr_bits(port_pin, REALTEK_GPIO_IMR_LINE_MASK); + ctrl->intr_mask[port] = m; + realtek_gpio_write_imr(ctrl, port, ctrl->intr_type[port], m); + raw_spin_unlock_irqrestore(&ctrl->lock, flags); +} + +static int realtek_gpio_irq_set_type(struct irq_data *data, unsigned int flow_type) +{ + struct realtek_gpio_ctrl *ctrl = irq_data_to_ctrl(data); + unsigned int line = irqd_to_hwirq(data); + unsigned int port = line / 8; + unsigned int port_pin = line % 8; + unsigned long flags; + u16 type, t; + + switch (flow_type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_FALLING: + type = REALTEK_GPIO_IRQ_EDGE_FALLING; + break; + case IRQ_TYPE_EDGE_RISING: + type = REALTEK_GPIO_IRQ_EDGE_RISING; + break; + case IRQ_TYPE_EDGE_BOTH: + type = REALTEK_GPIO_IRQ_EDGE_BOTH; + break; + default: + return -EINVAL; + } + + irq_set_handler_locked(data, handle_edge_irq); + + raw_spin_lock_irqsave(&ctrl->lock, flags); + t = ctrl->intr_type[port]; + t &= ~realtek_gpio_imr_bits(port_pin, REALTEK_GPIO_IMR_LINE_MASK); + t |= realtek_gpio_imr_bits(port_pin, type); + ctrl->intr_type[port] = t; + realtek_gpio_write_imr(ctrl, port, t, ctrl->intr_mask[port]); + raw_spin_unlock_irqrestore(&ctrl->lock, flags); + + return 0; +} + +static void realtek_gpio_irq_handler(struct irq_desc *desc) +{ + struct gpio_chip *gc = irq_desc_get_handler_data(desc); + struct realtek_gpio_ctrl *ctrl = gpiochip_get_data(gc); + struct irq_chip *irq_chip = irq_desc_get_chip(desc); + unsigned int lines_done; + unsigned int port_pin_count; + unsigned int irq; + unsigned long status; + int offset; + + chained_irq_enter(irq_chip, desc); + + for (lines_done = 0; lines_done < gc->ngpio; lines_done += 8) { + status = realtek_gpio_read_isr(ctrl, lines_done / 8); + port_pin_count = min(gc->ngpio - lines_done, 8U); + for_each_set_bit(offset, &status, port_pin_count) { + irq = irq_find_mapping(gc->irq.domain, offset); + generic_handle_irq(irq); + } + } + + chained_irq_exit(irq_chip, desc); +} + +static int realtek_gpio_irq_init(struct gpio_chip *gc) +{ + struct realtek_gpio_ctrl *ctrl = gpiochip_get_data(gc); + unsigned int port; + + for (port = 0; (port * 8) < gc->ngpio; port++) { + realtek_gpio_write_imr(ctrl, port, 0, 0); + realtek_gpio_clear_isr(ctrl, port, GENMASK(7, 0)); + } + + return 0; +} + +static struct irq_chip realtek_gpio_irq_chip = { + .name = "realtek-otto-gpio", + .irq_ack = realtek_gpio_irq_ack, + .irq_mask = realtek_gpio_irq_mask, + .irq_unmask = realtek_gpio_irq_unmask, + .irq_set_type = realtek_gpio_irq_set_type, +}; + +static const struct of_device_id realtek_gpio_of_match[] = { + { + .compatible = "realtek,otto-gpio", + .data = (void *)GPIO_INTERRUPTS_DISABLED, + }, + { + .compatible = "realtek,rtl8380-gpio", + }, + { + .compatible = "realtek,rtl8390-gpio", + }, + {} +}; +MODULE_DEVICE_TABLE(of, realtek_gpio_of_match); + +static int realtek_gpio_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + unsigned int dev_flags; + struct gpio_irq_chip *girq; + struct realtek_gpio_ctrl *ctrl; + u32 ngpios; + int err, irq; + + ctrl = devm_kzalloc(dev, sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return -ENOMEM; + + dev_flags = (unsigned int) device_get_match_data(dev); + + ngpios = REALTEK_GPIO_MAX; + device_property_read_u32(dev, "ngpios", &ngpios); + + if (ngpios > REALTEK_GPIO_MAX) { + dev_err(&pdev->dev, "invalid ngpios (max. %d)\n", + REALTEK_GPIO_MAX); + return -EINVAL; + } + + ctrl->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(ctrl->base)) + return PTR_ERR(ctrl->base); + + raw_spin_lock_init(&ctrl->lock); + + err = bgpio_init(&ctrl->gc, dev, 4, + ctrl->base + REALTEK_GPIO_REG_DATA, NULL, NULL, + ctrl->base + REALTEK_GPIO_REG_DIR, NULL, + BGPIOF_BIG_ENDIAN_BYTE_ORDER); + if (err) { + dev_err(dev, "unable to init generic GPIO"); + return err; + } + + ctrl->gc.ngpio = ngpios; + ctrl->gc.owner = THIS_MODULE; + + irq = platform_get_irq_optional(pdev, 0); + if (!(dev_flags & GPIO_INTERRUPTS_DISABLED) && irq > 0) { + girq = &ctrl->gc.irq; + girq->chip = &realtek_gpio_irq_chip; + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_bad_irq; + girq->parent_handler = realtek_gpio_irq_handler; + girq->num_parents = 1; + girq->parents = devm_kcalloc(dev, girq->num_parents, + sizeof(*girq->parents), GFP_KERNEL); + if (!girq->parents) + return -ENOMEM; + girq->parents[0] = irq; + girq->init_hw = realtek_gpio_irq_init; + } + + return devm_gpiochip_add_data(dev, &ctrl->gc, ctrl); +} + +static struct platform_driver realtek_gpio_driver = { + .driver = { + .name = "realtek-otto-gpio", + .of_match_table = realtek_gpio_of_match, + }, + .probe = realtek_gpio_probe, +}; +module_platform_driver(realtek_gpio_driver); + +MODULE_DESCRIPTION("Realtek Otto GPIO support"); +MODULE_AUTHOR("Sander Vanheule "); +MODULE_LICENSE("GPL v2"); From ca40daf39daf62355d87287a8732cadb62d13e2e Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Wed, 31 Mar 2021 16:19:11 +0800 Subject: [PATCH 1037/1379] gpio: omap: Use device_get_match_data() helper Use the device_get_match_data() helper instead of open coding. Signed-off-by: Tian Tao Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-omap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index 41952bb818ad..f4df555fc39c 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -1364,15 +1364,14 @@ static int omap_gpio_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct device_node *node = dev->of_node; - const struct of_device_id *match; const struct omap_gpio_platform_data *pdata; struct gpio_bank *bank; struct irq_chip *irqc; int ret; - match = of_match_device(of_match_ptr(omap_gpio_match), dev); + pdata = device_get_match_data(dev); - pdata = match ? match->data : dev_get_platdata(dev); + pdata = pdata ?: dev_get_platdata(dev); if (!pdata) return -EINVAL; From 65dd36a39d3b350dc96d8324b348f0863d76404d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Mar 2021 18:59:31 +0200 Subject: [PATCH 1038/1379] lib/cmdline: Export next_arg() for being used in modules At least one module will benefit from using next_arg() helper. Let's export it for that module and others if they consider it helpful. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Reviewed-by: Geert Uytterhoeven --- lib/cmdline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/cmdline.c b/lib/cmdline.c index 5d474c626e24..5546bf588780 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -272,3 +272,4 @@ char *next_arg(char *args, char **param, char **val) /* Chew up trailing spaces. */ return skip_spaces(args); } +EXPORT_SYMBOL(next_arg); From ac505b6f5fa8289c3d3a311344de0da23f6ff767 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Mar 2021 18:59:32 +0200 Subject: [PATCH 1039/1379] gpio: aggregator: Replace custom get_arg() with a generic next_arg() cmdline library provides next_arg() helper to traverse over parameters and their values given in command line. Replace custom approach in the driver by it. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Reviewed-by: Geert Uytterhoeven --- drivers/gpio/gpio-aggregator.c | 39 +++++----------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/drivers/gpio/gpio-aggregator.c b/drivers/gpio/gpio-aggregator.c index 08171431bb8f..34e35b64dcdc 100644 --- a/drivers/gpio/gpio-aggregator.c +++ b/drivers/gpio/gpio-aggregator.c @@ -37,31 +37,6 @@ struct gpio_aggregator { static DEFINE_MUTEX(gpio_aggregator_lock); /* protects idr */ static DEFINE_IDR(gpio_aggregator_idr); -static char *get_arg(char **args) -{ - char *start, *end; - - start = skip_spaces(*args); - if (!*start) - return NULL; - - if (*start == '"') { - /* Quoted arg */ - end = strchr(++start, '"'); - if (!end) - return ERR_PTR(-EINVAL); - } else { - /* Unquoted arg */ - for (end = start; *end && !isspace(*end); end++) ; - } - - if (*end) - *end++ = '\0'; - - *args = end; - return start; -} - static int aggr_add_gpio(struct gpio_aggregator *aggr, const char *key, int hwnum, unsigned int *n) { @@ -83,8 +58,8 @@ static int aggr_add_gpio(struct gpio_aggregator *aggr, const char *key, static int aggr_parse(struct gpio_aggregator *aggr) { + char *args = skip_spaces(aggr->args); char *name, *offsets, *p; - char *args = aggr->args; unsigned long *bitmap; unsigned int i, n = 0; int error = 0; @@ -93,13 +68,9 @@ static int aggr_parse(struct gpio_aggregator *aggr) if (!bitmap) return -ENOMEM; - for (name = get_arg(&args), offsets = get_arg(&args); name; - offsets = get_arg(&args)) { - if (IS_ERR(name)) { - pr_err("Cannot get GPIO specifier: %pe\n", name); - error = PTR_ERR(name); - goto free_bitmap; - } + args = next_arg(args, &name, &p); + while (*args) { + args = next_arg(args, &offsets, &p); p = get_options(offsets, 0, &error); if (error == 0 || *p) { @@ -125,7 +96,7 @@ static int aggr_parse(struct gpio_aggregator *aggr) goto free_bitmap; } - name = get_arg(&args); + args = next_arg(args, &name, &p); } if (!n) { From 7a81638485c1a62a87b4c391ecc9c651a4a9dc19 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 17 Mar 2021 17:19:27 +0200 Subject: [PATCH 1040/1379] gpio: sch: Add edge event support Add the required infrastructure to enable and report edge events of the pins to the GPIO core. The actual hook-up of the event interrupt will happen separately. Signed-off-by: Jan Kiszka Co-developed-by: Andy Shevchenko Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij --- drivers/gpio/Kconfig | 1 + drivers/gpio/gpio-sch.c | 116 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 109 insertions(+), 8 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 6fb13d6507db..ad1a325be727 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -861,6 +861,7 @@ config GPIO_IT87 config GPIO_SCH tristate "Intel SCH/TunnelCreek/Centerton/Quark X1000 GPIO" depends on (X86 || COMPILE_TEST) && PCI + select GPIOLIB_IRQCHIP select MFD_CORE select LPC_SCH help diff --git a/drivers/gpio/gpio-sch.c b/drivers/gpio/gpio-sch.c index 3a1b1adb08c6..06ab55d134b9 100644 --- a/drivers/gpio/gpio-sch.c +++ b/drivers/gpio/gpio-sch.c @@ -10,17 +10,28 @@ #include #include #include +#include #include #include #include #include +#include #define GEN 0x00 #define GIO 0x04 #define GLV 0x08 +#define GTPE 0x0c +#define GTNE 0x10 +#define GGPE 0x14 +#define GSMI 0x18 +#define GTS 0x1c + +#define CORE_BANK_OFFSET 0x00 +#define RESUME_BANK_OFFSET 0x20 struct sch_gpio { struct gpio_chip chip; + struct irq_chip irqchip; spinlock_t lock; unsigned short iobase; unsigned short resume_base; @@ -29,11 +40,11 @@ struct sch_gpio { static unsigned int sch_gpio_offset(struct sch_gpio *sch, unsigned int gpio, unsigned int reg) { - unsigned int base = 0; + unsigned int base = CORE_BANK_OFFSET; if (gpio >= sch->resume_base) { gpio -= sch->resume_base; - base += 0x20; + base = RESUME_BANK_OFFSET; } return base + reg + gpio / 8; @@ -79,10 +90,11 @@ static void sch_gpio_reg_set(struct sch_gpio *sch, unsigned int gpio, unsigned i static int sch_gpio_direction_in(struct gpio_chip *gc, unsigned int gpio_num) { struct sch_gpio *sch = gpiochip_get_data(gc); + unsigned long flags; - spin_lock(&sch->lock); + spin_lock_irqsave(&sch->lock, flags); sch_gpio_reg_set(sch, gpio_num, GIO, 1); - spin_unlock(&sch->lock); + spin_unlock_irqrestore(&sch->lock, flags); return 0; } @@ -96,20 +108,22 @@ static int sch_gpio_get(struct gpio_chip *gc, unsigned int gpio_num) static void sch_gpio_set(struct gpio_chip *gc, unsigned int gpio_num, int val) { struct sch_gpio *sch = gpiochip_get_data(gc); + unsigned long flags; - spin_lock(&sch->lock); + spin_lock_irqsave(&sch->lock, flags); sch_gpio_reg_set(sch, gpio_num, GLV, val); - spin_unlock(&sch->lock); + spin_unlock_irqrestore(&sch->lock, flags); } static int sch_gpio_direction_out(struct gpio_chip *gc, unsigned int gpio_num, int val) { struct sch_gpio *sch = gpiochip_get_data(gc); + unsigned long flags; - spin_lock(&sch->lock); + spin_lock_irqsave(&sch->lock, flags); sch_gpio_reg_set(sch, gpio_num, GIO, 0); - spin_unlock(&sch->lock); + spin_unlock_irqrestore(&sch->lock, flags); /* * according to the datasheet, writing to the level register has no @@ -144,8 +158,80 @@ static const struct gpio_chip sch_gpio_chip = { .get_direction = sch_gpio_get_direction, }; +static int sch_irq_type(struct irq_data *d, unsigned int type) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct sch_gpio *sch = gpiochip_get_data(gc); + irq_hw_number_t gpio_num = irqd_to_hwirq(d); + unsigned long flags; + int rising, falling; + + switch (type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_EDGE_RISING: + rising = 1; + falling = 0; + break; + case IRQ_TYPE_EDGE_FALLING: + rising = 0; + falling = 1; + break; + case IRQ_TYPE_EDGE_BOTH: + rising = 1; + falling = 1; + break; + default: + return -EINVAL; + } + + spin_lock_irqsave(&sch->lock, flags); + + sch_gpio_reg_set(sch, gpio_num, GTPE, rising); + sch_gpio_reg_set(sch, gpio_num, GTNE, falling); + + irq_set_handler_locked(d, handle_edge_irq); + + spin_unlock_irqrestore(&sch->lock, flags); + + return 0; +} + +static void sch_irq_ack(struct irq_data *d) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct sch_gpio *sch = gpiochip_get_data(gc); + irq_hw_number_t gpio_num = irqd_to_hwirq(d); + unsigned long flags; + + spin_lock_irqsave(&sch->lock, flags); + sch_gpio_reg_set(sch, gpio_num, GTS, 1); + spin_unlock_irqrestore(&sch->lock, flags); +} + +static void sch_irq_mask_unmask(struct irq_data *d, int val) +{ + struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + struct sch_gpio *sch = gpiochip_get_data(gc); + irq_hw_number_t gpio_num = irqd_to_hwirq(d); + unsigned long flags; + + spin_lock_irqsave(&sch->lock, flags); + sch_gpio_reg_set(sch, gpio_num, GGPE, val); + spin_unlock_irqrestore(&sch->lock, flags); +} + +static void sch_irq_mask(struct irq_data *d) +{ + sch_irq_mask_unmask(d, 0); +} + +static void sch_irq_unmask(struct irq_data *d) +{ + sch_irq_mask_unmask(d, 1); +} + static int sch_gpio_probe(struct platform_device *pdev) { + struct gpio_irq_chip *girq; struct sch_gpio *sch; struct resource *res; @@ -207,6 +293,20 @@ static int sch_gpio_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sch); + sch->irqchip.name = "sch_gpio"; + sch->irqchip.irq_ack = sch_irq_ack; + sch->irqchip.irq_mask = sch_irq_mask; + sch->irqchip.irq_unmask = sch_irq_unmask; + sch->irqchip.irq_set_type = sch_irq_type; + + girq = &sch->chip.irq; + girq->chip = &sch->irqchip; + girq->num_parents = 0; + girq->parents = NULL; + girq->parent_handler = NULL; + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_bad_irq; + return devm_gpiochip_add_data(&pdev->dev, &sch->chip, sch); } From fdc1f5dfb9aa890473d6f94bd224d45cf2f0443d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 17 Mar 2021 17:19:28 +0200 Subject: [PATCH 1041/1379] gpio: sch: Hook into ACPI GPE handler to catch GPIO edge events Neither the ACPI description on Intel Minnowboard (v1) platform provides the required information to establish a generic handling nor the hardware capable of doing it. According to the data sheet the hardware can generate SCI events. Therefore, we need to hook from the driver into GPE handler of the ACPI subsystem in order to catch and report GPIO-related events. Validated on the Inlel Minnowboard (v1) platform and Intel Galileo Gen 2. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij --- drivers/gpio/Kconfig | 2 +- drivers/gpio/gpio-sch.c | 82 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index ad1a325be727..3a56a6370b6c 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -860,7 +860,7 @@ config GPIO_IT87 config GPIO_SCH tristate "Intel SCH/TunnelCreek/Centerton/Quark X1000 GPIO" - depends on (X86 || COMPILE_TEST) && PCI + depends on (X86 || COMPILE_TEST) && ACPI select GPIOLIB_IRQCHIP select MFD_CORE select LPC_SCH diff --git a/drivers/gpio/gpio-sch.c b/drivers/gpio/gpio-sch.c index 06ab55d134b9..a6f0421d6e50 100644 --- a/drivers/gpio/gpio-sch.c +++ b/drivers/gpio/gpio-sch.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -29,12 +30,22 @@ #define CORE_BANK_OFFSET 0x00 #define RESUME_BANK_OFFSET 0x20 +/* + * iLB datasheet describes GPE0BLK registers, in particular GPE0E.GPIO bit. + * Document Number: 328195-001 + */ +#define GPE0E_GPIO 14 + struct sch_gpio { struct gpio_chip chip; struct irq_chip irqchip; spinlock_t lock; unsigned short iobase; unsigned short resume_base; + + /* GPE handling */ + u32 gpe; + acpi_gpe_handler gpe_handler; }; static unsigned int sch_gpio_offset(struct sch_gpio *sch, unsigned int gpio, @@ -229,11 +240,74 @@ static void sch_irq_unmask(struct irq_data *d) sch_irq_mask_unmask(d, 1); } +static u32 sch_gpio_gpe_handler(acpi_handle gpe_device, u32 gpe, void *context) +{ + struct sch_gpio *sch = context; + struct gpio_chip *gc = &sch->chip; + unsigned long core_status, resume_status; + unsigned long pending; + unsigned long flags; + int offset; + u32 ret; + + spin_lock_irqsave(&sch->lock, flags); + + core_status = inl(sch->iobase + CORE_BANK_OFFSET + GTS); + resume_status = inl(sch->iobase + RESUME_BANK_OFFSET + GTS); + + spin_unlock_irqrestore(&sch->lock, flags); + + pending = (resume_status << sch->resume_base) | core_status; + for_each_set_bit(offset, &pending, sch->chip.ngpio) + generic_handle_irq(irq_find_mapping(gc->irq.domain, offset)); + + /* Set returning value depending on whether we handled an interrupt */ + ret = pending ? ACPI_INTERRUPT_HANDLED : ACPI_INTERRUPT_NOT_HANDLED; + + /* Acknowledge GPE to ACPICA */ + ret |= ACPI_REENABLE_GPE; + + return ret; +} + +static void sch_gpio_remove_gpe_handler(void *data) +{ + struct sch_gpio *sch = data; + + acpi_disable_gpe(NULL, sch->gpe); + acpi_remove_gpe_handler(NULL, sch->gpe, sch->gpe_handler); +} + +static int sch_gpio_install_gpe_handler(struct sch_gpio *sch) +{ + struct device *dev = sch->chip.parent; + acpi_status status; + + status = acpi_install_gpe_handler(NULL, sch->gpe, ACPI_GPE_LEVEL_TRIGGERED, + sch->gpe_handler, sch); + if (ACPI_FAILURE(status)) { + dev_err(dev, "Failed to install GPE handler for %u: %s\n", + sch->gpe, acpi_format_exception(status)); + return -ENODEV; + } + + status = acpi_enable_gpe(NULL, sch->gpe); + if (ACPI_FAILURE(status)) { + dev_err(dev, "Failed to enable GPE handler for %u: %s\n", + sch->gpe, acpi_format_exception(status)); + acpi_remove_gpe_handler(NULL, sch->gpe, sch->gpe_handler); + return -ENODEV; + } + + return devm_add_action_or_reset(dev, sch_gpio_remove_gpe_handler, sch); +} + static int sch_gpio_probe(struct platform_device *pdev) { struct gpio_irq_chip *girq; struct sch_gpio *sch; struct resource *res; + int ret; sch = devm_kzalloc(&pdev->dev, sizeof(*sch), GFP_KERNEL); if (!sch) @@ -307,6 +381,14 @@ static int sch_gpio_probe(struct platform_device *pdev) girq->default_type = IRQ_TYPE_NONE; girq->handler = handle_bad_irq; + /* GPE setup is optional */ + sch->gpe = GPE0E_GPIO; + sch->gpe_handler = sch_gpio_gpe_handler; + + ret = sch_gpio_install_gpe_handler(sch); + if (ret) + dev_warn(&pdev->dev, "Can't setup GPE, no IRQ support\n"); + return devm_gpiochip_add_data(&pdev->dev, &sch->chip, sch); } From da91ece226729c76f60708efc275ebd4716ad089 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 1 Apr 2021 18:27:40 +0200 Subject: [PATCH 1042/1379] gpiolib: acpi: Add quirk to ignore EC wakeups on Dell Venue 10 Pro 5055 Like some other Bay and Cherry Trail SoC based devices the Dell Venue 10 Pro 5055 has an embedded-controller which uses ACPI GPIO events to report events instead of using the standard ACPI EC interface for this. The EC interrupt is only used to report battery-level changes and it keeps doing this while the system is suspended, causing the system to not stay suspended. Add an ignore-wake quirk for the GPIO pin used by the EC to fix the spurious wakeups from suspend. Signed-off-by: Hans de Goede Acked-by: Andy Shevchenko Signed-off-by: Andy Shevchenko --- drivers/gpio/gpiolib-acpi.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c index 21750be9c489..3ef22a3c104d 100644 --- a/drivers/gpio/gpiolib-acpi.c +++ b/drivers/gpio/gpiolib-acpi.c @@ -1445,6 +1445,20 @@ static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { .no_edge_events_on_boot = true, }, }, + { + /* + * The Dell Venue 10 Pro 5055, with Bay Trail SoC + TI PMIC uses an + * external embedded-controller connected via I2C + an ACPI GPIO + * event handler on INT33FFC:02 pin 12, causing spurious wakeups. + */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Venue 10 Pro 5055"), + }, + .driver_data = &(struct acpi_gpiolib_dmi_quirk) { + .ignore_wake = "INT33FC:02@12", + }, + }, { /* * HP X2 10 models with Cherry Trail SoC + TI PMIC use an From 71cf76d451ef40ff700320069fe58ae239f6f5aa Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 2 Apr 2021 09:17:51 -0700 Subject: [PATCH 1043/1379] gpio: sch: depends on LPC_SCH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since LPC_SCH provides GPIO functionality, GPIO_SCH should depend on LPC_SCH to prevent kconfig warning and build errors: WARNING: unmet direct dependencies detected for LPC_SCH Depends on [n]: HAS_IOMEM [=y] && PCI [=n] Selected by [y]: - GPIO_SCH [=y] && GPIOLIB [=y] && X86 [=y] && (X86 [=y] || COMPILE_TEST [=n]) && ACPI [=y] and ../drivers/mfd/lpc_sch.c:204:1: warning: data definition has no type or storage class module_pci_driver(lpc_sch_driver); ^~~~~~~~~~~~~~~~~ ../drivers/mfd/lpc_sch.c:204:1: error: type defaults to ‘int’ in declaration of ‘module_pci_driver’ [-Werror=implicit-int] ../drivers/mfd/lpc_sch.c:204:1: warning: parameter names (without types) in function declaration ../drivers/mfd/lpc_sch.c:197:26: warning: ‘lpc_sch_driver’ defined but not used [-Wunused-variable] static struct pci_driver lpc_sch_driver = { ^~~~~~~~~~~~~~ Fixes: 6c46215d6b62 ("gpio: sch: Hook into ACPI GPE handler to catch GPIO edge events") Signed-off-by: Randy Dunlap Cc: Linus Walleij Cc: linux-gpio@vger.kernel.org Cc: Bartosz Golaszewski Cc: Denis Turischev Signed-off-by: Andy Shevchenko --- drivers/gpio/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3a56a6370b6c..b9df2f0c05ba 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -861,9 +861,9 @@ config GPIO_IT87 config GPIO_SCH tristate "Intel SCH/TunnelCreek/Centerton/Quark X1000 GPIO" depends on (X86 || COMPILE_TEST) && ACPI + depends on LPC_SCH select GPIOLIB_IRQCHIP select MFD_CORE - select LPC_SCH help Say yes here to support GPIO interface on Intel Poulsbo SCH, Intel Tunnel Creek processor, Intel Centerton processor or From c6b4853fa25a7f0549731c141e6b2b3f29a6b473 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Apr 2021 21:21:03 +0300 Subject: [PATCH 1044/1379] gpio: sch: Drop MFD_CORE selection Since we are depended on LPC_SCH, which selects MFD_CORE, we don't need to do it ourselves. Signed-off-by: Andy Shevchenko --- drivers/gpio/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index b9df2f0c05ba..39a4b8207b48 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -863,7 +863,6 @@ config GPIO_SCH depends on (X86 || COMPILE_TEST) && ACPI depends on LPC_SCH select GPIOLIB_IRQCHIP - select MFD_CORE help Say yes here to support GPIO interface on Intel Poulsbo SCH, Intel Tunnel Creek processor, Intel Centerton processor or From ba134d29e9526aa8396da355e69f55e8f9badd6d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 2 Apr 2021 21:42:25 +0300 Subject: [PATCH 1045/1379] gpio: ich: Switch to be dependent on LPC_ICH Driver is neither dependent to PCI nor using MFD_CORE. Replace those dependency and selection by dependency on LPC_ICH. Signed-off-by: Andy Shevchenko --- drivers/gpio/Kconfig | 5 ++--- drivers/gpio/gpio-ich.c | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 39a4b8207b48..3456b418127a 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -321,9 +321,8 @@ config GPIO_HLWD config GPIO_ICH tristate "Intel ICH GPIO" - depends on PCI && X86 - select MFD_CORE - select LPC_ICH + depends on X86 + depends on LPC_ICH help Say yes here to support the GPIO functionality of a number of Intel ICH-based chipsets. Currently supported devices: ICH6, ICH7, ICH8 diff --git a/drivers/gpio/gpio-ich.c b/drivers/gpio/gpio-ich.c index de56c013a658..3b31f5e9bf40 100644 --- a/drivers/gpio/gpio-ich.c +++ b/drivers/gpio/gpio-ich.c @@ -5,13 +5,11 @@ * Copyright (C) 2010 Extreme Engineering Solutions. */ - #include #include #include #include #include -#include #include #define DRV_NAME "gpio_ich" From 76c47d1449fc2ad58fec3a4ace45e33c3952720e Mon Sep 17 00:00:00 2001 From: Ran Wang Date: Mon, 22 Mar 2021 11:38:46 +0800 Subject: [PATCH 1046/1379] gpio: mpc8xxx: Add ACPI support Current implementation only supports DT, now add ACPI support. Signed-off-by: Ran Wang Reviewed-by: Andy Shevchenko Acked-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mpc8xxx.c | 47 ++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 6dfca83bcd90..4b9157a69fca 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -9,6 +9,7 @@ * kind, whether express or implied. */ +#include #include #include #include @@ -18,6 +19,8 @@ #include #include #include +#include +#include #include #include #include @@ -303,8 +306,8 @@ static int mpc8xxx_probe(struct platform_device *pdev) struct device_node *np = pdev->dev.of_node; struct mpc8xxx_gpio_chip *mpc8xxx_gc; struct gpio_chip *gc; - const struct mpc8xxx_gpio_devtype *devtype = - of_device_get_match_data(&pdev->dev); + const struct mpc8xxx_gpio_devtype *devtype = NULL; + struct fwnode_handle *fwnode; int ret; mpc8xxx_gc = devm_kzalloc(&pdev->dev, sizeof(*mpc8xxx_gc), GFP_KERNEL); @@ -315,14 +318,14 @@ static int mpc8xxx_probe(struct platform_device *pdev) raw_spin_lock_init(&mpc8xxx_gc->lock); - mpc8xxx_gc->regs = of_iomap(np, 0); - if (!mpc8xxx_gc->regs) - return -ENOMEM; + mpc8xxx_gc->regs = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(mpc8xxx_gc->regs)) + return PTR_ERR(mpc8xxx_gc->regs); gc = &mpc8xxx_gc->gc; gc->parent = &pdev->dev; - if (of_property_read_bool(np, "little-endian")) { + if (device_property_read_bool(&pdev->dev, "little-endian")) { ret = bgpio_init(gc, &pdev->dev, 4, mpc8xxx_gc->regs + GPIO_DAT, NULL, NULL, @@ -345,6 +348,7 @@ static int mpc8xxx_probe(struct platform_device *pdev) mpc8xxx_gc->direction_output = gc->direction_output; + devtype = device_get_match_data(&pdev->dev); if (!devtype) devtype = &mpc8xxx_gpio_devtype_default; @@ -369,24 +373,29 @@ static int mpc8xxx_probe(struct platform_device *pdev) * associated input enable must be set (GPIOxGPIE[IEn]=1) to propagate * the port value to the GPIO Data Register. */ + fwnode = dev_fwnode(&pdev->dev); if (of_device_is_compatible(np, "fsl,qoriq-gpio") || of_device_is_compatible(np, "fsl,ls1028a-gpio") || - of_device_is_compatible(np, "fsl,ls1088a-gpio")) + of_device_is_compatible(np, "fsl,ls1088a-gpio") || + is_acpi_node(fwnode)) gc->write_reg(mpc8xxx_gc->regs + GPIO_IBE, 0xffffffff); ret = gpiochip_add_data(gc, mpc8xxx_gc); if (ret) { - pr_err("%pOF: GPIO chip registration failed with status %d\n", - np, ret); + dev_err(&pdev->dev, + "GPIO chip registration failed with status %d\n", ret); goto err; } - mpc8xxx_gc->irqn = irq_of_parse_and_map(np, 0); + mpc8xxx_gc->irqn = platform_get_irq(pdev, 0); if (!mpc8xxx_gc->irqn) return 0; - mpc8xxx_gc->irq = irq_domain_add_linear(np, MPC8XXX_GPIO_PINS, - &mpc8xxx_gpio_irq_ops, mpc8xxx_gc); + mpc8xxx_gc->irq = irq_domain_create_linear(fwnode, + MPC8XXX_GPIO_PINS, + &mpc8xxx_gpio_irq_ops, + mpc8xxx_gc); + if (!mpc8xxx_gc->irq) return 0; @@ -399,8 +408,9 @@ static int mpc8xxx_probe(struct platform_device *pdev) IRQF_SHARED, "gpio-cascade", mpc8xxx_gc); if (ret) { - dev_err(&pdev->dev, "%s: failed to devm_request_irq(%d), ret = %d\n", - np->full_name, mpc8xxx_gc->irqn, ret); + dev_err(&pdev->dev, + "failed to devm_request_irq(%d), ret = %d\n", + mpc8xxx_gc->irqn, ret); goto err; } @@ -425,12 +435,21 @@ static int mpc8xxx_remove(struct platform_device *pdev) return 0; } +#ifdef CONFIG_ACPI +static const struct acpi_device_id gpio_acpi_ids[] = { + {"NXP0031",}, + { } +}; +MODULE_DEVICE_TABLE(acpi, gpio_acpi_ids); +#endif + static struct platform_driver mpc8xxx_plat_driver = { .probe = mpc8xxx_probe, .remove = mpc8xxx_remove, .driver = { .name = "gpio-mpc8xxx", .of_match_table = mpc8xxx_gpio_ids, + .acpi_match_table = ACPI_PTR(gpio_acpi_ids), }, }; From abd7a8eab8139e1e184712965e69165464a660e2 Mon Sep 17 00:00:00 2001 From: Barney Goette Date: Thu, 8 Apr 2021 10:53:34 -0500 Subject: [PATCH 1047/1379] gpio: 104-dio-48e: Fix coding style issues Fixed multiple bare uses of 'unsigned' without 'int'. Fixed space around "*" operator. Fixed function parameter alignment to opening parenthesis. Reported by checkpatch. Signed-off-by: Barney Goette Acked-by: William Breathitt Gray Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-104-dio-48e.c | 50 ++++++++++++++++----------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/gpio/gpio-104-dio-48e.c b/drivers/gpio/gpio-104-dio-48e.c index 7a9021c4fa48..71c0bea34d7b 100644 --- a/drivers/gpio/gpio-104-dio-48e.c +++ b/drivers/gpio/gpio-104-dio-48e.c @@ -49,15 +49,15 @@ struct dio48e_gpio { unsigned char out_state[6]; unsigned char control[2]; raw_spinlock_t lock; - unsigned base; + unsigned int base; unsigned char irq_mask; }; -static int dio48e_gpio_get_direction(struct gpio_chip *chip, unsigned offset) +static int dio48e_gpio_get_direction(struct gpio_chip *chip, unsigned int offset) { struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip); - const unsigned port = offset / 8; - const unsigned mask = BIT(offset % 8); + const unsigned int port = offset / 8; + const unsigned int mask = BIT(offset % 8); if (dio48egpio->io_state[port] & mask) return GPIO_LINE_DIRECTION_IN; @@ -65,14 +65,14 @@ static int dio48e_gpio_get_direction(struct gpio_chip *chip, unsigned offset) return GPIO_LINE_DIRECTION_OUT; } -static int dio48e_gpio_direction_input(struct gpio_chip *chip, unsigned offset) +static int dio48e_gpio_direction_input(struct gpio_chip *chip, unsigned int offset) { struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip); - const unsigned io_port = offset / 8; + const unsigned int io_port = offset / 8; const unsigned int control_port = io_port / 3; - const unsigned control_addr = dio48egpio->base + 3 + control_port*4; + const unsigned int control_addr = dio48egpio->base + 3 + control_port * 4; unsigned long flags; - unsigned control; + unsigned int control; raw_spin_lock_irqsave(&dio48egpio->lock, flags); @@ -104,17 +104,17 @@ static int dio48e_gpio_direction_input(struct gpio_chip *chip, unsigned offset) return 0; } -static int dio48e_gpio_direction_output(struct gpio_chip *chip, unsigned offset, - int value) +static int dio48e_gpio_direction_output(struct gpio_chip *chip, unsigned int offset, + int value) { struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip); - const unsigned io_port = offset / 8; + const unsigned int io_port = offset / 8; const unsigned int control_port = io_port / 3; - const unsigned mask = BIT(offset % 8); - const unsigned control_addr = dio48egpio->base + 3 + control_port*4; - const unsigned out_port = (io_port > 2) ? io_port + 1 : io_port; + const unsigned int mask = BIT(offset % 8); + const unsigned int control_addr = dio48egpio->base + 3 + control_port * 4; + const unsigned int out_port = (io_port > 2) ? io_port + 1 : io_port; unsigned long flags; - unsigned control; + unsigned int control; raw_spin_lock_irqsave(&dio48egpio->lock, flags); @@ -154,14 +154,14 @@ static int dio48e_gpio_direction_output(struct gpio_chip *chip, unsigned offset, return 0; } -static int dio48e_gpio_get(struct gpio_chip *chip, unsigned offset) +static int dio48e_gpio_get(struct gpio_chip *chip, unsigned int offset) { struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip); - const unsigned port = offset / 8; - const unsigned mask = BIT(offset % 8); - const unsigned in_port = (port > 2) ? port + 1 : port; + const unsigned int port = offset / 8; + const unsigned int mask = BIT(offset % 8); + const unsigned int in_port = (port > 2) ? port + 1 : port; unsigned long flags; - unsigned port_state; + unsigned int port_state; raw_spin_lock_irqsave(&dio48egpio->lock, flags); @@ -202,12 +202,12 @@ static int dio48e_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask, return 0; } -static void dio48e_gpio_set(struct gpio_chip *chip, unsigned offset, int value) +static void dio48e_gpio_set(struct gpio_chip *chip, unsigned int offset, int value) { struct dio48e_gpio *const dio48egpio = gpiochip_get_data(chip); - const unsigned port = offset / 8; - const unsigned mask = BIT(offset % 8); - const unsigned out_port = (port > 2) ? port + 1 : port; + const unsigned int port = offset / 8; + const unsigned int mask = BIT(offset % 8); + const unsigned int out_port = (port > 2) ? port + 1 : port; unsigned long flags; raw_spin_lock_irqsave(&dio48egpio->lock, flags); @@ -306,7 +306,7 @@ static void dio48e_irq_unmask(struct irq_data *data) raw_spin_unlock_irqrestore(&dio48egpio->lock, flags); } -static int dio48e_irq_set_type(struct irq_data *data, unsigned flow_type) +static int dio48e_irq_set_type(struct irq_data *data, unsigned int flow_type) { const unsigned long offset = irqd_to_hwirq(data); From 5fe706730800555ece3308965e231308ca0cf877 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Tue, 6 Apr 2021 15:20:39 +0800 Subject: [PATCH 1048/1379] gpio: it87: remove unused code Fix the following clang warning: drivers/gpio/gpio-it87.c:128:20: warning: unused function 'superio_outw' [-Wunused-function]. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Acked-by: Simon Guinot Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-it87.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/gpio/gpio-it87.c b/drivers/gpio/gpio-it87.c index 8f1be34953ce..f332341fd4c8 100644 --- a/drivers/gpio/gpio-it87.c +++ b/drivers/gpio/gpio-it87.c @@ -125,14 +125,6 @@ static inline int superio_inw(int reg) return val; } -static inline void superio_outw(int val, int reg) -{ - outb(reg++, REG); - outb(val >> 8, VAL); - outb(reg, REG); - outb(val, VAL); -} - static inline void superio_set_mask(int mask, int reg) { u8 curr_val = superio_inb(reg); From 56b01acc1c79a4fc70d575ed7861f26a0d5d43ea Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 2 Apr 2021 14:19:58 +0200 Subject: [PATCH 1049/1379] dt-bindings: gpio: fairchild,74hc595: Convert to json-schema Convert the Generic 8-bit shift register Device Tree binding documentation to json-schema. Rename from gpio-74x164 to fairchild,74hc595, as the former refers to the Linux driver, and not to a hardware name. Add the missing hog description. Signed-off-by: Geert Uytterhoeven Reviewed-by: Linus Walleij Reviewed-by: Rob Herring Signed-off-by: Bartosz Golaszewski --- .../bindings/gpio/fairchild,74hc595.yaml | 77 +++++++++++++++++++ .../devicetree/bindings/gpio/gpio-74x164.txt | 27 ------- 2 files changed, 77 insertions(+), 27 deletions(-) create mode 100644 Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml delete mode 100644 Documentation/devicetree/bindings/gpio/gpio-74x164.txt diff --git a/Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml b/Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml new file mode 100644 index 000000000000..5fe19fa5f67c --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/fairchild,74hc595.yaml @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/gpio/fairchild,74hc595.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Generic 8-bit shift register + +maintainers: + - Maxime Ripard + +properties: + compatible: + enum: + - fairchild,74hc595 + - nxp,74lvc594 + + reg: + maxItems: 1 + + gpio-controller: true + + '#gpio-cells': + description: + The second cell is only used to specify the GPIO polarity. + const: 2 + + registers-number: + description: Number of daisy-chained shift registers + + enable-gpios: + description: GPIO connected to the OE (Output Enable) pin. + maxItems: 1 + + spi-max-frequency: true + +patternProperties: + "^(hog-[0-9]+|.+-hog(-[0-9]+)?)$": + type: object + + properties: + gpio-hog: true + gpios: true + output-high: true + output-low: true + line-name: true + + required: + - gpio-hog + - gpios + + additionalProperties: false + +required: + - compatible + - reg + - gpio-controller + - '#gpio-cells' + - registers-number + +additionalProperties: false + +examples: + - | + spi { + #address-cells = <1>; + #size-cells = <0>; + + gpio5: gpio5@0 { + compatible = "fairchild,74hc595"; + reg = <0>; + gpio-controller; + #gpio-cells = <2>; + registers-number = <4>; + spi-max-frequency = <100000>; + }; + }; diff --git a/Documentation/devicetree/bindings/gpio/gpio-74x164.txt b/Documentation/devicetree/bindings/gpio/gpio-74x164.txt deleted file mode 100644 index 2a97553d8d76..000000000000 --- a/Documentation/devicetree/bindings/gpio/gpio-74x164.txt +++ /dev/null @@ -1,27 +0,0 @@ -* Generic 8-bits shift register GPIO driver - -Required properties: -- compatible: Should contain one of the following: - "fairchild,74hc595" - "nxp,74lvc594" -- reg : chip select number -- gpio-controller : Marks the device node as a gpio controller. -- #gpio-cells : Should be two. The first cell is the pin number and - the second cell is used to specify the gpio polarity: - 0 = active high - 1 = active low -- registers-number: Number of daisy-chained shift registers - -Optional properties: -- enable-gpios: GPIO connected to the OE (Output Enable) pin. - -Example: - -gpio5: gpio5@0 { - compatible = "fairchild,74hc595"; - reg = <0>; - gpio-controller; - #gpio-cells = <2>; - registers-number = <4>; - spi-max-frequency = <100000>; -}; From e29eaf1c1a68499188c71b1d75f9637ddd29e039 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 12 Apr 2021 10:16:21 +0800 Subject: [PATCH 1050/1379] gpio: mxs: remove useless function Fix the following gcc warning: drivers/gpio/gpio-mxs.c:63:19: warning: kernel/sys_ni.cunused function 'is_imx28_gpio'. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mxs.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpio/gpio-mxs.c b/drivers/gpio/gpio-mxs.c index dfc0c1eb1b33..524b668eb1ac 100644 --- a/drivers/gpio/gpio-mxs.c +++ b/drivers/gpio/gpio-mxs.c @@ -60,11 +60,6 @@ static inline int is_imx23_gpio(struct mxs_gpio_port *port) return port->devid == IMX23_GPIO; } -static inline int is_imx28_gpio(struct mxs_gpio_port *port) -{ - return port->devid == IMX28_GPIO; -} - /* Note: This driver assumes 32 GPIOs are handled in one register */ static int mxs_gpio_set_irq_type(struct irq_data *d, unsigned int type) From 444952956f34a5de935159561d56a34276ffffd6 Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Tue, 13 Apr 2021 00:36:15 +0200 Subject: [PATCH 1051/1379] dt-bindings: gpio: add YAML description for rockchip,gpio-bank Current dts files with "rockchip,gpio-bank" subnodes are manually verified. In order to automate this process the text that describes the compatible in rockchip,pinctrl.txt is removed and converted to YAML in rockchip,gpio-bank.yaml. Signed-off-by: Johan Jonker Reviewed-by: Rob Herring Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- .../bindings/gpio/rockchip,gpio-bank.yaml | 82 +++++++++++++++++++ .../bindings/pinctrl/rockchip,pinctrl.txt | 58 +------------ 2 files changed, 83 insertions(+), 57 deletions(-) create mode 100644 Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml diff --git a/Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml b/Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml new file mode 100644 index 000000000000..d993e002cebe --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/rockchip,gpio-bank.yaml @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: GPL-2.0 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/gpio/rockchip,gpio-bank.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Rockchip GPIO bank + +maintainers: + - Heiko Stuebner + +properties: + compatible: + enum: + - rockchip,gpio-bank + - rockchip,rk3188-gpio-bank0 + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + + gpio-controller: true + + "#gpio-cells": + const: 2 + + interrupt-controller: true + + "#interrupt-cells": + const: 2 + +required: + - compatible + - reg + - interrupts + - clocks + - gpio-controller + - "#gpio-cells" + - interrupt-controller + - "#interrupt-cells" + +additionalProperties: false + +examples: + - | + #include + pinctrl: pinctrl { + #address-cells = <1>; + #size-cells = <1>; + ranges; + + gpio0: gpio@2000a000 { + compatible = "rockchip,rk3188-gpio-bank0"; + reg = <0x2000a000 0x100>; + interrupts = ; + clocks = <&clk_gates8 9>; + + gpio-controller; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + }; + + gpio1: gpio@2003c000 { + compatible = "rockchip,gpio-bank"; + reg = <0x2003c000 0x100>; + interrupts = ; + clocks = <&clk_gates8 10>; + + gpio-controller; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + }; + }; diff --git a/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt index d3eae61a340d..4719a6a0706c 100644 --- a/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt @@ -50,23 +50,7 @@ Deprecated properties for iomux controller: Use rockchip,grf and rockchip,pmu described above instead. Required properties for gpio sub nodes: - - compatible: "rockchip,gpio-bank" - - reg: register of the gpio bank (different than the iomux registerset) - - interrupts: base interrupt of the gpio bank in the interrupt controller - - clocks: clock that drives this bank - - gpio-controller: identifies the node as a gpio controller and pin bank. - - #gpio-cells: number of cells in GPIO specifier. Since the generic GPIO - binding is used, the amount of cells must be specified as 2. See generic - GPIO binding documentation for description of particular cells. - - interrupt-controller: identifies the controller node as interrupt-parent. - - #interrupt-cells: the value of this property should be 2 and the interrupt - cells should use the standard two-cell scheme described in - bindings/interrupt-controller/interrupts.txt - -Deprecated properties for gpio sub nodes: - - compatible: "rockchip,rk3188-gpio-bank0" - - reg: second element: separate pull register for rk3188 bank0, use - rockchip,pmu described above instead +See rockchip,gpio-bank.yaml Required properties for pin configuration node: - rockchip,pins: 3 integers array, represents a group of pins mux and config @@ -127,43 +111,3 @@ uart2: serial@20064000 { pinctrl-names = "default"; pinctrl-0 = <&uart2_xfer>; }; - -Example for rk3188: - - pinctrl@20008000 { - compatible = "rockchip,rk3188-pinctrl"; - rockchip,grf = <&grf>; - rockchip,pmu = <&pmu>; - #address-cells = <1>; - #size-cells = <1>; - ranges; - - gpio0: gpio0@2000a000 { - compatible = "rockchip,rk3188-gpio-bank0"; - reg = <0x2000a000 0x100>; - interrupts = ; - clocks = <&clk_gates8 9>; - - gpio-controller; - #gpio-cells = <2>; - - interrupt-controller; - #interrupt-cells = <2>; - }; - - gpio1: gpio1@2003c000 { - compatible = "rockchip,gpio-bank"; - reg = <0x2003c000 0x100>; - interrupts = ; - clocks = <&clk_gates8 10>; - - gpio-controller; - #gpio-cells = <2>; - - interrupt-controller; - #interrupt-cells = <2>; - }; - - ... - - }; From 32b48bf8514c28cdc89cd8069eceeb6e6cff0612 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 5 May 2021 22:15:09 +1000 Subject: [PATCH 1052/1379] KVM: PPC: Book3S HV: Fix conversion to gfn-based MMU notifier callbacks Commit b1c5356e873c ("KVM: PPC: Convert to the gfn-based MMU notifier callbacks") causes unmap_gfn_range and age_gfn callbacks to only work on the first gfn in the range. It also makes the aging callbacks call into both radix and hash aging functions for radix guests. Fix this. Add warnings for the single-gfn calls that have been converted to range callbacks, in case they ever receieve ranges greater than 1. Fixes: b1c5356e873c ("KVM: PPC: Convert to the gfn-based MMU notifier callbacks") Reported-by: Bharata B Rao Signed-off-by: Nicholas Piggin Tested-by: Bharata B Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210505121509.1470207-1-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 48 ++++++++++++++++++-------- arch/powerpc/kvm/book3s_64_mmu_radix.c | 5 ++- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a6e9a5585e61..e6b53c6e21e3 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -210,7 +210,7 @@ extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid); extern int kvmppc_radix_init(void); extern void kvmppc_radix_exit(void); -extern bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, +extern void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); extern bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b7bd9ca040b8..2d9193cd73be 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -795,7 +795,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i, } } -static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, +static void kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { unsigned long i; @@ -829,15 +829,21 @@ static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unlock_rmap(rmapp); __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } - return false; } bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - return kvm_unmap_radix(kvm, range->slot, range->start); + gfn_t gfn; - return kvm_unmap_rmapp(kvm, range->slot, range->start); + if (kvm_is_radix(kvm)) { + for (gfn = range->start; gfn < range->end; gfn++) + kvm_unmap_radix(kvm, range->slot, gfn); + } else { + for (gfn = range->start; gfn < range->end; gfn++) + kvm_unmap_rmapp(kvm, range->slot, range->start); + } + + return false; } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, @@ -924,10 +930,18 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - kvm_age_radix(kvm, range->slot, range->start); + gfn_t gfn; + bool ret = false; - return kvm_age_rmapp(kvm, range->slot, range->start); + if (kvm_is_radix(kvm)) { + for (gfn = range->start; gfn < range->end; gfn++) + ret |= kvm_age_radix(kvm, range->slot, gfn); + } else { + for (gfn = range->start; gfn < range->end; gfn++) + ret |= kvm_age_rmapp(kvm, range->slot, gfn); + } + + return ret; } static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -965,18 +979,24 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - kvm_test_age_radix(kvm, range->slot, range->start); + WARN_ON(range->start + 1 != range->end); - return kvm_test_age_rmapp(kvm, range->slot, range->start); + if (kvm_is_radix(kvm)) + return kvm_test_age_radix(kvm, range->slot, range->start); + else + return kvm_test_age_rmapp(kvm, range->slot, range->start); } bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range) { - if (kvm_is_radix(kvm)) - return kvm_unmap_radix(kvm, range->slot, range->start); + WARN_ON(range->start + 1 != range->end); - return kvm_unmap_rmapp(kvm, range->slot, range->start); + if (kvm_is_radix(kvm)) + kvm_unmap_radix(kvm, range->slot, range->start); + else + kvm_unmap_rmapp(kvm, range->slot, range->start); + + return false; } static int vcpus_running(struct kvm *kvm) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index ec4f58fa9f5a..d909c069363e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -993,7 +993,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, } /* Called with kvm->mmu_lock held */ -bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, +void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { pte_t *ptep; @@ -1002,14 +1002,13 @@ bool kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) { uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT); - return false; + return; } ptep = find_kvm_secondary_pte(kvm, gpa, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); - return false; } /* Called with kvm->mmu_lock held */ From 8c9af478c06bb1ab1422f90d8ecbc53defd44bc3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 5 May 2021 10:38:24 -0400 Subject: [PATCH 1053/1379] ftrace: Handle commands when closing set_ftrace_filter file # echo switch_mm:traceoff > /sys/kernel/tracing/set_ftrace_filter will cause switch_mm to stop tracing by the traceoff command. # echo -n switch_mm:traceoff > /sys/kernel/tracing/set_ftrace_filter does nothing. The reason is that the parsing in the write function only processes commands if it finished parsing (there is white space written after the command). That's to handle: write(fd, "switch_mm:", 10); write(fd, "traceoff", 8); cases, where the command is broken over multiple writes. The problem is if the file descriptor is closed, then the write call is not processed, and the command needs to be processed in the release code. The release code can handle matching of functions, but does not handle commands. Cc: stable@vger.kernel.org Fixes: eda1e32855656 ("tracing: handle broken names in ftrace filter") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 057e962ca5ce..c57508445faa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5591,7 +5591,10 @@ int ftrace_regex_release(struct inode *inode, struct file *file) parser = &iter->parser; if (trace_parser_loaded(parser)) { - ftrace_match_records(iter->hash, parser->buffer, parser->idx); + int enable = !(iter->flags & FTRACE_ITER_NOTRACE); + + ftrace_process_regex(iter, parser->buffer, + parser->idx, enable); } trace_parser_put(parser); From 7072a355ba191c08b0579f0f66e3eba0e28bf818 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 May 2021 00:33:24 -0700 Subject: [PATCH 1054/1379] netfilter: nfnetlink: add a missing rcu_read_unlock() Reported by syzbot : BUG: sleeping function called from invalid context at include/linux/sched/mm.h:201 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 26899, name: syz-executor.5 1 lock held by syz-executor.5/26899: #0: ffffffff8bf797a0 (rcu_read_lock){....}-{1:2}, at: nfnetlink_get_subsys net/netfilter/nfnetlink.c:148 [inline] #0: ffffffff8bf797a0 (rcu_read_lock){....}-{1:2}, at: nfnetlink_rcv_msg+0x1da/0x1300 net/netfilter/nfnetlink.c:226 Preemption disabled at: [] preempt_schedule_irq+0x3e/0x90 kernel/sched/core.c:5533 CPU: 1 PID: 26899 Comm: syz-executor.5 Not tainted 5.12.0-next-20210504-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 ___might_sleep.cold+0x1f1/0x237 kernel/sched/core.c:8338 might_alloc include/linux/sched/mm.h:201 [inline] slab_pre_alloc_hook mm/slab.h:500 [inline] slab_alloc_node mm/slub.c:2845 [inline] kmem_cache_alloc_node+0x33d/0x3e0 mm/slub.c:2960 __alloc_skb+0x20b/0x340 net/core/skbuff.c:413 alloc_skb include/linux/skbuff.h:1107 [inline] nlmsg_new include/net/netlink.h:953 [inline] netlink_ack+0x1ed/0xaa0 net/netlink/af_netlink.c:2437 netlink_rcv_skb+0x33d/0x420 net/netlink/af_netlink.c:2508 nfnetlink_rcv+0x1ac/0x420 net/netfilter/nfnetlink.c:650 netlink_unicast_kernel net/netlink/af_netlink.c:1312 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1338 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1927 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x4665f9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fa8a03ee188 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 000000000056bf60 RCX: 00000000004665f9 RDX: 0000000000000000 RSI: 0000000020000480 RDI: 0000000000000004 RBP: 00000000004bfce1 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000056bf60 R13: 00007fffe864480f R14: 00007fa8a03ee300 R15: 0000000000022000 ================================================ WARNING: lock held when returning to user space! 5.12.0-next-20210504-syzkaller #0 Tainted: G W ------------------------------------------------ syz-executor.5/26899 is leaving the kernel with locks still held! 1 lock held by syz-executor.5/26899: #0: ffffffff8bf797a0 (rcu_read_lock){....}-{1:2}, at: nfnetlink_get_subsys net/netfilter/nfnetlink.c:148 [inline] #0: ffffffff8bf797a0 (rcu_read_lock){....}-{1:2}, at: nfnetlink_rcv_msg+0x1da/0x1300 net/netfilter/nfnetlink.c:226 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 26899 at kernel/rcu/tree_plugin.h:359 rcu_note_context_switch+0xfd/0x16e0 kernel/rcu/tree_plugin.h:359 Modules linked in: CPU: 0 PID: 26899 Comm: syz-executor.5 Tainted: G W 5.12.0-next-20210504-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:rcu_note_context_switch+0xfd/0x16e0 kernel/rcu/tree_plugin.h:359 Code: 48 89 fa 48 c1 ea 03 0f b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 2e 0d 00 00 8b bd cc 03 00 00 85 ff 7e 02 <0f> 0b 65 48 8b 2c 25 00 f0 01 00 48 8d bd cc 03 00 00 48 b8 00 00 RSP: 0000:ffffc90002fffdb0 EFLAGS: 00010002 RAX: 0000000000000007 RBX: ffff8880b9c36080 RCX: ffffffff8dc99bac RDX: 0000000000000000 RSI: 0000000000000008 RDI: 0000000000000001 RBP: ffff88808b9d1c80 R08: 0000000000000000 R09: ffffffff8dc96917 R10: fffffbfff1b92d22 R11: 0000000000000000 R12: 0000000000000000 R13: ffff88808b9d1c80 R14: ffff88808b9d1c80 R15: ffffc90002ff8000 FS: 00007fa8a03ee700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f09896ed000 CR3: 0000000032070000 CR4: 00000000001526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __schedule+0x214/0x23e0 kernel/sched/core.c:5044 schedule+0xcf/0x270 kernel/sched/core.c:5226 exit_to_user_mode_loop kernel/entry/common.c:162 [inline] exit_to_user_mode_prepare+0x13e/0x280 kernel/entry/common.c:208 irqentry_exit_to_user_mode+0x5/0x40 kernel/entry/common.c:314 asm_sysvec_reschedule_ipi+0x12/0x20 arch/x86/include/asm/idtentry.h:637 RIP: 0033:0x4665f9 Fixes: 50f2db9e368f ("netfilter: nfnetlink: consolidate callback types") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index d7a9628b6cee..e8dbd8379027 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -295,6 +295,7 @@ replay: nfnl_unlock(subsys_id); break; default: + rcu_read_unlock(); err = -EINVAL; break; } From 77b8aeb9da0490357f1f5a2b0d12125e6332c37a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 4 May 2021 09:52:02 -0600 Subject: [PATCH 1055/1379] vfio/pci: Revert nvlink removal uAPI breakage Revert the uAPI changes from the below commit with notice that these regions and capabilities are no longer provided. Fixes: b392a1989170 ("vfio/pci: remove vfio_pci_nvlink2") Reported-by: Greg Kurz Signed-off-by: Alex Williamson Reviewed-by: Cornelia Huck Reviewed-by: Greg Kurz Tested-by: Greg Kurz Reviewed-by: Christoph Hellwig Message-Id: <162014341432.3807030.11054087109120670135.stgit@omen> --- include/uapi/linux/vfio.h | 46 +++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 34b1f53a3901..ef33ea002b0b 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -333,10 +333,21 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) /* 10de vendor PCI sub-types */ -/* subtype 1 was VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, don't use */ +/* + * NVIDIA GPU NVlink2 RAM is coherent RAM mapped onto the host address space. + * + * Deprecated, region no longer provided + */ +#define VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM (1) /* 1014 vendor PCI sub-types */ -/* subtype 1 was VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, don't use */ +/* + * IBM NPU NVlink2 ATSD (Address Translation Shootdown) register of NPU + * to do TLB invalidation on a GPU. + * + * Deprecated, region no longer provided + */ +#define VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD (1) /* sub-types for VFIO_REGION_TYPE_GFX */ #define VFIO_REGION_SUBTYPE_GFX_EDID (1) @@ -630,9 +641,36 @@ struct vfio_device_migration_info { */ #define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3 -/* subtype 4 was VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, don't use */ +/* + * Capability with compressed real address (aka SSA - small system address) + * where GPU RAM is mapped on a system bus. Used by a GPU for DMA routing + * and by the userspace to associate a NVLink bridge with a GPU. + * + * Deprecated, capability no longer provided + */ +#define VFIO_REGION_INFO_CAP_NVLINK2_SSATGT 4 -/* subtype 5 was VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, don't use */ +struct vfio_region_info_cap_nvlink2_ssatgt { + struct vfio_info_cap_header header; + __u64 tgt; +}; + +/* + * Capability with an NVLink link speed. The value is read by + * the NVlink2 bridge driver from the bridge's "ibm,nvlink-speed" + * property in the device tree. The value is fixed in the hardware + * and failing to provide the correct value results in the link + * not working with no indication from the driver why. + * + * Deprecated, capability no longer provided + */ +#define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD 5 + +struct vfio_region_info_cap_nvlink2_lnkspd { + struct vfio_info_cap_header header; + __u32 link_speed; + __u32 __pad; +}; /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, From cc35518d29bc8e38902866b74874b4a3f1ad3617 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Tue, 4 May 2021 21:06:51 +0000 Subject: [PATCH 1056/1379] docs: vfio: fix typo Signed-off-by: Alyssa Ross Message-Id: <20210504210651.1316078-1-hi@alyssa.is> Signed-off-by: Alex Williamson --- Documentation/driver-api/vfio.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index decc68cb8114..606eed8823ce 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -2,7 +2,7 @@ VFIO - "Virtual Function I/O" [1]_ ================================== -Many modern system now provide DMA and interrupt remapping facilities +Many modern systems now provide DMA and interrupt remapping facilities to help ensure I/O devices behave within the boundaries they've been allotted. This includes x86 hardware with AMD-Vi and Intel VT-d, POWER systems with Partitionable Endpoints (PEs) and embedded PowerPC From 298a58e165e447ccfaae35fe9f651f9d7e15166f Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 5 May 2021 11:23:50 +0100 Subject: [PATCH 1057/1379] ARM: footbridge: remove personal server platform Remove the personal server platform, as that has had an array overrun issue identified. It is believed that no one is using this code. Signed-off-by: Russell King --- arch/arm/configs/footbridge_defconfig | 1 - arch/arm/mach-footbridge/Kconfig | 21 --------- arch/arm/mach-footbridge/Makefile | 2 - arch/arm/mach-footbridge/personal-pci.c | 58 ------------------------- arch/arm/mach-footbridge/personal.c | 25 ----------- 5 files changed, 107 deletions(-) delete mode 100644 arch/arm/mach-footbridge/personal-pci.c delete mode 100644 arch/arm/mach-footbridge/personal.c diff --git a/arch/arm/configs/footbridge_defconfig b/arch/arm/configs/footbridge_defconfig index 3a7938f244e5..2aa3ebeb89d7 100644 --- a/arch/arm/configs/footbridge_defconfig +++ b/arch/arm/configs/footbridge_defconfig @@ -7,7 +7,6 @@ CONFIG_EXPERT=y CONFIG_MODULES=y CONFIG_ARCH_FOOTBRIDGE=y CONFIG_ARCH_CATS=y -CONFIG_ARCH_PERSONAL_SERVER=y CONFIG_ARCH_EBSA285_HOST=y CONFIG_ARCH_NETWINDER=y CONFIG_LEDS=y diff --git a/arch/arm/mach-footbridge/Kconfig b/arch/arm/mach-footbridge/Kconfig index 844aa585b966..728aff93fba9 100644 --- a/arch/arm/mach-footbridge/Kconfig +++ b/arch/arm/mach-footbridge/Kconfig @@ -16,27 +16,6 @@ config ARCH_CATS Saying N will reduce the size of the Footbridge kernel. -config ARCH_PERSONAL_SERVER - bool "Compaq Personal Server" - select FOOTBRIDGE_HOST - select ISA - select ISA_DMA - select FORCE_PCI - help - Say Y here if you intend to run this kernel on the Compaq - Personal Server. - - Saying N will reduce the size of the Footbridge kernel. - - The Compaq Personal Server is not available for purchase. - There are no product plans beyond the current research - prototypes at this time. Information is available at: - - - - If you have any questions or comments about the Compaq Personal - Server, send e-mail to . - config ARCH_EBSA285_ADDIN bool "EBSA285 (addin mode)" select ARCH_EBSA285 diff --git a/arch/arm/mach-footbridge/Makefile b/arch/arm/mach-footbridge/Makefile index a09f1041f141..6262993c0555 100644 --- a/arch/arm/mach-footbridge/Makefile +++ b/arch/arm/mach-footbridge/Makefile @@ -11,12 +11,10 @@ pci-y += dc21285.o pci-$(CONFIG_ARCH_CATS) += cats-pci.o pci-$(CONFIG_ARCH_EBSA285_HOST) += ebsa285-pci.o pci-$(CONFIG_ARCH_NETWINDER) += netwinder-pci.o -pci-$(CONFIG_ARCH_PERSONAL_SERVER) += personal-pci.o obj-$(CONFIG_ARCH_CATS) += cats-hw.o isa-timer.o obj-$(CONFIG_ARCH_EBSA285) += ebsa285.o dc21285-timer.o obj-$(CONFIG_ARCH_NETWINDER) += netwinder-hw.o isa-timer.o -obj-$(CONFIG_ARCH_PERSONAL_SERVER) += personal.o dc21285-timer.o obj-$(CONFIG_PCI) +=$(pci-y) diff --git a/arch/arm/mach-footbridge/personal-pci.c b/arch/arm/mach-footbridge/personal-pci.c deleted file mode 100644 index 4391e433a4b2..000000000000 --- a/arch/arm/mach-footbridge/personal-pci.c +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/arch/arm/mach-footbridge/personal-pci.c - * - * PCI bios-type initialisation for PCI machines - * - * Bits taken from various places. - */ -#include -#include -#include - -#include -#include -#include - -static int irqmap_personal_server[] __initdata = { - IRQ_IN0, IRQ_IN1, IRQ_IN2, IRQ_IN3, 0, 0, 0, - IRQ_DOORBELLHOST, IRQ_DMA1, IRQ_DMA2, IRQ_PCI -}; - -static int __init personal_server_map_irq(const struct pci_dev *dev, u8 slot, - u8 pin) -{ - unsigned char line; - - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line); - - if (line > 0x40 && line <= 0x5f) { - /* line corresponds to the bit controlling this interrupt - * in the footbridge. Ignore the first 8 interrupt bits, - * look up the rest in the map. IN0 is bit number 8 - */ - return irqmap_personal_server[(line & 0x1f) - 8]; - } else if (line == 0) { - /* no interrupt */ - return 0; - } else - return irqmap_personal_server[(line - 1) & 3]; -} - -static struct hw_pci personal_server_pci __initdata = { - .map_irq = personal_server_map_irq, - .nr_controllers = 1, - .ops = &dc21285_ops, - .setup = dc21285_setup, - .preinit = dc21285_preinit, - .postinit = dc21285_postinit, -}; - -static int __init personal_pci_init(void) -{ - if (machine_is_personal_server()) - pci_common_init(&personal_server_pci); - return 0; -} - -subsys_initcall(personal_pci_init); diff --git a/arch/arm/mach-footbridge/personal.c b/arch/arm/mach-footbridge/personal.c deleted file mode 100644 index ca715754fc00..000000000000 --- a/arch/arm/mach-footbridge/personal.c +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/arch/arm/mach-footbridge/personal.c - * - * Personal server (Skiff) machine fixup - */ -#include -#include - -#include -#include - -#include - -#include "common.h" - -MACHINE_START(PERSONAL_SERVER, "Compaq-PersonalServer") - /* Maintainer: Jamey Hicks / George France */ - .atag_offset = 0x100, - .map_io = footbridge_map_io, - .init_irq = footbridge_init_irq, - .init_time = footbridge_timer_init, - .restart = footbridge_restart, -MACHINE_END - From 23243c1ace9fb4eae2f75e0fe0ece8e3219fb4f3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 2 May 2021 02:24:36 +0900 Subject: [PATCH 1058/1379] arch: use cross_compiling to check whether it is a cross build or not 'cross_compiling' is defined by the top Makefile and available for arch Makefiles to check whether it is a cross build or not. A good thing is the variable name 'cross_compiling' is self-documenting. This is a simple replacement for m68k, mips, sh, for which $(ARCH) and $(SRCARCH) always match. No functional change is intended for xtensa, either. This is rather a fix for parisc because arch/parisc/Makefile defines UTS_MATCHINE depending on CONFIG_64BIT, therefore cc-cross-prefix is not working in Kconfig time. Signed-off-by: Masahiro Yamada Acked-by: Geert Uytterhoeven Acked-by: Helge Deller # parisc Acked-by: Max Filippov # xtensa --- arch/m68k/Makefile | 2 +- arch/mips/Makefile | 2 +- arch/parisc/Makefile | 2 +- arch/sh/Makefile | 2 +- arch/xtensa/Makefile | 6 +----- 5 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile index ea14f2046fb4..82620f14124d 100644 --- a/arch/m68k/Makefile +++ b/arch/m68k/Makefile @@ -16,7 +16,7 @@ KBUILD_DEFCONFIG := multi_defconfig -ifneq ($(SUBARCH),$(ARCH)) +ifdef cross_compiling ifeq ($(CROSS_COMPILE),) CROSS_COMPILE := $(call cc-cross-prefix, \ m68k-linux-gnu- m68k-linux- m68k-unknown-linux-gnu-) diff --git a/arch/mips/Makefile b/arch/mips/Makefile index e71d587af49c..258234c35a09 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -50,7 +50,7 @@ tool-archpref = $(64bit-tool-archpref) UTS_MACHINE := mips64 endif -ifneq ($(SUBARCH),$(ARCH)) +ifdef cross_compiling ifeq ($(CROSS_COMPILE),) CROSS_COMPILE := $(call cc-cross-prefix, $(tool-archpref)-linux- $(tool-archpref)-linux-gnu- $(tool-archpref)-unknown-linux-gnu-) endif diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 7d9f71aa829a..aed8ea29268b 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -41,7 +41,7 @@ endif export LD_BFD -ifneq ($(SUBARCH),$(UTS_MACHINE)) +ifdef cross_compiling ifeq ($(CROSS_COMPILE),) CC_SUFFIXES = linux linux-gnu unknown-linux-gnu CROSS_COMPILE := $(call cc-cross-prefix, \ diff --git a/arch/sh/Makefile b/arch/sh/Makefile index 3bcbf52fb30e..44bcb80e791a 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile @@ -9,7 +9,7 @@ # License. See the file "COPYING" in the main directory of this archive # for more details. # -ifneq ($(SUBARCH),$(ARCH)) +ifdef cross_compiling ifeq ($(CROSS_COMPILE),) CROSS_COMPILE := $(call cc-cross-prefix, sh-linux- sh-linux-gnu- sh-unknown-linux-gnu-) endif diff --git a/arch/xtensa/Makefile b/arch/xtensa/Makefile index ba9fee75e675..e9c8f064c44d 100644 --- a/arch/xtensa/Makefile +++ b/arch/xtensa/Makefile @@ -19,12 +19,8 @@ variant-y := $(patsubst "%",%,$(CONFIG_XTENSA_VARIANT_NAME)) VARIANT = $(variant-y) export VARIANT -# Test for cross compiling - ifneq ($(VARIANT),) - COMPILE_ARCH = $(shell uname -m) - - ifneq ($(COMPILE_ARCH), xtensa) + ifdef cross_compiling ifndef CROSS_COMPILE CROSS_COMPILE = xtensa_$(VARIANT)- endif From 4d6a38da8e79e94cbd1344aa90876f0f805db705 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 28 Apr 2021 12:15:55 +0100 Subject: [PATCH 1059/1379] arm64: entry: always set GIC_PRIO_PSR_I_SET during entry Zenghui reports that booting a kernel with "irqchip.gicv3_pseudo_nmi=1" on the command line hits a warning during kernel entry, due to the way we manipulate the PMR. Early in the entry sequence, we call lockdep_hardirqs_off() to inform lockdep that interrupts have been masked (as the HW sets DAIF wqhen entering an exception). Architecturally PMR_EL1 is not affected by exception entry, and we don't set GIC_PRIO_PSR_I_SET in the PMR early in the exception entry sequence, so early in exception entry the PMR can indicate that interrupts are unmasked even though they are masked by DAIF. If DEBUG_LOCKDEP is selected, lockdep_hardirqs_off() will check that interrupts are masked, before we set GIC_PRIO_PSR_I_SET in any of the exception entry paths, and hence lockdep_hardirqs_off() will WARN() that something is amiss. We can avoid this by consistently setting GIC_PRIO_PSR_I_SET during exception entry so that kernel code sees a consistent environment. We must also update local_daif_inherit() to undo this, as currently only touches DAIF. For other paths, local_daif_restore() will update both DAIF and the PMR. With this done, we can remove the existing special cases which set this later in the entry code. We always use (GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET) for consistency with local_daif_save(), as this will warn if it ever encounters (GIC_PRIO_IRQOFF | GIC_PRIO_PSR_I_SET), and never sets this itself. This matches the gic_prio_kentry_setup that we have to retain for ret_to_user. The original splat from Zenghui's report was: | DEBUG_LOCKS_WARN_ON(!irqs_disabled()) | WARNING: CPU: 3 PID: 125 at kernel/locking/lockdep.c:4258 lockdep_hardirqs_off+0xd4/0xe8 | Modules linked in: | CPU: 3 PID: 125 Comm: modprobe Tainted: G W 5.12.0-rc8+ #463 | Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 | pstate: 604003c5 (nZCv DAIF +PAN -UAO -TCO BTYPE=--) | pc : lockdep_hardirqs_off+0xd4/0xe8 | lr : lockdep_hardirqs_off+0xd4/0xe8 | sp : ffff80002a39bad0 | pmr_save: 000000e0 | x29: ffff80002a39bad0 x28: ffff0000de214bc0 | x27: ffff0000de1c0400 x26: 000000000049b328 | x25: 0000000000406f30 x24: ffff0000de1c00a0 | x23: 0000000020400005 x22: ffff8000105f747c | x21: 0000000096000044 x20: 0000000000498ef9 | x19: ffff80002a39bc88 x18: ffffffffffffffff | x17: 0000000000000000 x16: ffff800011c61eb0 | x15: ffff800011700a88 x14: 0720072007200720 | x13: 0720072007200720 x12: 0720072007200720 | x11: 0720072007200720 x10: 0720072007200720 | x9 : ffff80002a39bad0 x8 : ffff80002a39bad0 | x7 : ffff8000119f0800 x6 : c0000000ffff7fff | x5 : ffff8000119f07a8 x4 : 0000000000000001 | x3 : 9bcdab23f2432800 x2 : ffff800011730538 | x1 : 9bcdab23f2432800 x0 : 0000000000000000 | Call trace: | lockdep_hardirqs_off+0xd4/0xe8 | enter_from_kernel_mode.isra.5+0x7c/0xa8 | el1_abort+0x24/0x100 | el1_sync_handler+0x80/0xd0 | el1_sync+0x6c/0x100 | __arch_clear_user+0xc/0x90 | load_elf_binary+0x9fc/0x1450 | bprm_execve+0x404/0x880 | kernel_execve+0x180/0x188 | call_usermodehelper_exec_async+0xdc/0x158 | ret_from_fork+0x10/0x18 Fixes: 23529049c684 ("arm64: entry: fix non-NMI user<->kernel transitions") Fixes: 7cd1ea1010ac ("arm64: entry: fix non-NMI kernel<->kernel transitions") Fixes: f0cd5ac1e4c5 ("arm64: entry: fix NMI {user, kernel}->kernel transitions") Fixes: 2a9b3e6ac69a ("arm64: entry: fix EL1 debug transitions") Link: https://lore.kernel.org/r/f4012761-026f-4e51-3a0c-7524e434e8b3@huawei.com Signed-off-by: Mark Rutland Reported-by: Zenghui Yu Cc: Marc Zyngier Cc: Will Deacon Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20210428111555.50880-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/daifflags.h | 3 +++ arch/arm64/kernel/entry-common.c | 17 ----------------- arch/arm64/kernel/entry.S | 15 ++------------- 3 files changed, 5 insertions(+), 30 deletions(-) diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index 5eb7af9c4557..55f57dfa8e2f 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -131,6 +131,9 @@ static inline void local_daif_inherit(struct pt_regs *regs) if (interrupts_enabled(regs)) trace_hardirqs_on(); + if (system_uses_irq_prio_masking()) + gic_write_pmr(regs->pmr_save); + /* * We can't use local_daif_restore(regs->pstate) here as * system_has_prio_mask_debugging() won't restore the I bit if it can diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index a1ec351c36bd..340d04e13617 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -230,14 +230,6 @@ static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - /* - * The CPU masked interrupts, and we are leaving them masked during - * do_debug_exception(). Update PMR as if we had called - * local_daif_mask(). - */ - if (system_uses_irq_prio_masking()) - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - arm64_enter_el1_dbg(regs); if (!cortex_a76_erratum_1463225_debug_handler(regs)) do_debug_exception(far, esr, regs); @@ -404,9 +396,6 @@ static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr) /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */ unsigned long far = read_sysreg(far_el1); - if (system_uses_irq_prio_masking()) - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - enter_from_user_mode(); do_debug_exception(far, esr, regs); local_daif_restore(DAIF_PROCCTX_NOIRQ); @@ -414,9 +403,6 @@ static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc(struct pt_regs *regs) { - if (system_uses_irq_prio_masking()) - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - enter_from_user_mode(); cortex_a76_erratum_1463225_svc_handler(); do_el0_svc(regs); @@ -492,9 +478,6 @@ static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) static void noinstr el0_svc_compat(struct pt_regs *regs) { - if (system_uses_irq_prio_masking()) - gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - enter_from_user_mode(); cortex_a76_erratum_1463225_svc_handler(); do_el0_svc_compat(regs); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 806a39635482..7be1c194409b 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -312,6 +312,8 @@ alternative_else_nop_endif alternative_if ARM64_HAS_IRQ_PRIO_MASKING mrs_s x20, SYS_ICC_PMR_EL1 str x20, [sp, #S_PMR_SAVE] + mov x20, #GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET + msr_s SYS_ICC_PMR_EL1, x20 alternative_else_nop_endif /* Re-enable tag checking (TCO set on exception entry) */ @@ -548,17 +550,7 @@ tsk .req x28 // current thread_info #endif .endm - .macro gic_prio_irq_setup, pmr:req, tmp:req -#ifdef CONFIG_ARM64_PSEUDO_NMI - alternative_if ARM64_HAS_IRQ_PRIO_MASKING - orr \tmp, \pmr, #GIC_PRIO_PSR_I_SET - msr_s SYS_ICC_PMR_EL1, \tmp - alternative_else_nop_endif -#endif - .endm - .macro el1_interrupt_handler, handler:req - gic_prio_irq_setup pmr=x20, tmp=x1 enable_da mov x0, sp @@ -588,7 +580,6 @@ alternative_else_nop_endif .endm .macro el0_interrupt_handler, handler:req - gic_prio_irq_setup pmr=x20, tmp=x0 user_exit_irqoff enable_da @@ -786,7 +777,6 @@ SYM_CODE_END(el0_fiq) SYM_CODE_START_LOCAL(el1_error) kernel_entry 1 mrs x1, esr_el1 - gic_prio_kentry_setup tmp=x2 enable_dbg mov x0, sp bl do_serror @@ -797,7 +787,6 @@ SYM_CODE_START_LOCAL(el0_error) kernel_entry 0 el0_error_naked: mrs x25, esr_el1 - gic_prio_kentry_setup tmp=x2 user_exit_irqoff enable_dbg mov x0, sp From 44f87191d105519cdf37fb0d4988006ea04eb34e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 3 May 2021 03:09:55 +0900 Subject: [PATCH 1060/1379] kbuild: parameterize the .o part of suffix-search The suffix-search macro hard-codes the suffix, '.o'. Make it a parameter so that the multi-search and real-search macros can be reused for foo-dtbs syntax introduced by commit 15d16d6dadf6 ("kbuild: Add generic rule to apply fdtoverlay"). Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index c0982b8b2b6d..0c8d146879eb 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -44,19 +44,22 @@ else obj-y := $(filter-out %/, $(obj-y)) endif -# Expand $(foo-objs) $(foo-y) by calling $(call suffix-search,foo.o,-objs -y) -suffix-search = $(strip $(foreach s, $2, $($(1:.o=$s)))) +# Expand $(foo-objs) $(foo-y) etc. by replacing their individuals +suffix-search = $(strip $(foreach s, $3, $($(1:%$(strip $2)=%$s)))) +# List composite targets that are constructed by combining other targets +multi-search = $(sort $(foreach m, $1, $(if $(call suffix-search, $m, $2, $3 -), $m))) +# List primitive targets that are compiled from source files +real-search = $(foreach m, $1, $(if $(call suffix-search, $m, $2, $3 -), $(call suffix-search, $m, $2, $3), $m)) + # If $(foo-objs), $(foo-y), $(foo-m), or $(foo-) exists, foo.o is a composite object -multi-search = $(sort $(foreach m, $1, $(if $(call suffix-search, $m, $2 -), $m))) -multi-obj-y := $(call multi-search,$(obj-y),-objs -y) -multi-obj-m := $(call multi-search,$(obj-m),-objs -y -m) +multi-obj-y := $(call multi-search, $(obj-y), .o, -objs -y) +multi-obj-m := $(call multi-search, $(obj-m), .o, -objs -y -m) multi-obj-ym := $(multi-obj-y) $(multi-obj-m) # Replace multi-part objects by their individual parts, # including built-in.a from subdirectories -real-search = $(foreach m, $1, $(if $(call suffix-search, $m, $2 -), $(call suffix-search, $m, $2), $m)) -real-obj-y := $(call real-search, $(obj-y),-objs -y) -real-obj-m := $(call real-search, $(obj-m),-objs -y -m) +real-obj-y := $(call real-search, $(obj-y), .o, -objs -y) +real-obj-m := $(call real-search, $(obj-m), .o, -objs -y -m) always-y += $(always-m) From bcf0c6642833673830ee9d9b40862a4c476d1565 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 3 May 2021 03:09:56 +0900 Subject: [PATCH 1061/1379] kbuild: refactor fdtoverlay rule Rename overlay-y to multi-dtb-y, which is a consistent name with multi-obj-y. Also, use multi-search to avoid code duplication. Introduce real-dtb-y, which is a consistent name with real-obj-y, to contain primitive blobs compiled from *.dts. This is used to calculate the list of *.dt.yaml files. Set -@ to base DTB without using $(eval ). Signed-off-by: Masahiro Yamada --- scripts/Makefile.build | 2 +- scripts/Makefile.lib | 33 +++++++++++++++------------------ 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 5e39b0517186..949f723efe53 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -354,7 +354,7 @@ $(obj)/%.o: $(src)/%.S $(objtool_dep) FORCE targets += $(filter-out $(subdir-builtin), $(real-obj-y)) targets += $(filter-out $(subdir-modorder), $(real-obj-m)) -targets += $(lib-y) $(always-y) $(MAKECMDGOALS) +targets += $(real-dtb-y) $(lib-y) $(always-y) $(MAKECMDGOALS) # Linker scripts preprocessor (.lds.S -> .lds) # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 0c8d146879eb..deac60727e48 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -78,24 +78,18 @@ always-y += $(userprogs-always-y) $(userprogs-always-m) # If CONFIG_OF_ALL_DTBS is enabled, all DT blobs are built dtb-$(CONFIG_OF_ALL_DTBS) += $(dtb-) -# List all dtbs to be generated by fdtoverlay -overlay-y := $(foreach m,$(dtb-y), $(if $(strip $($(m:.dtb=-dtbs))),$(m),)) - -# Generate symbols for the base files so overlays can be applied to them. -$(foreach m,$(overlay-y), $(eval DTC_FLAGS_$(basename $(firstword $($(m:.dtb=-dtbs)))) += -@)) - -# Add base dtb and overlay dtbo -dtb-y += $(foreach m,$(overlay-y), $($(m:.dtb=-dtbs))) +# Composite DTB (i.e. DTB constructed by overlay) +multi-dtb-y := $(call multi-search, $(dtb-y), .dtb, -dtbs) +# Primitive DTB compiled from *.dts +real-dtb-y := $(call real-search, $(dtb-y), .dtb, -dtbs) +# Base DTB that overlay is applied onto (each first word of $(*-dtbs) expansion) +base-dtb-y := $(foreach m, $(multi-dtb-y), $(firstword $(call suffix-search, $m, .dtb, -dtbs))) always-y += $(dtb-y) ifneq ($(CHECK_DTBS),) -# Don't run schema checks for dtbs created by fdtoverlay as they don't -# have corresponding dts files. -dt-yaml-y := $(filter-out $(overlay-y),$(dtb-y)) - -always-y += $(patsubst %.dtb,%.dt.yaml, $(dt-yaml-y)) -always-y += $(patsubst %.dtbo,%.dt.yaml, $(dt-yaml-y)) +always-y += $(patsubst %.dtb,%.dt.yaml, $(real-dtb-y)) +always-y += $(patsubst %.dtbo,%.dt.yaml, $(real-dtb-y)) endif # Add subdir path @@ -108,6 +102,8 @@ lib-y := $(addprefix $(obj)/,$(lib-y)) real-obj-y := $(addprefix $(obj)/,$(real-obj-y)) real-obj-m := $(addprefix $(obj)/,$(real-obj-m)) multi-obj-m := $(addprefix $(obj)/, $(multi-obj-m)) +multi-dtb-y := $(addprefix $(obj)/, $(multi-dtb-y)) +real-dtb-y := $(addprefix $(obj)/, $(real-dtb-y)) subdir-ym := $(addprefix $(obj)/,$(subdir-ym)) # Finds the multi-part object the current object will be linked into. @@ -325,6 +321,9 @@ endif DTC_FLAGS += $(DTC_FLAGS_$(basetarget)) +# Set -@ if the target is a base DTB that overlay is applied onto +DTC_FLAGS += $(if $(filter $(patsubst $(obj)/%,%,$@), $(base-dtb-y)), -@) + # Generate an assembly file to wrap the output of the device tree compiler quiet_cmd_dt_S_dtb= DTB $@ cmd_dt_S_dtb= \ @@ -356,14 +355,12 @@ $(obj)/%.dtb: $(src)/%.dts $(DTC) FORCE $(obj)/%.dtbo: $(src)/%.dts $(DTC) FORCE $(call if_changed_dep,dtc) -overlay-y := $(addprefix $(obj)/, $(overlay-y)) - quiet_cmd_fdtoverlay = DTOVL $@ cmd_fdtoverlay = $(objtree)/scripts/dtc/fdtoverlay -o $@ -i $(real-prereqs) -$(overlay-y): FORCE +$(multi-dtb-y): FORCE $(call if_changed,fdtoverlay) -$(call multi_depend, $(overlay-y), .dtb, -dtbs) +$(call multi_depend, $(multi-dtb-y), .dtb, -dtbs) DT_CHECKER ?= dt-validate DT_CHECKER_FLAGS ?= $(if $(DT_SCHEMA_FILES),,-m) From d4452837ffbeb59e18f2499ef907579a618d623d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 3 May 2021 03:09:57 +0900 Subject: [PATCH 1062/1379] kbuild: refactor modname-multi by using suffix-search Improve the readability slightly. Signed-off-by: Masahiro Yamada --- scripts/Makefile.lib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index deac60727e48..10950559b223 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -109,7 +109,7 @@ subdir-ym := $(addprefix $(obj)/,$(subdir-ym)) # Finds the multi-part object the current object will be linked into. # If the object belongs to two or more multi-part objects, list them all. modname-multi = $(sort $(foreach m,$(multi-obj-ym),\ - $(if $(filter $*.o, $($(m:.o=-objs)) $($(m:.o=-y)) $($(m:.o=-m))),$(m:.o=)))) + $(if $(filter $*.o, $(call suffix-search, $m, .o, -objs -y -m)),$(m:.o=)))) __modname = $(if $(modname-multi),$(modname-multi),$(basetarget)) From 19c8d912837e45e99b2991228adfc4419ffff248 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 4 May 2021 19:10:56 +0900 Subject: [PATCH 1063/1379] kbuild: make distclean work against $(objtree) instead of $(srctree) This reverts the old commit [1], which seems questionable to me. It claimed 'make distclean' could not remove editor backup files, but I believe KBUILD_OUTPUT or O= was set. When O= is given, Kbuild should always work against $(objtree). If O= is not given, $(objtree) and $(srctree) are the same, therefore $(srctree) is cleaned up. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=dd47df980c02eb33833b2690b033c34fba2fa80d Signed-off-by: Masahiro Yamada --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index cbf4b18cf51c..560408808c33 100644 --- a/Makefile +++ b/Makefile @@ -1546,7 +1546,7 @@ PHONY += distclean distclean: mrproper $(call cmd,rmfiles) - @find $(srctree) $(RCS_FIND_IGNORE) \ + @find . $(RCS_FIND_IGNORE) \ \( -name '*.orig' -o -name '*.rej' -o -name '*~' \ -o -name '*.bak' -o -name '#*#' -o -name '*%' \ -o -name 'core' \) \ From 7a02cec523a90fec78634c655e2470f72d2fdcbf Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 4 May 2021 19:10:57 +0900 Subject: [PATCH 1064/1379] kbuild: make distclean remove tag files in sub-directories 'make tags' and friends create tag files in the top directory, but people may manually create tag files in sub-directories. Signed-off-by: Masahiro Yamada --- Makefile | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 560408808c33..2cbf68dc40b7 100644 --- a/Makefile +++ b/Makefile @@ -1511,9 +1511,6 @@ MRPROPER_FILES += include/config include/generated \ signing_key.x509.signer vmlinux-gdb.py \ *.spec -# Directories & files removed with 'make distclean' -DISTCLEAN_FILES += tags TAGS cscope* GPATH GTAGS GRTAGS GSYMS - # clean - Delete most, but leave enough to build external modules # clean: rm-files := $(CLEAN_FILES) @@ -1540,16 +1537,14 @@ mrproper: clean $(mrproper-dirs) # distclean # -distclean: rm-files := $(wildcard $(DISTCLEAN_FILES)) - PHONY += distclean distclean: mrproper - $(call cmd,rmfiles) @find . $(RCS_FIND_IGNORE) \ \( -name '*.orig' -o -name '*.rej' -o -name '*~' \ -o -name '*.bak' -o -name '#*#' -o -name '*%' \ - -o -name 'core' \) \ + -o -name 'core' -o -name tags -o -name TAGS -o -name 'cscope*' \ + -o -name GPATH -o -name GRTAGS -o -name GSYMS -o -name GTAGS \) \ -type f -print | xargs rm -f From 11122b860bc52a09c779c3de9415436794fb5605 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 4 May 2021 19:10:58 +0900 Subject: [PATCH 1065/1379] kbuild: remove the unneeded comments for external module builds The supported targets for external modules are listed in the help target a few lines below. Let's not have duplicated information in two places. Signed-off-by: Masahiro Yamada --- Makefile | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 2cbf68dc40b7..9b9a003bd88f 100644 --- a/Makefile +++ b/Makefile @@ -1711,17 +1711,7 @@ else # KBUILD_EXTMOD # When building external modules the kernel used as basis is considered # read-only, and no consistency checks are made and the make # system is not used on the basis kernel. If updates are required -# in the basis kernel ordinary make commands (without M=...) must -# be used. -# -# The following are the only valid targets when building external -# modules. -# make M=dir clean Delete all automatically generated files -# make M=dir modules Make all modules in specified dir -# make M=dir Same as 'make M=dir modules' -# make M=dir modules_install -# Install the modules built in the module directory -# Assumes install directory is already created +# in the basis kernel ordinary make commands (without M=...) must be used. # We are always building only modules. KBUILD_BUILTIN := From 51eb95e2da41802454f48b9afeb4d96a77295035 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 4 May 2021 20:35:27 -0700 Subject: [PATCH 1066/1379] kbuild: Don't remove link-vmlinux temporary files on exit/signal Keep them around until they are cleaned up by make clean. This uses a bit more disk space, but makes it easier to debug any problems with the kernel link process. Suggested-by: Masahiro Yamada Signed-off-by: Andi Kleen Signed-off-by: Masahiro Yamada --- scripts/link-vmlinux.sh | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 7d112681f332..f4de4c97015b 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -320,20 +320,6 @@ cleanup() rm -f .vmlinux.d } -on_exit() -{ - if [ $? -ne 0 ]; then - cleanup - fi -} -trap on_exit EXIT - -on_signals() -{ - exit 1 -} -trap on_signals HUP INT QUIT TERM - # Use "make V=1" to debug this script case "${KBUILD_VERBOSE}" in *1*) From 5d8505fd039c1e757ad3490e46fe0fe73d78e2e0 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 5 May 2021 14:28:45 +0100 Subject: [PATCH 1067/1379] arm64: Fix the documented event stream frequency It should be 10KHz, matching the ARCH_TIMER_EVT_STREAM_PERIOD_US of 100us. Note that this is only a documentation bug. Fixes: 611a7bc74ed2 ("arm64: docs: describe ELF hwcaps") Signed-off-by: Catalin Marinas Cc: Mark Rutland Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210505132845.23698-1-catalin.marinas@arm.com Signed-off-by: Catalin Marinas --- Documentation/arm64/elf_hwcaps.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst index 87821662eeb2..ec1a5a63c1d0 100644 --- a/Documentation/arm64/elf_hwcaps.rst +++ b/Documentation/arm64/elf_hwcaps.rst @@ -74,7 +74,7 @@ HWCAP_ASIMD HWCAP_EVTSTRM The generic timer is configured to generate events at a frequency of - approximately 100KHz. + approximately 10KHz. HWCAP_AES Functionality implied by ID_AA64ISAR0_EL1.AES == 0b0001. From 7716506adac4664793a9d6d3dfa31ffddfa98714 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:45 -0700 Subject: [PATCH 1068/1379] mm: introduce and use mapping_empty() Patch series "Remove nrexceptional tracking", v2. We actually use nrexceptional for very little these days. It's a minor pain to keep in sync with nrpages, but the pain becomes much bigger with the THP patches because we don't know how many indices a shadow entry occupies. It's easier to just remove it than keep it accurate. Also, we save 8 bytes per inode which is nothing to sneeze at; on my laptop, it would improve shmem_inode_cache from 22 to 23 objects per 16kB, and inode_cache from 26 to 27 objects. Combined, that saves a megabyte of memory from a combined usage of 25MB for both caches. Unfortunately, ext4 doesn't cross a magic boundary, so it doesn't save any memory for ext4. This patch (of 4): Instead of checking the two counters (nrpages and nrexceptional), we can just check whether i_pages is empty. Link: https://lkml.kernel.org/r/20201026151849.24232-1-willy@infradead.org Link: https://lkml.kernel.org/r/20201026151849.24232-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 2 +- fs/dax.c | 2 +- fs/gfs2/glock.c | 3 +-- include/linux/pagemap.h | 5 +++++ mm/truncate.c | 18 +++--------------- 5 files changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index a5244e08b6c8..9114e0a0e7b4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) return; invalidate_bh_lrus(); diff --git a/fs/dax.c b/fs/dax.c index b3d27fdc6775..999f3f22aea3 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -965,7 +965,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; - if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) return 0; trace_dax_writeback_range(inode, xas.xa_index, end_index); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 84c38103aa06..ea7fc5c641c7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -273,8 +273,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) if (mapping) { truncate_inode_pages_final(mapping); if (!gfs2_withdrawn(sdp)) - GLOCK_BUG_ON(gl, mapping->nrpages || - mapping->nrexceptional); + GLOCK_BUG_ON(gl, !mapping_empty(mapping)); } trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 469fa7ffcf96..a4bd41128bf3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -18,6 +18,11 @@ struct pagevec; +static inline bool mapping_empty(struct address_space *mapping) +{ + return xa_empty(&mapping->i_pages); +} + /* * Bits in mapping->flags. */ diff --git a/mm/truncate.c b/mm/truncate.c index 455944264663..adb8d4107988 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -295,7 +295,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t index; int i; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) goto out; /* Offsets within partial pages */ @@ -440,9 +440,6 @@ EXPORT_SYMBOL(truncate_inode_pages); */ void truncate_inode_pages_final(struct address_space *mapping) { - unsigned long nrexceptional; - unsigned long nrpages; - /* * Page reclaim can not participate in regular inode lifetime * management (can't call iput()) and thus can race with the @@ -452,16 +449,7 @@ void truncate_inode_pages_final(struct address_space *mapping) */ mapping_set_exiting(mapping); - /* - * When reclaim installs eviction entries, it increases - * nrexceptional first, then decreases nrpages. Make sure we see - * this in the right order or we might miss an entry. - */ - nrpages = mapping->nrpages; - smp_rmb(); - nrexceptional = mapping->nrexceptional; - - if (nrpages || nrexceptional) { + if (!mapping_empty(mapping)) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree @@ -633,7 +621,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int ret2 = 0; int did_range_unmap = 0; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) goto out; pagevec_init(&pvec); From 46be67b424efab933562a29ea8f1df0c20aa9959 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:48 -0700 Subject: [PATCH 1069/1379] mm: stop accounting shadow entries We no longer need to keep track of how many shadow entries are present in a mapping. This saves a few writes to the inode and memory barriers. Link: https://lkml.kernel.org/r/20201026151849.24232-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 13 ------------- mm/swap_state.c | 4 ---- mm/truncate.c | 1 - mm/workingset.c | 1 - 4 files changed, 19 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 5be57ba01d33..d08ff1504e64 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -142,17 +142,6 @@ static void page_cache_delete(struct address_space *mapping, page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - - if (shadow) { - mapping->nrexceptional += nr; - /* - * Make sure the nrexceptional update is committed before - * the nrpages update so that final truncate racing - * with reclaim does not see both counters 0 at the - * same time and miss a shadow entry. - */ - smp_wmb(); - } mapping->nrpages -= nr; } @@ -925,8 +914,6 @@ noinline int __add_to_page_cache_locked(struct page *page, if (xas_error(&xas)) goto unlock; - if (old) - mapping->nrexceptional--; mapping->nrpages++; /* hugetlb pages do not participate in page cache accounting */ diff --git a/mm/swap_state.c b/mm/swap_state.c index fb7efa08fe57..3a1259c13f3b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -132,7 +132,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, xas_store(&xas, page); xas_next(&xas); } - address_space->nrexceptional -= nr_shadows; address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); @@ -172,8 +171,6 @@ void __delete_from_swap_cache(struct page *page, xas_next(&xas); } ClearPageSwapCache(page); - if (shadow) - address_space->nrexceptional += nr; address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); @@ -275,7 +272,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, xas_store(&xas, NULL); nr_shadows++; } - address_space->nrexceptional -= nr_shadows; xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ diff --git a/mm/truncate.c b/mm/truncate.c index adb8d4107988..95af244b112a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -40,7 +40,6 @@ static inline void __clear_shadow_entry(struct address_space *mapping, if (xas_load(&xas) != entry) return; xas_store(&xas, NULL); - mapping->nrexceptional--; } static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, diff --git a/mm/workingset.c b/mm/workingset.c index cd39902c1062..b7cdeca5a76d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -554,7 +554,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; - mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); From 7f0e07fb0289519af7e726e4f7b7118f7ecc979b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:51 -0700 Subject: [PATCH 1070/1379] dax: account DAX entries as nrpages Simplify mapping_needs_writeback() by accounting DAX entries as pages instead of exceptional entries. Link: https://lkml.kernel.org/r/20201026151849.24232-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 6 +++--- mm/filemap.c | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 999f3f22aea3..69216241392f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -525,7 +525,7 @@ retry: dax_disassociate_entry(entry, mapping, false); xas_store(xas, NULL); /* undo the PMD join */ dax_wake_entry(xas, entry, true); - mapping->nrexceptional--; + mapping->nrpages -= PG_PMD_NR; entry = NULL; xas_set(xas, index); } @@ -541,7 +541,7 @@ retry: dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; - mapping->nrexceptional++; + mapping->nrpages += 1UL << order; } out_unlock: @@ -661,7 +661,7 @@ static int __dax_invalidate_entry(struct address_space *mapping, goto out; dax_disassociate_entry(entry, mapping, trunc); xas_store(&xas, NULL); - mapping->nrexceptional--; + mapping->nrpages -= 1UL << dax_entry_order(entry); ret = 1; out: put_unlocked_entry(&xas, entry); diff --git a/mm/filemap.c b/mm/filemap.c index d08ff1504e64..ecc5f8a4c488 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -618,9 +618,6 @@ EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /* Returns true if writeback might be needed or already in progress. */ static bool mapping_needs_writeback(struct address_space *mapping) { - if (dax_mapping(mapping)) - return mapping->nrexceptional; - return mapping->nrpages; } From 8bc3c481b3d0dcef2cf8e1b7c6b780af6725f7e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 4 May 2021 18:32:54 -0700 Subject: [PATCH 1071/1379] mm: remove nrexceptional from inode We no longer track anything in nrexceptional, so remove it, saving 8 bytes per inode. Link: https://lkml.kernel.org/r/20201026151849.24232-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Vishal Verma Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 2 +- include/linux/fs.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 9e192bea0630..af48d1b722f0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -529,7 +529,7 @@ void clear_inode(struct inode *inode) */ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); - BUG_ON(inode->i_data.nrexceptional); + BUG_ON(!mapping_empty(&inode->i_data)); xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 12766edee81f..acef282b97c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -442,7 +442,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping, * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. - * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). @@ -463,7 +462,6 @@ struct address_space { struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; - unsigned long nrexceptional; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; From 786b31121a2ce4309a81a7f36d63f02ca588839e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:32:57 -0700 Subject: [PATCH 1072/1379] mm: remove nrexceptional from inode: remove BUG_ON clear_inode()'s BUG_ON(!mapping_empty(&inode->i_data)) is unsafe: we know of two ways in which nodes can and do (on rare occasions) get left behind. Until those are fixed, do not BUG_ON() nor even WARN_ON(). Yes, this will then leak those nodes (or the next user of the struct inode may use them); but this has been happening for years, and the new BUG_ON(!mapping_empty) was only guilty of revealing that. A proper fix will follow, but no hurry. Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2104292229380.16080@eggly.anvils Signed-off-by: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index af48d1b722f0..c93500d84264 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -529,7 +529,14 @@ void clear_inode(struct inode *inode) */ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); - BUG_ON(!mapping_empty(&inode->i_data)); + /* + * Almost always, mapping_empty(&inode->i_data) here; but there are + * two known and long-standing ways in which nodes may get left behind + * (when deep radix-tree node allocation failed partway; or when THP + * collapse_file() failed). Until those two known cases are cleaned up, + * or a cleanup function is called here, do not BUG_ON(!mapping_empty), + * nor even WARN_ON(!mapping_empty). + */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); From aec44e0f0213e36d4f0868a80cdc5097a510f79d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:00 -0700 Subject: [PATCH 1073/1379] hugetlb: pass vma into huge_pte_alloc() and huge_pmd_share() Patch series "hugetlb: Disable huge pmd unshare for uffd-wp", v4. This series tries to disable huge pmd unshare of hugetlbfs backed memory for uffd-wp. Although uffd-wp of hugetlbfs is still during rfc stage, the idea of this series may be needed for multiple tasks (Axel's uffd minor fault series, and Mike's soft dirty series), so I picked it out from the larger series. This patch (of 4): It is a preparation work to be able to behave differently in the per architecture huge_pte_alloc() according to different VMA attributes. Pass it deeper into huge_pmd_share() so that we can avoid the find_vma() call. [peterx@redhat.com: build fix] Link: https://lkml.kernel.org/r/20210304164653.GB397383@xz-x1Link: https://lkml.kernel.org/r/20210218230633.15028-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210218230633.15028-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Mike Kravetz Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 4 ++-- arch/ia64/mm/hugetlbpage.c | 3 ++- arch/mips/mm/hugetlbpage.c | 4 ++-- arch/parisc/mm/hugetlbpage.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 3 ++- arch/s390/mm/hugetlbpage.c | 2 +- arch/sh/mm/hugetlbpage.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- include/linux/hugetlb.h | 5 +++-- mm/hugetlb.c | 15 ++++++++------- mm/userfaultfd.c | 2 +- 11 files changed, 24 insertions(+), 20 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 55ecf6de9ff7..6e3bcffe2837 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -252,7 +252,7 @@ void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, pte); } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; @@ -286,7 +286,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } else if (sz == PMD_SIZE) { if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && pud_none(READ_ONCE(*pudp))) - ptep = huge_pmd_share(mm, addr, pudp); + ptep = huge_pmd_share(mm, vma, addr, pudp); else ptep = (pte_t *)pmd_alloc(mm, pudp, addr); } else if (sz == (CONT_PMD_SIZE)) { diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index b331f94d20ac..f993cb36c062 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -25,7 +25,8 @@ unsigned int hpage_shift = HPAGE_SHIFT_DEFAULT; EXPORT_SYMBOL(hpage_shift); pte_t * -huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) +huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index b9f76f433617..7eaff5b07873 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -21,8 +21,8 @@ #include #include -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, - unsigned long sz) +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index 43652de5f139..d1d3990b83f6 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -44,7 +44,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d142b76d507d..9a75ba078e1b 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -106,7 +106,8 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, * At this point we do the placement change only for BOOK3S 64. This would * possibly work on other subarchs. */ -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) { pgd_t *pg; p4d_t *p4; diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 3b5a4d25ca9b..da36d13ffc16 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -189,7 +189,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return pte; } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 220d7bc43d2b..999ab5916e69 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -21,7 +21,7 @@ #include #include -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index ad4b42f04988..04d8790f6c32 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -279,7 +279,7 @@ unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift(*(pte_t *)&p unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift(*(pte_t *)&pmd); } unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift(pte); } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index cccd1aab69dd..653ef322fac9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -152,7 +152,8 @@ void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud); struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); @@ -161,7 +162,7 @@ extern struct list_head huge_boot_pages; /* arch callbacks */ -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6c72433bec1e..a02a651088d3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3795,7 +3795,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr, sz); + dst_pte = huge_pte_alloc(dst, vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; @@ -4563,7 +4563,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ mapping = vma->vm_file->f_mapping; i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); if (!ptep) { i_mmap_unlock_read(mapping); return VM_FAULT_OOM; @@ -5370,9 +5370,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is * only required for subsequent processing. */ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { - struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -5450,7 +5450,8 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, } #define want_pmd_share() (1) #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { return NULL; } @@ -5469,7 +5470,7 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; @@ -5488,7 +5489,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, } else { BUG_ON(sz != PMD_SIZE); if (want_pmd_share() && pud_none(*pud)) - pte = huge_pmd_share(mm, addr, pud); + pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9a3d451402d7..063cbb17e8d8 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -290,7 +290,7 @@ retry: mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; - dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize); + dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); From c1991e0705d143be773c984b006f2078aa9f2853 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:04 -0700 Subject: [PATCH 1074/1379] hugetlb/userfaultfd: forbid huge pmd sharing when uffd enabled Huge pmd sharing could bring problem to userfaultfd. The thing is that userfaultfd is running its logic based on the special bits on page table entries, however the huge pmd sharing could potentially share page table entries for different address ranges. That could cause issues on either: - When sharing huge pmd page tables for an uffd write protected range, the newly mapped huge pmd range will also be write protected unexpectedly, or, - When we try to write protect a range of huge pmd shared range, we'll first do huge_pmd_unshare() in hugetlb_change_protection(), however that also means the UFFDIO_WRITEPROTECT could be silently skipped for the shared region, which could lead to data loss. While at it, a few other things are done altogether: - Move want_pmd_share() from mm/hugetlb.c into linux/hugetlb.h, because that's definitely something that arch code would like to use too - ARM64 currently directly check against CONFIG_ARCH_WANT_HUGE_PMD_SHARE when trying to share huge pmd. Switch to the want_pmd_share() helper. - Move vma_shareable() from huge_pmd_share() into want_pmd_share(). [peterx@redhat.com: fix build with !ARCH_WANT_HUGE_PMD_SHARE] Link: https://lkml.kernel.org/r/20210310185359.88297-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210218231202.15426-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: Axel Rasmussen Tested-by: Naresh Kamboju Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 3 +-- include/linux/hugetlb.h | 2 ++ include/linux/userfaultfd_k.h | 9 +++++++++ mm/hugetlb.c | 22 ++++++++++++++++------ 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 6e3bcffe2837..58987a98e179 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -284,8 +284,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, */ ptep = pte_alloc_map(mm, pmdp, addr); } else if (sz == PMD_SIZE) { - if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && - pud_none(READ_ONCE(*pudp))) + if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp))) ptep = huge_pmd_share(mm, vma, addr, pudp); else ptep = (pte_t *)pmd_alloc(mm, pudp, addr); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 653ef322fac9..88e93809a455 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1040,4 +1040,6 @@ static inline __init void hugetlb_cma_check(void) } #endif +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a8e5f3ea9bb2..c63ccdae3eab 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -52,6 +52,15 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; } +/* + * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp + * protect information is per pgtable entry. + */ +static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_WP; +} + static inline bool userfaultfd_missing(struct vm_area_struct *vma) { return vma->vm_flags & VM_UFFD_MISSING; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a02a651088d3..91647e824015 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5326,6 +5326,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) return false; } +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ +#ifdef CONFIG_USERFAULTFD + if (uffd_disable_huge_pmd_share(vma)) + return false; +#endif + return vma_shareable(vma, addr); +} + /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible @@ -5382,9 +5391,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *pte; spinlock_t *ptl; - if (!vma_shareable(vma, addr)) - return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_assert_locked(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) @@ -5448,7 +5454,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } -#define want_pmd_share() (1) + #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -5466,7 +5472,11 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } -#define want_pmd_share() (0) + +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ + return false; +} #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB @@ -5488,7 +5498,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); - if (want_pmd_share() && pud_none(*pud)) + if (want_pmd_share(vma, addr) && pud_none(*pud)) pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); From 537cf30bba241ae88d5f4b0b6a5e66271b394852 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:08 -0700 Subject: [PATCH 1075/1379] mm/hugetlb: move flush_hugetlb_tlb_range() into hugetlb.h Prepare for it to be called outside of mm/hugetlb.c. Link: https://lkml.kernel.org/r/20210218231204.15474-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: Axel Rasmussen Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 8 ++++++++ mm/hugetlb.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 88e93809a455..e43668144664 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1042,4 +1042,12 @@ static inline __init void hugetlb_cma_check(void) bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); +#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +/* + * ARCHes with special requirements for evicting HUGETLB backing TLB entries can + * implement this. + */ +#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#endif + #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 91647e824015..3868f3126534 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4996,14 +4996,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? i : err; } -#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE -/* - * ARCHes with special requirements for evicting HUGETLB backing TLB entries can - * implement this. - */ -#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { From 6dfeaff93be1a4cab4fb48dad7df326d05059a99 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 May 2021 18:33:13 -0700 Subject: [PATCH 1076/1379] hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp Huge pmd sharing for hugetlbfs is racy with userfaultfd-wp because userfaultfd-wp is always based on pgtable entries, so they cannot be shared. Walk the hugetlb range and unshare all such mappings if there is, right before UFFDIO_REGISTER will succeed and return to userspace. This will pair with want_pmd_share() in hugetlb code so that huge pmd sharing is completely disabled for userfaultfd-wp registered range. Link: https://lkml.kernel.org/r/20210218231206.15524-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Peter Xu Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Lokesh Gidra Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 4 ++++ include/linux/hugetlb.h | 3 +++ mm/hugetlb.c | 51 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0be8cdd4425a..e5ce3b4e6c3d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -1449,6 +1450,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx.ctx = ctx; + if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) + hugetlb_unshare_all_pmds(vma); + skip: prev = vma; start = vma->vm_end; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e43668144664..0f5813522224 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -188,6 +188,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -369,6 +370,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, return 0; } +static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { } + #endif /* !CONFIG_HUGETLB_PAGE */ /* * hugepages at page global directory. If arch support diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3868f3126534..e86d3abcc300 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5691,6 +5691,57 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) } } +/* + * This function will unconditionally remove all the shared pmd pgtable entries + * within the specific vma for a hugetlbfs memory range. + */ +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) +{ + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; + unsigned long address, start, end; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + start = ALIGN(vma->vm_start, PUD_SIZE); + end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); + + if (start >= end) + return; + + /* + * No need to call adjust_range_if_pmd_sharing_possible(), because + * we have already done the PUD_SIZE alignment. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + start, end); + mmu_notifier_invalidate_range_start(&range); + i_mmap_lock_write(vma->vm_file->f_mapping); + for (address = start; address < end; address += PUD_SIZE) { + unsigned long tmp = address; + + ptep = huge_pte_offset(mm, address, sz); + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); + /* We don't want 'address' to be changed */ + huge_pmd_unshare(mm, vma, &tmp, ptep); + spin_unlock(ptl); + } + flush_hugetlb_tlb_range(vma, start, end); + i_mmap_unlock_write(vma->vm_file->f_mapping); + /* + * No need to call mmu_notifier_invalidate_range(), see + * Documentation/vm/mmu_notifier.rst. + */ + mmu_notifier_invalidate_range_end(&range); +} + #ifdef CONFIG_CMA static bool cma_reserve_called __initdata; From 6501fe5f162395ba6dfa6ac86be05f1c24c1a7e0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:16 -0700 Subject: [PATCH 1077/1379] mm/hugetlb: remove redundant reservation check condition in alloc_huge_page() vma_resv_map(vma) checks if a reserve map is associated with the vma. The routine vma_needs_reservation() will check vma_resv_map(vma) and return 1 if no reserv map is present. map_chg is set to the return value of vma_needs_reservation(). Therefore, !vma_resv_map(vma) is redundant in the expression: map_chg || avoid_reserve || !vma_resv_map(vma); Remove the redundant check. [Thanks Mike Kravetz for reshaping this commit message!] Link: https://lkml.kernel.org/r/20210301104726.45159-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e86d3abcc300..c0d4c7784a84 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2316,7 +2316,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, /* If this allocation is not consuming a reservation, charge it now. */ - deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); + deferred_reserve = map_chg || avoid_reserve; if (deferred_reserve) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); From 4bfb68a0858deae4c40ea585037a3261f0717b0a Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:33:19 -0700 Subject: [PATCH 1078/1379] mm: generalize HUGETLB_PAGE_SIZE_VARIABLE HUGETLB_PAGE_SIZE_VARIABLE need not be defined for each individual platform subscribing it. Instead just make it generic. Link: https://lkml.kernel.org/r/1614914928-22039-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Michael Ellerman [powerpc] Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christophe Leroy Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/Kconfig | 6 +----- arch/powerpc/Kconfig | 6 +----- mm/Kconfig | 7 +++++++ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 81e2b893b1e7..fd2cfc65fdc9 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -32,6 +32,7 @@ config IA64 select TTY select HAVE_ARCH_TRACEHOOK select HAVE_VIRT_CPU_ACCOUNTING + select HUGETLB_PAGE_SIZE_VARIABLE if HUGETLB_PAGE select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP @@ -82,11 +83,6 @@ config STACKTRACE_SUPPORT config GENERIC_LOCKBREAK def_bool n -config HUGETLB_PAGE_SIZE_VARIABLE - bool - depends on HUGETLB_PAGE - default y - config GENERIC_CALIBRATE_DELAY bool default y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d060c002595..965278498dd7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -232,6 +232,7 @@ config PPC select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HUGETLB_PAGE_SIZE_VARIABLE if PPC_BOOK3S_64 && HUGETLB_PAGE select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API @@ -416,11 +417,6 @@ config HIGHMEM source "kernel/Kconfig.hz" -config HUGETLB_PAGE_SIZE_VARIABLE - bool - depends on HUGETLB_PAGE && PPC_BOOK3S_64 - default y - config MATH_EMULATION bool "Math emulation" depends on 4xx || PPC_8xx || PPC_MPC832x || BOOKE diff --git a/mm/Kconfig b/mm/Kconfig index 3636da27c385..f4ceeaedb966 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -273,6 +273,13 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config ARCH_ENABLE_THP_MIGRATION bool +config HUGETLB_PAGE_SIZE_VARIABLE + def_bool n + help + Allows the pageblock_order value to be dynamic instead of just standard + HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available + on a platform. + config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA From 04adbc3f7bff403a97355531da0190a263d66ea5 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:22 -0700 Subject: [PATCH 1079/1379] mm/hugetlb: use some helper functions to cleanup code Patch series "Some cleanups for hugetlb". This series contains cleanups to remove unnecessary VM_BUG_ON_PAGE, use helper function and so on. I also collect some previous patches into this series in case they are forgotten. This patch (of 5): We could use pages_per_huge_page to get the number of pages per hugepage, use get_hstate_idx to calculate hstate index, and use hstate_is_gigantic to check if a hstate is gigantic to make code more succinct. Link: https://lkml.kernel.org/r/20210308112809.26107-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210308112809.26107-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- mm/hugetlb.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 701c82c36138..c262566f7c5d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1435,7 +1435,7 @@ static int get_hstate_idx(int page_size_log) if (!h) return -1; - return h - hstates; + return hstate_index(h); } /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c0d4c7784a84..87104dc78e59 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1273,7 +1273,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - unsigned long nr_pages = 1UL << huge_page_order(h); + unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); @@ -3267,10 +3267,10 @@ static int __init hugepages_setup(char *s) /* * Global state is always initialized later in hugetlb_init. - * But we need to allocate >= MAX_ORDER hstates here early to still + * But we need to allocate gigantic hstates here early to still * use the bootmem allocator. */ - if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) + if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; From 5af1ab1d24e0842e2ca72c1fd0833864f6fa458a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:25 -0700 Subject: [PATCH 1080/1379] mm/hugetlb: optimize the surplus state transfer code in move_hugetlb_state() We should not transfer the per-node surplus state when we do not cross the node in order to save some cpu cycles Link: https://lkml.kernel.org/r/20210308112809.26107-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87104dc78e59..c3d47af59137 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5682,6 +5682,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) SetHPageTemporary(oldpage); ClearHPageTemporary(newpage); + /* + * There is no need to transfer the per-node surplus state + * when we do not cross the node. + */ + if (new_nid == old_nid) + return; spin_lock(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; From 5c8ecb131a655e775287380428ac1c764c117ee6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:28 -0700 Subject: [PATCH 1081/1379] mm/hugetlb_cgroup: remove unnecessary VM_BUG_ON_PAGE in hugetlb_cgroup_migrate() !PageHuge(oldhpage) is implicitly checked in page_hstate() above, so we remove this explicit one. Link: https://lkml.kernel.org/r/20210308112809.26107-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 603a131e262d..726b85f4f303 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -784,7 +784,6 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) if (hugetlb_cgroup_disabled()) return; - VM_BUG_ON_PAGE(!PageHuge(oldhpage), oldhpage); spin_lock(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); From d83e6c8a9b65876b0dcd11ca25e8c39bd7bb1a1c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:31 -0700 Subject: [PATCH 1082/1379] mm/hugetlb: simplify the code when alloc_huge_page() failed in hugetlb_no_page() Rework the error handling code when alloc_huge_page() failed to remove some duplicated code and simplify the code slightly. Link: https://lkml.kernel.org/r/20210308112809.26107-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c3d47af59137..c8f0a38588ba 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4395,13 +4395,10 @@ retry: * sure there really is no pte entry. */ ptl = huge_pte_lock(h, mm, ptep); - if (!huge_pte_none(huge_ptep_get(ptep))) { - ret = 0; - spin_unlock(ptl); - goto out; - } + ret = 0; + if (huge_pte_none(huge_ptep_get(ptep))) + ret = vmf_error(PTR_ERR(page)); spin_unlock(ptl); - ret = vmf_error(PTR_ERR(page)); goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); From d4241a049ac0049fe96b3dae0598092517dbf6bd Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:34 -0700 Subject: [PATCH 1083/1379] mm/hugetlb: avoid calculating fault_mutex_hash in truncate_op case The fault_mutex hashing overhead can be avoided in truncate_op case because page faults can not race with truncation in this routine. So calculate hash for fault_mutex only in !truncate_op case to save some cpu cycles. Link: https://lkml.kernel.org/r/20210308112809.26107-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c262566f7c5d..d81f52b87bd7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -482,10 +482,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; - u32 hash; + u32 hash = 0; index = page->index; - hash = hugetlb_fault_mutex_hash(mapping, index); if (!truncate_op) { /* * Only need to hold the fault mutex in the @@ -493,6 +492,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * page faults. Races are not possible in the * case of truncation. */ + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); } From 0edf61e5ee5c334f33bb7bf95d1b470f01ae9fec Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:37 -0700 Subject: [PATCH 1084/1379] khugepaged: remove unneeded return value of khugepaged_collapse_pte_mapped_thps() Patch series "Cleanup and fixup for khugepaged", v2. This series contains cleanups to remove unneeded return value, use helper function and so on. And there is one fix to correct the wrong result value for trace_mm_collapse_huge_page_isolate(). This patch (of 4): The return value of khugepaged_collapse_pte_mapped_thps() is never checked since it's introduced. We should remove such unneeded return value. Link: https://lkml.kernel.org/r/20210306032947.35921-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210306032947.35921-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Kirill A. Shutemov Cc: Rik van Riel Cc: Ebru Akagunduz Cc: Dan Carpenter Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a7d6cb912b05..d43812c5ce16 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1533,16 +1533,16 @@ abort: goto drop_hpage; } -static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) { struct mm_struct *mm = mm_slot->mm; int i; if (likely(mm_slot->nr_pte_mapped_thp == 0)) - return 0; + return; if (!mmap_write_trylock(mm)) - return -EBUSY; + return; if (unlikely(khugepaged_test_exit(mm))) goto out; @@ -1553,7 +1553,6 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) out: mm_slot->nr_pte_mapped_thp = 0; mmap_write_unlock(mm); - return 0; } static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) @@ -2057,9 +2056,8 @@ static void khugepaged_scan_file(struct mm_struct *mm, BUILD_BUG(); } -static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) { - return 0; } #endif From 588d01f918d42d2d453d8cd5af6bf2c2e1072a47 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:40 -0700 Subject: [PATCH 1085/1379] khugepaged: reuse the smp_wmb() inside __SetPageUptodate() smp_wmb() is needed to avoid the copy_huge_page writes to become visible after the set_pmd_at() write here. But we can reuse the smp_wmb() inside __SetPageUptodate() to remove this redundant one. Link: https://lkml.kernel.org/r/20210306032947.35921-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: Ebru Akagunduz Cc: Mike Kravetz Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d43812c5ce16..287e7ecf978c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1183,19 +1183,18 @@ static void collapse_huge_page(struct mm_struct *mm, __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, &compound_pagelist); pte_unmap(pte); + /* + * spin_lock() below is not the equivalent of smp_wmb(), but + * the smp_wmb() inside __SetPageUptodate() can be reused to + * avoid the copy_huge_page writes to become visible after + * the set_pmd_at() write. + */ __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - /* - * spin_lock() below is not the equivalent of smp_wmb(), so - * this is needed to avoid the copy_huge_page writes to become - * visible after the set_pmd_at() write. - */ - smp_wmb(); - spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address, true); From 28ff0a3c421ca19f4c8b41f736ff388fd588e1a1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:43 -0700 Subject: [PATCH 1086/1379] khugepaged: use helper khugepaged_test_exit() in __khugepaged_enter() Commit 4d45e75a9955 ("mm: remove the now-unnecessary mmget_still_valid() hack") have made khugepaged_test_exit() suitable for check mm->mm_users against 0. Use this helper here. Link: https://lkml.kernel.org/r/20210306032947.35921-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: Ebru Akagunduz Cc: Mike Kravetz Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 287e7ecf978c..e886a8618c33 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -481,7 +481,7 @@ int __khugepaged_enter(struct mm_struct *mm) return -ENOMEM; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm); + VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return 0; From 74e579bf231a337ab3786d59e64bc94f45ca7b3f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:46 -0700 Subject: [PATCH 1087/1379] khugepaged: fix wrong result value for trace_mm_collapse_huge_page_isolate() In writable and !referenced case, the result value should be SCAN_LACK_REFERENCED_PAGE for trace_mm_collapse_huge_page_isolate() instead of default 0 (SCAN_FAIL) here. Link: https://lkml.kernel.org/r/20210306032947.35921-5-linmiaohe@huawei.com Fixes: 7d2eba0557c1 ("mm: add tracepoint for scanning pages") Signed-off-by: Miaohe Lin Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: Ebru Akagunduz Cc: Mike Kravetz Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e886a8618c33..adf677246d86 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -716,17 +716,17 @@ next: if (pte_write(pteval)) writable = true; } - if (likely(writable)) { - if (likely(referenced)) { - result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 1; - } - } else { - result = SCAN_PAGE_RO; - } + if (unlikely(!writable)) { + result = SCAN_PAGE_RO; + } else if (unlikely(!referenced)) { + result = SCAN_LACK_REFERENCED_PAGE; + } else { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 1; + } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, From 8fd5eda4c7268b62f46b2ed76b96f9e41e128a47 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:49 -0700 Subject: [PATCH 1088/1379] mm/huge_memory.c: remove unnecessary local variable ret2 There is no need to use a new local variable ret2 to get the return value of handle_userfault(). Use ret directly to make code more succinct. Link: https://lkml.kernel.org/r/20210210072409.60587-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Andrew Morton Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae907a9c2050..bff92dea5ab3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -624,14 +624,12 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { - vm_fault_t ret2; - spin_unlock(vmf->ptl); put_page(page); pte_free(vma->vm_mm, pgtable); - ret2 = handle_userfault(vmf, VM_UFFD_MISSING); - VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); - return ret2; + ret = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; } entry = mk_huge_pmd(page, vma->vm_page_prot); From 71f9e58eb408db423e0e27b55e0de66fb3590296 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:52 -0700 Subject: [PATCH 1089/1379] mm/huge_memory.c: rework the function vma_adjust_trans_huge() Patch series "Some cleanups for huge_memory", v3. This series contains cleanups to rework some function logics to make it more readable, use helper function and so on. More details can be found in the respective changelogs. This patch (of 6): The current implementation of vma_adjust_trans_huge() contains some duplicated codes. Add helper function to get rid of these codes to make it more succinct. Link: https://lkml.kernel.org/r/20210318122722.13135-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210318122722.13135-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Peter Xu Cc: Zi Yan Cc: Matthew Wilcox Cc: William Kucharski Cc: Vlastimil Babka Cc: Peter Xu Cc: yuleixzhang Cc: Michel Lespinasse Cc: Aneesh Kumar K.V Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Yang Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bff92dea5ab3..ae16a82da823 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2301,44 +2301,38 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, __split_huge_pmd(vma, pmd, address, freeze, page); } +static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address) +{ + /* + * If the new address isn't hpage aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) && + range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE), + ALIGN(address, HPAGE_PMD_SIZE))) + split_huge_pmd_address(vma, address, false, NULL); +} + void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { - /* - * If the new start address isn't hpage aligned and it could - * previously contain an hugepage: check if we need to split - * an huge pmd. - */ - if (start & ~HPAGE_PMD_MASK && - (start & HPAGE_PMD_MASK) >= vma->vm_start && - (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, start, false, NULL); + /* Check if we need to split start first. */ + split_huge_pmd_if_needed(vma, start); + + /* Check if we need to split end next. */ + split_huge_pmd_if_needed(vma, end); /* - * If the new end address isn't hpage aligned and it could - * previously contain an hugepage: check if we need to split - * an huge pmd. - */ - if (end & ~HPAGE_PMD_MASK && - (end & HPAGE_PMD_MASK) >= vma->vm_start && - (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, end, false, NULL); - - /* - * If we're also updating the vma->vm_next->vm_start, if the new - * vm_next->vm_start isn't hpage aligned and it could previously - * contain an hugepage: check if we need to split an huge pmd. + * If we're also updating the vma->vm_next->vm_start, + * check if we need to split it. */ if (adjust_next > 0) { struct vm_area_struct *next = vma->vm_next; unsigned long nstart = next->vm_start; nstart += adjust_next; - if (nstart & ~HPAGE_PMD_MASK && - (nstart & HPAGE_PMD_MASK) >= next->vm_start && - (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_pmd_address(next, nstart, false, NULL); + split_huge_pmd_if_needed(next, nstart); } } From aaa9705b4af3608fd759c9ba8d0003f7a83fb335 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:55 -0700 Subject: [PATCH 1090/1379] mm/huge_memory.c: make get_huge_zero_page() return bool It's guaranteed that huge_zero_page will not be NULL if huge_zero_refcount is increased successfully. When READ_ONCE(huge_zero_page) is returned, there must be a huge_zero_page and it can be replaced with returning 'true' when we do not care about the value of huge_zero_page. We can thus make it return bool to save READ_ONCE cpu cycles as the return value is just used to check if huge_zero_page exists. Link: https://lkml.kernel.org/r/20210318122722.13135-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae16a82da823..01b96c638e73 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -77,18 +77,18 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } -static struct page *get_huge_zero_page(void) +static bool get_huge_zero_page(void) { struct page *zero_page; retry: if (likely(atomic_inc_not_zero(&huge_zero_refcount))) - return READ_ONCE(huge_zero_page); + return true; zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, HPAGE_PMD_ORDER); if (!zero_page) { count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); - return NULL; + return false; } count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); @@ -101,7 +101,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); - return READ_ONCE(huge_zero_page); + return true; } static void put_huge_zero_page(void) From 6beb5e8bba972e15276a27555f2f4b834b248742 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:33:59 -0700 Subject: [PATCH 1091/1379] mm/huge_memory.c: rework the function do_huge_pmd_numa_page() slightly The current code that checks if migrating misplaced transhuge page is needed is pretty hard to follow. Rework it and add a comment to make its logic more clear and improve readability. Link: https://lkml.kernel.org/r/20210318122722.13135-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 01b96c638e73..23964adf5db2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1462,12 +1462,6 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ page_locked = trylock_page(page); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == NUMA_NO_NODE) { - /* If the page was locked, there are no parallel migrations */ - if (page_locked) - goto clear_pmdnuma; - } - /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { page_nid = NUMA_NO_NODE; @@ -1476,6 +1470,11 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) spin_unlock(vmf->ptl); put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; + } else if (target_nid == NUMA_NO_NODE) { + /* There are no parallel migrations and page is in the right + * node. Clear the numa hinting info in this pmd. + */ + goto clear_pmdnuma; } /* From f6004e73ae955d0a44d66a5709ec5f98c07c733f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:02 -0700 Subject: [PATCH 1092/1379] mm/huge_memory.c: remove redundant PageCompound() check The !PageCompound() check limits the page must be head or tail while !PageHead() further limits it to page head only. So !PageHead() check is equivalent here. Link: https://lkml.kernel.org/r/20210318122722.13135-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 23964adf5db2..52acc3954afd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1291,7 +1291,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) } page = pmd_page(orig_pmd); - VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); + VM_BUG_ON_PAGE(!PageHead(page), page); /* Lock page for reuse_swap_page() */ if (!trylock_page(page)) { From d4afd60c24f87b6275b12ec3d67d8c2ad78cb075 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:05 -0700 Subject: [PATCH 1093/1379] mm/huge_memory.c: remove unused macro TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG Commit 4958e4d86ecb ("mm: thp: remove debug_cow switch") forgot to remove TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG macro. Remove it here. Link: https://lkml.kernel.org/r/20210318122722.13135-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ba973efcd369..9626fda5efce 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -87,9 +87,6 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, -#ifdef CONFIG_DEBUG_VM - TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, -#endif }; struct kobject; From a44f89dc6c5f8ba70240b81a570260d29d04bcb0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:08 -0700 Subject: [PATCH 1094/1379] mm/huge_memory.c: use helper function migration_entry_to_page() It's more recommended to use helper function migration_entry_to_page() to get the page via migration entry. We can also enjoy the PageLocked() check there. Link: https://lkml.kernel.org/r/20210318122722.13135-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Peter Xu Cc: Aneesh Kumar K.V Cc: Matthew Wilcox Cc: Michel Lespinasse Cc: Ralph Campbell Cc: Thomas Hellstrm (Intel) Cc: Vlastimil Babka Cc: Wei Yang Cc: William Kucharski Cc: Yang Shi Cc: yuleixzhang Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 52acc3954afd..9b31ef84bf7e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1693,7 +1693,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(!is_pmd_migration_entry(orig_pmd)); entry = pmd_to_swp_entry(orig_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); flush_needed = 0; } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); @@ -2101,7 +2101,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, swp_entry_t entry; entry = pmd_to_swp_entry(old_pmd); - page = pfn_to_page(swp_offset(entry)); + page = migration_entry_to_page(entry); write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); From 89dc6a9682919dbd64213c630a71eedaa021d7e5 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Tue, 4 May 2021 18:34:12 -0700 Subject: [PATCH 1095/1379] mm/khugepaged.c: replace barrier() with READ_ONCE() for a selective variable READ_ONCE() is more selective and lightweight. It is more appropriate that using a READ_ONCE() for the certain variable to prevent the compiler from reordering. Link: https://lkml.kernel.org/r/20210323092730.247583-1-yanfei.xu@windriver.com Signed-off-by: Yanfei Xu Acked-by: Kirill A. Shutemov Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index adf677246d86..a40a29414762 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2202,11 +2202,9 @@ static void khugepaged_do_scan(void) { struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = khugepaged_pages_to_scan; + unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; - barrier(); /* write khugepaged_pages_to_scan to local stack */ - lru_add_drain_all(); while (progress < pages) { From fef792a4fdb9b2d9d3d5c36aaa85f768f456a4d7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:15 -0700 Subject: [PATCH 1096/1379] khugepaged: use helper function range_in_vma() in collapse_pte_mapped_thp() Patch series "Cleanup for khugepaged". This series contains cleanups to remove unnecessary out label and meaningless !pte_present() check. Also use helper function to simplify the code. More details can be found in the respective changelogs. This patch (of 3): We could use helper function range_in_vma() to check whether the desired range is inside the vma to simplify the code. Link: https://lkml.kernel.org/r/20210325135647.64106-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210325135647.64106-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a40a29414762..beaf06123b4b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1446,7 +1446,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) int i; if (!vma || !vma->vm_file || - vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) + !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return; /* From 18d24a7cd9d3f35cfa8bed32a921a94159c78df0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:17 -0700 Subject: [PATCH 1097/1379] khugepaged: remove unnecessary out label in collapse_huge_page() The out label here is unneeded because it just goes to out_up_write label. Remove it to make code more concise. Link: https://lkml.kernel.org/r/20210325135647.64106-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index beaf06123b4b..276ac0099e19 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm, mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, &vma); if (result) - goto out; + goto out_up_write; /* check if the pmd is still valid */ if (mm_find_pmd(mm, address) != pmd) - goto out; + goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); result = SCAN_FAIL; - goto out; + goto out_up_write; } /* @@ -1215,8 +1215,6 @@ out_nolock: mem_cgroup_uncharge(*hpage); trace_mm_collapse_huge_page(mm, isolated, result); return; -out: - goto out_up_write; } static int khugepaged_scan_pmd(struct mm_struct *mm, From 75f83783bfdf2ddb3ffbf79ba44d506fb5b5548f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:20 -0700 Subject: [PATCH 1098/1379] khugepaged: remove meaningless !pte_present() check in khugepaged_scan_pmd() We know it must meet the !is_swap_pte() and !pte_none() condition if we reach here. Since !is_swap_pte() indicates pte_none() or pte_present() is met, it's guaranteed that pte must be present here. Link: https://lkml.kernel.org/r/20210325135647.64106-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zi Yan Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 276ac0099e19..a03569eda183 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1271,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } } - if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; - goto out_unmap; - } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small From fa6c02315f745f00b62c634b220c3fb5c3310258 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 4 May 2021 18:34:23 -0700 Subject: [PATCH 1099/1379] mm: huge_memory: a new debugfs interface for splitting THP tests We did not have a direct user interface of splitting the compound page backing a THP and there is no need unless we want to expose the THP implementation details to users. Make /split_huge_pages accept a new command to do that. By writing ",," to /split_huge_pages, THPs within the given virtual address range from the process with the given pid are split. It is used to test split_huge_page function. In addition, a selftest program is added to tools/testing/selftests/vm to utilize the interface by splitting PMD THPs and PTE-mapped THPs. This does not change the old behavior, i.e., writing 1 to the interface to split all THPs in the system. Link: https://lkml.kernel.org/r/20210331235309.332292-1-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: David Rientjes Cc: John Hubbard Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mika Penttila Cc: Sandipan Das Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 157 ++++++++- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 1 + .../selftests/vm/split_huge_page_test.c | 318 ++++++++++++++++++ 4 files changed, 468 insertions(+), 9 deletions(-) create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9b31ef84bf7e..58be9fc8253d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -2915,16 +2916,14 @@ static struct shrinker deferred_split_shrinker = { }; #ifdef CONFIG_DEBUG_FS -static int split_huge_pages_set(void *data, u64 val) +static void split_huge_pages_all(void) { struct zone *zone; struct page *page; unsigned long pfn, max_zone_pfn; unsigned long total = 0, split = 0; - if (val != 1) - return -EINVAL; - + pr_debug("Split all THPs\n"); for_each_populated_zone(zone) { max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { @@ -2948,15 +2947,155 @@ static int split_huge_pages_set(void *data, u64 val) unlock_page(page); next: put_page(page); + cond_resched(); } } - pr_info("%lu of %lu THP split\n", split, total); - - return 0; + pr_debug("%lu of %lu THP split\n", split, total); } -DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, - "%llu\n"); + +static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma) +{ + return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) || + is_vm_hugetlb_page(vma); +} + +static int split_huge_pages_pid(int pid, unsigned long vaddr_start, + unsigned long vaddr_end) +{ + int ret = 0; + struct task_struct *task; + struct mm_struct *mm; + unsigned long total = 0, split = 0; + unsigned long addr; + + vaddr_start &= PAGE_MASK; + vaddr_end &= PAGE_MASK; + + /* Find the task_struct from pid */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_task_struct(task); + rcu_read_unlock(); + + /* Find the mm_struct */ + mm = get_task_mm(task); + put_task_struct(task); + + if (!mm) { + ret = -EINVAL; + goto out; + } + + pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n", + pid, vaddr_start, vaddr_end); + + mmap_read_lock(mm); + /* + * always increase addr by PAGE_SIZE, since we could have a PTE page + * table filled with PTE-mapped THPs, each of which is distinct. + */ + for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { + struct vm_area_struct *vma = find_vma(mm, addr); + unsigned int follflags; + struct page *page; + + if (!vma || addr < vma->vm_start) + break; + + /* skip special VMA and hugetlb VMA */ + if (vma_not_suitable_for_thp_split(vma)) { + addr = vma->vm_end; + continue; + } + + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); + + if (IS_ERR(page)) + continue; + if (!page) + continue; + + if (!is_transparent_hugepage(page)) + goto next; + + total++; + if (!can_split_huge_page(compound_head(page), NULL)) + goto next; + + if (!trylock_page(page)) + goto next; + + if (!split_huge_page(page)) + split++; + + unlock_page(page); +next: + put_page(page); + cond_resched(); + } + mmap_read_unlock(mm); + mmput(mm); + + pr_debug("%lu of %lu THP split\n", split, total); + +out: + return ret; +} + +#define MAX_INPUT_BUF_SZ 255 + +static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppops) +{ + static DEFINE_MUTEX(split_debug_mutex); + ssize_t ret; + char input_buf[MAX_INPUT_BUF_SZ]; /* hold pid, start_vaddr, end_vaddr */ + int pid; + unsigned long vaddr_start, vaddr_end; + + ret = mutex_lock_interruptible(&split_debug_mutex); + if (ret) + return ret; + + ret = -EFAULT; + + memset(input_buf, 0, MAX_INPUT_BUF_SZ); + if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ))) + goto out; + + input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; + ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); + if (ret == 1 && pid == 1) { + split_huge_pages_all(); + ret = strlen(input_buf); + goto out; + } else if (ret != 3) { + ret = -EINVAL; + goto out; + } + + ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end); + if (!ret) + ret = strlen(input_buf); +out: + mutex_unlock(&split_debug_mutex); + return ret; + +} + +static const struct file_operations split_huge_pages_fops = { + .owner = THIS_MODULE, + .write = split_huge_pages_write, + .llseek = no_llseek, +}; static int __init split_huge_pages_debugfs(void) { diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 9a35c3f6a557..1f651e85ed60 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -22,3 +22,4 @@ map_fixed_noreplace write_to_hugetlbfs hmm-tests local_config.* +split_huge_page_test diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 8b0cd421ebd3..73e1cc96d7c2 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -42,6 +42,7 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd +TEST_GEN_FILES += split_huge_page_test ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) ../x86/trivial_32bit_program.c -m32) diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c new file mode 100644 index 000000000000..2c0c18e60c57 --- /dev/null +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual + * address range in a process via /split_huge_pages interface. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +uint64_t pagesize; +unsigned int pageshift; +uint64_t pmd_pagesize; + +#define PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define SMAP_PATH "/proc/self/smaps" +#define INPUT_MAX 80 + +#define PFN_MASK ((1UL<<55)-1) +#define KPF_THP (1UL<<22) + +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) +{ + uint64_t paddr; + uint64_t page_flags; + + if (pagemap_file) { + pread(pagemap_file, &paddr, sizeof(paddr), + ((long)vaddr >> pageshift) * sizeof(paddr)); + + if (kpageflags_file) { + pread(kpageflags_file, &page_flags, sizeof(page_flags), + (paddr & PFN_MASK) * sizeof(page_flags)); + + return !!(page_flags & KPF_THP); + } + } + return 0; +} + + +static uint64_t read_pmd_pagesize(void) +{ + int fd; + char buf[20]; + ssize_t num_read; + + fd = open(PMD_SIZE_PATH, O_RDONLY); + if (fd == -1) { + perror("Open hpage_pmd_size failed"); + exit(EXIT_FAILURE); + } + num_read = read(fd, buf, 19); + if (num_read < 1) { + close(fd); + perror("Read hpage_pmd_size failed"); + exit(EXIT_FAILURE); + } + buf[num_read] = '\0'; + close(fd); + + return strtoul(buf, NULL, 10); +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end) +{ + char input[INPUT_MAX]; + int ret; + + ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start, + vaddr_end); + if (ret >= INPUT_MAX) { + printf("%s: Debugfs input is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { + perror(SPLIT_DEBUGFS); + exit(EXIT_FAILURE); + } +} + +#define MAX_LINE_LENGTH 500 + +static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) +{ + while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +static uint64_t check_huge(void *addr) +{ + uint64_t thp = 0; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + + fp = fopen(SMAP_PATH, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, SMAP_PATH); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer)) + goto err_out; + + /* + * Fetch the AnonHugePages: in the same block and check the number of + * hugepages. + */ + if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + goto err_out; + + if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) { + printf("Reading smap error\n"); + exit(EXIT_FAILURE); + } + +err_out: + fclose(fp); + return thp; +} + +void split_pmd_thp(void) +{ + char *one_page; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + + one_page = memalign(pmd_pagesize, len); + + if (!one_page) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + thp_size = check_huge(one_page); + if (!thp_size) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + + thp_size = check_huge(one_page); + if (thp_size) { + printf("Still %ld kB AnonHugePages not split\n", thp_size); + exit(EXIT_FAILURE); + } + + printf("Split huge pages successful\n"); + free(one_page); +} + +void split_pte_mapped_thp(void) +{ + char *one_page, *pte_mapped, *pte_mapped2; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + const char *pagemap_template = "/proc/%d/pagemap"; + const char *kpageflags_proc = "/proc/kpageflags"; + char pagemap_proc[255]; + int pagemap_fd; + int kpageflags_fd; + + if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { + perror("get pagemap proc error"); + exit(EXIT_FAILURE); + } + pagemap_fd = open(pagemap_proc, O_RDONLY); + + if (pagemap_fd == -1) { + perror("read pagemap:"); + exit(EXIT_FAILURE); + } + + kpageflags_fd = open(kpageflags_proc, O_RDONLY); + + if (kpageflags_fd == -1) { + perror("read kpageflags:"); + exit(EXIT_FAILURE); + } + + one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + thp_size = check_huge(one_page); + if (!thp_size) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* remap the first pagesize of first THP */ + pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); + + /* remap the Nth pagesize of Nth THP */ + for (i = 1; i < 4; i++) { + pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, + pagesize, pagesize, + MREMAP_MAYMOVE|MREMAP_FIXED, + pte_mapped + pagesize * i); + if (pte_mapped2 == (char *)-1) { + perror("mremap failed"); + exit(EXIT_FAILURE); + } + } + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + + if (thp_size != 4) { + printf("Some THPs are missing during mremap\n"); + exit(EXIT_FAILURE); + } + + /* split all remapped THPs */ + write_debugfs(getpid(), (uint64_t)pte_mapped, + (uint64_t)pte_mapped + pagesize * 4); + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) { + if (pte_mapped[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + } + + if (thp_size) { + printf("Still %ld THPs not split\n", thp_size); + exit(EXIT_FAILURE); + } + + printf("Split PTE-mapped huge pages successful\n"); + munmap(one_page, len); + close(pagemap_fd); + close(kpageflags_fd); +} + +int main(int argc, char **argv) +{ + if (geteuid() != 0) { + printf("Please run the benchmark as root\n"); + exit(EXIT_FAILURE); + } + + pagesize = getpagesize(); + pageshift = ffs(pagesize) - 1; + pmd_pagesize = read_pmd_pagesize(); + + split_pmd_thp(); + split_pte_mapped_thp(); + + return 0; +} From fbe37501b2526a71d82b898671260524279c6765 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 4 May 2021 18:34:26 -0700 Subject: [PATCH 1100/1379] mm: huge_memory: debugfs for file-backed THP split Further extend /split_huge_pages to accept ",," for file-backed THP split tests since tmpfs may have file backed by THP that mapped nowhere. Update selftest program to test file-backed THP split too. Link: https://lkml.kernel.org/r/20210331235309.332292-2-zi.yan@sent.com Signed-off-by: Zi Yan Suggested-by: Kirill A. Shutemov Reviewed-by: Yang Shi Cc: "Kirill A . Shutemov" Cc: Shuah Khan Cc: John Hubbard Cc: Sandipan Das Cc: David Hildenbrand Cc: Mika Penttila Cc: David Rientjes Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 90 ++++++++++++++++++- .../selftests/vm/split_huge_page_test.c | 82 +++++++++++++++-- 2 files changed, 166 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 58be9fc8253d..b3788683f7d3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3050,6 +3050,65 @@ out: return ret; } +static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, + pgoff_t off_end) +{ + struct filename *file; + struct file *candidate; + struct address_space *mapping; + int ret = -EINVAL; + pgoff_t index; + int nr_pages = 1; + unsigned long total = 0, split = 0; + + file = getname_kernel(file_path); + if (IS_ERR(file)) + return ret; + + candidate = file_open_name(file, O_RDONLY, 0); + if (IS_ERR(candidate)) + goto out; + + pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n", + file_path, off_start, off_end); + + mapping = candidate->f_mapping; + + for (index = off_start; index < off_end; index += nr_pages) { + struct page *fpage = pagecache_get_page(mapping, index, + FGP_ENTRY | FGP_HEAD, 0); + + nr_pages = 1; + if (xa_is_value(fpage) || !fpage) + continue; + + if (!is_transparent_hugepage(fpage)) + goto next; + + total++; + nr_pages = thp_nr_pages(fpage); + + if (!trylock_page(fpage)) + goto next; + + if (!split_huge_page(fpage)) + split++; + + unlock_page(fpage); +next: + put_page(fpage); + cond_resched(); + } + + filp_close(candidate, NULL); + ret = 0; + + pr_debug("%lu of %lu file-backed THP split\n", split, total); +out: + putname(file); + return ret; +} + #define MAX_INPUT_BUF_SZ 255 static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, @@ -3057,7 +3116,8 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, { static DEFINE_MUTEX(split_debug_mutex); ssize_t ret; - char input_buf[MAX_INPUT_BUF_SZ]; /* hold pid, start_vaddr, end_vaddr */ + /* hold pid, start_vaddr, end_vaddr or file_path, off_start, off_end */ + char input_buf[MAX_INPUT_BUF_SZ]; int pid; unsigned long vaddr_start, vaddr_end; @@ -3072,6 +3132,34 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, goto out; input_buf[MAX_INPUT_BUF_SZ - 1] = '\0'; + + if (input_buf[0] == '/') { + char *tok; + char *buf = input_buf; + char file_path[MAX_INPUT_BUF_SZ]; + pgoff_t off_start = 0, off_end = 0; + size_t input_len = strlen(input_buf); + + tok = strsep(&buf, ","); + if (tok) { + strncpy(file_path, tok, MAX_INPUT_BUF_SZ); + } else { + ret = -EINVAL; + goto out; + } + + ret = sscanf(buf, "0x%lx,0x%lx", &off_start, &off_end); + if (ret != 2) { + ret = -EINVAL; + goto out; + } + ret = split_huge_pages_in_file(file_path, off_start, off_end); + if (!ret) + ret = input_len; + + goto out; + } + ret = sscanf(input_buf, "%d,0x%lx,0x%lx", &pid, &vaddr_start, &vaddr_end); if (ret == 1 && pid == 1) { split_huge_pages_all(); diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 2c0c18e60c57..1af16d2c2a0a 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -7,11 +7,13 @@ #define _GNU_SOURCE #include #include +#include #include #include #include #include #include +#include #include #include @@ -24,6 +26,9 @@ uint64_t pmd_pagesize; #define SMAP_PATH "/proc/self/smaps" #define INPUT_MAX 80 +#define PID_FMT "%d,0x%lx,0x%lx" +#define PATH_FMT "%s,0x%lx,0x%lx" + #define PFN_MASK ((1UL<<55)-1) #define KPF_THP (1UL<<22) @@ -87,13 +92,16 @@ static int write_file(const char *path, const char *buf, size_t buflen) return (unsigned int) numwritten; } -static void write_debugfs(int pid, uint64_t vaddr_start, uint64_t vaddr_end) +static void write_debugfs(const char *fmt, ...) { char input[INPUT_MAX]; int ret; + va_list argp; + + va_start(argp, fmt); + ret = vsnprintf(input, INPUT_MAX, fmt, argp); + va_end(argp); - ret = snprintf(input, INPUT_MAX, "%d,0x%lx,0x%lx", pid, vaddr_start, - vaddr_end); if (ret >= INPUT_MAX) { printf("%s: Debugfs input is too long\n", __func__); exit(EXIT_FAILURE); @@ -183,7 +191,8 @@ void split_pmd_thp(void) } /* split all THPs */ - write_debugfs(getpid(), (uint64_t)one_page, (uint64_t)one_page + len); + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len); for (i = 0; i < len; i++) if (one_page[i] != (char)i) { @@ -274,7 +283,7 @@ void split_pte_mapped_thp(void) } /* split all remapped THPs */ - write_debugfs(getpid(), (uint64_t)pte_mapped, + write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, (uint64_t)pte_mapped + pagesize * 4); /* smap does not show THPs after mremap, use kpageflags instead */ @@ -300,6 +309,68 @@ void split_pte_mapped_thp(void) close(kpageflags_fd); } +void split_file_backed_thp(void) +{ + int status; + int fd; + ssize_t num_written; + char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; + const char *tmpfs_loc = mkdtemp(tmpfs_template); + char testfile[INPUT_MAX]; + uint64_t pgoff_start = 0, pgoff_end = 1024; + + printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); + + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); + + if (status) { + printf("Unable to create a tmpfs for testing\n"); + exit(EXIT_FAILURE); + } + + status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); + if (status >= INPUT_MAX) { + printf("Fail to create file-backed THP split testing file\n"); + goto cleanup; + } + + fd = open(testfile, O_CREAT|O_WRONLY); + if (fd == -1) { + perror("Cannot open testing file\n"); + goto cleanup; + } + + /* write something to the file, so a file-backed THP can be allocated */ + num_written = write(fd, tmpfs_loc, sizeof(tmpfs_loc)); + close(fd); + + if (num_written < 1) { + printf("Fail to write data to testing file\n"); + goto cleanup; + } + + /* split the file-backed THP */ + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); + + status = unlink(testfile); + if (status) + perror("Cannot remove testing file\n"); + +cleanup: + status = umount(tmpfs_loc); + if (status) { + printf("Unable to umount %s\n", tmpfs_loc); + exit(EXIT_FAILURE); + } + status = rmdir(tmpfs_loc); + if (status) { + perror("cannot remove tmp dir"); + exit(EXIT_FAILURE); + } + + printf("file-backed THP split test done, please check dmesg for more information\n"); +} + int main(int argc, char **argv) { if (geteuid() != 0) { @@ -313,6 +384,7 @@ int main(int argc, char **argv) split_pmd_thp(); split_pte_mapped_thp(); + split_file_backed_thp(); return 0; } From f84df0b7f1b603f6c99670bdf2f908f0b6a5ed59 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:30 -0700 Subject: [PATCH 1101/1379] mm/hugeltb: remove redundant VM_BUG_ON() in region_add() Patch series "Cleanup and fixup for hugetlb", v2. This series contains cleanups to remove redundant VM_BUG_ON() and simplify the return code. Also this handles the error case in hugetlb_fix_reserve_counts() correctly. More details can be found in the respective changelogs. This patch (of 5): The same VM_BUG_ON() check is already done in the callee. Remove this extra one to simplify the code slightly. Link: https://lkml.kernel.org/r/20210410072348.20437-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210410072348.20437-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Feilong Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c8f0a38588ba..ce9e18d45f52 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -553,7 +553,6 @@ retry: resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); - VM_BUG_ON(add < 0); return add; } From bf3d12b9f7f9e7c4ae4aa94c6c81400d3bf688e6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:32 -0700 Subject: [PATCH 1102/1379] mm/hugeltb: simplify the return code of __vma_reservation_common() It's guaranteed that the vma is associated with a resv_map, i.e. either VM_MAYSHARE or HPAGE_RESV_OWNER, when the code reaches here or we would have returned via !resv check above. So it's unneeded to check whether HPAGE_RESV_OWNER is set here. Simplify the return code to make it more clear. Link: https://lkml.kernel.org/r/20210410072348.20437-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Feilong Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ce9e18d45f52..d91bd780ba98 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2174,27 +2174,26 @@ static long __vma_reservation_common(struct hstate *h, if (vma->vm_flags & VM_MAYSHARE) return ret; - else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { - /* - * In most cases, reserves always exist for private mappings. - * However, a file associated with mapping could have been - * hole punched or truncated after reserves were consumed. - * As subsequent fault on such a range will not use reserves. - * Subtle - The reserve map for private mappings has the - * opposite meaning than that of shared mappings. If NO - * entry is in the reserve map, it means a reservation exists. - * If an entry exists in the reserve map, it means the - * reservation has already been consumed. As a result, the - * return value of this routine is the opposite of the - * value returned from reserve map manipulation routines above. - */ - if (ret) - return 0; - else - return 1; - } - else - return ret < 0 ? ret : 0; + /* + * We know private mapping must have HPAGE_RESV_OWNER set. + * + * In most cases, reserves always exist for private mappings. + * However, a file associated with mapping could have been + * hole punched or truncated after reserves were consumed. + * As subsequent fault on such a range will not use reserves. + * Subtle - The reserve map for private mappings has the + * opposite meaning than that of shared mappings. If NO + * entry is in the reserve map, it means a reservation exists. + * If an entry exists in the reserve map, it means the + * reservation has already been consumed. As a result, the + * return value of this routine is the opposite of the + * value returned from reserve map manipulation routines above. + */ + if (ret > 0) + return 0; + if (ret == 0) + return 1; + return ret; } static long vma_needs_reservation(struct hstate *h, From dddf31a49a0eb858bba58876c3c67dd8ea81b800 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:35 -0700 Subject: [PATCH 1103/1379] mm/hugeltb: clarify (chg - freed) won't go negative in hugetlb_unreserve_pages() The resv_map could be NULL since this routine can be called in the evict inode path for all hugetlbfs inodes and we will have chg = 0 in this case. But (chg - freed) won't go negative as Mike pointed out: "If resv_map is NULL, then no hugetlb pages can be allocated/associated with the file. As a result, remove_inode_hugepages will never find any huge pages associated with the inode and the passed value 'freed' will always be zero." Add a comment clarifying this to make it clear and also avoid confusion. Link: https://lkml.kernel.org/r/20210410072348.20437-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Feilong Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d91bd780ba98..b1525ae1bf1f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5267,6 +5267,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. + * + * Note that !resv_map implies freed == 0. So (chg - freed) + * won't go negative. */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); From da56388c4397878a65b74f7fe97760f5aa7d316b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:38 -0700 Subject: [PATCH 1104/1379] mm/hugeltb: handle the error case in hugetlb_fix_reserve_counts() A rare out of memory error would prevent removal of the reserve map region for a page. hugetlb_fix_reserve_counts() handles this rare case to avoid dangling with incorrect counts. Unfortunately, hugepage_subpool_get_pages and hugetlb_acct_memory could possibly fail too. We should correctly handle these cases. Link: https://lkml.kernel.org/r/20210410072348.20437-5-linmiaohe@huawei.com Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages") Signed-off-by: Miaohe Lin Cc: Feilong Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b1525ae1bf1f..8ea8e3d2f814 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -742,13 +742,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; + bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (rsv_adjust) { + if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); - hugetlb_acct_memory(h, 1); + if (!hugetlb_acct_memory(h, 1)) + reserved = true; + } else if (!rsv_adjust) { + reserved = true; } + + if (!reserved) + pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* From 15b8365363215da82cb019d3de0eb781c9e82564 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:34:41 -0700 Subject: [PATCH 1105/1379] mm/hugetlb: remove unused variable pseudo_vma in remove_inode_hugepages() The local variable pseudo_vma is not used anymore. Link: https://lkml.kernel.org/r/20210410072348.20437-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Feilong Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d81f52b87bd7..a2a42335e8fd 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -463,14 +463,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> huge_page_shift(h); const pgoff_t end = lend >> huge_page_shift(h); - struct vm_area_struct pseudo_vma; struct pagevec pvec; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - vma_init(&pseudo_vma, current->mm); - pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec); next = start; while (next < end) { From 0ef7dcac998fefc4767b7f10eb3b6df150c38a4e Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:44 -0700 Subject: [PATCH 1106/1379] mm/cma: change cma mutex to irq safe spinlock Patch series "make hugetlb put_page safe for all calling contexts", v5. This effort is the result a recent bug report [1]. Syzbot found a potential deadlock in the hugetlb put_page/free_huge_page_path. WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected Since the free_huge_page_path already has code to 'hand off' page free requests to a workqueue, a suggestion was proposed to make the in_irq() detection accurate by always enabling PREEMPT_COUNT [2]. The outcome of that discussion was that the hugetlb put_page path (free_huge_page) path should be properly fixed and safe for all calling contexts. [1] https://lore.kernel.org/linux-mm/000000000000f1c03b05bc43aadc@google.com/ [2] http://lkml.kernel.org/r/20210311021321.127500-1-mike.kravetz@oracle.com This patch (of 8): cma_release is currently a sleepable operatation because the bitmap manipulation is protected by cma->lock mutex. Hugetlb code which relies on cma_release for CMA backed (giga) hugetlb pages, however, needs to be irq safe. The lock doesn't protect any sleepable operation so it can be changed to a (irq aware) spin lock. The bitmap processing should be quite fast in typical case but if cma sizes grow to TB then we will likely need to replace the lock by a more optimized bitmap implementation. Link: https://lkml.kernel.org/r/20210409205254.242291-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20210409205254.242291-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Acked-by: Roman Gushchin Cc: Shakeel Butt Cc: Oscar Salvador Cc: Muchun Song Cc: David Rientjes Cc: Miaohe Lin Cc: Peter Zijlstra Cc: Matthew Wilcox Cc: HORIGUCHI NAOYA Cc: "Aneesh Kumar K . V" Cc: Waiman Long Cc: Peter Xu Cc: Mina Almasry Cc: Hillf Danton Cc: Joonsoo Kim Cc: Barry Song Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 18 +++++++++--------- mm/cma.h | 2 +- mm/cma_debug.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 54eee2119822..acd6991f77a0 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -83,13 +82,14 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, unsigned int count) { unsigned long bitmap_no, bitmap_count; + unsigned long flags; bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; bitmap_count = cma_bitmap_pages_to_bits(cma, count); - mutex_lock(&cma->lock); + spin_lock_irqsave(&cma->lock, flags); bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); - mutex_unlock(&cma->lock); + spin_unlock_irqrestore(&cma->lock, flags); } static void __init cma_activate_area(struct cma *cma) @@ -118,7 +118,7 @@ static void __init cma_activate_area(struct cma *cma) pfn += pageblock_nr_pages) init_cma_reserved_pageblock(pfn_to_page(pfn)); - mutex_init(&cma->lock); + spin_lock_init(&cma->lock); #ifdef CONFIG_CMA_DEBUGFS INIT_HLIST_HEAD(&cma->mem_head); @@ -392,7 +392,7 @@ static void cma_debug_show_areas(struct cma *cma) unsigned long nr_part, nr_total = 0; unsigned long nbits = cma_bitmap_maxno(cma); - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); pr_info("number of available pages: "); for (;;) { next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); @@ -407,7 +407,7 @@ static void cma_debug_show_areas(struct cma *cma) start = next_zero_bit + nr_zero; } pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); } #else static inline void cma_debug_show_areas(struct cma *cma) { } @@ -452,12 +452,12 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, return NULL; for (;;) { - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap, bitmap_maxno, start, bitmap_count, mask, offset); if (bitmap_no >= bitmap_maxno) { - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); break; } bitmap_set(cma->bitmap, bitmap_no, bitmap_count); @@ -466,7 +466,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, * our exclusive use. If the migration fails we will take the * lock again and unmark it. */ - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, diff --git a/mm/cma.h b/mm/cma.h index 42ae082cb067..c46c5050aaa9 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -9,7 +9,7 @@ struct cma { unsigned long count; unsigned long *bitmap; unsigned int order_per_bit; /* Order of pages represented by one bit */ - struct mutex lock; + spinlock_t lock; #ifdef CONFIG_CMA_DEBUGFS struct hlist_head mem_head; spinlock_t mem_head_lock; diff --git a/mm/cma_debug.c b/mm/cma_debug.c index d5bf8aa34fdc..2e7704955f4f 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -36,10 +36,10 @@ static int cma_used_get(void *data, u64 *val) struct cma *cma = data; unsigned long used; - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); /* pages counter is smaller than sizeof(int) */ used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma)); - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); *val = (u64)used << cma->order_per_bit; return 0; @@ -53,7 +53,7 @@ static int cma_maxchunk_get(void *data, u64 *val) unsigned long start, end = 0; unsigned long bitmap_maxno = cma_bitmap_maxno(cma); - mutex_lock(&cma->lock); + spin_lock_irq(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); if (start >= bitmap_maxno) @@ -61,7 +61,7 @@ static int cma_maxchunk_get(void *data, u64 *val) end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); } - mutex_unlock(&cma->lock); + spin_unlock_irq(&cma->lock); *val = (u64)maxchunk << cma->order_per_bit; return 0; From 262443c0421e832e5312d2b14e0a2640a9f064d7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:48 -0700 Subject: [PATCH 1107/1379] hugetlb: no need to drop hugetlb_lock to call cma_release Now that cma_release is non-blocking and irq safe, there is no need to drop hugetlb_lock before calling. Link: https://lkml.kernel.org/r/20210409205254.242291-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: David Hildenbrand Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8ea8e3d2f814..a5636e1593c5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1355,14 +1355,8 @@ static void update_and_free_page(struct hstate *h, struct page *page) set_compound_page_dtor(page, NULL_COMPOUND_DTOR); set_page_refcounted(page); if (hstate_is_gigantic(h)) { - /* - * Temporarily drop the hugetlb_lock, because - * we might block in free_gigantic_page(). - */ - spin_unlock(&hugetlb_lock); destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); - spin_lock(&hugetlb_lock); } else { __free_pages(page, huge_page_order(h)); } From 2938396771c8fd0870b5284319f9e78b4b552a79 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:52 -0700 Subject: [PATCH 1108/1379] hugetlb: add per-hstate mutex to synchronize user adjustments The helper routine hstate_next_node_to_alloc accesses and modifies the hstate variable next_nid_to_alloc. The helper is used by the routines alloc_pool_huge_page and adjust_pool_surplus. adjust_pool_surplus is called with hugetlb_lock held. However, alloc_pool_huge_page can not be called with the hugetlb lock held as it will call the page allocator. Two instances of alloc_pool_huge_page could be run in parallel or alloc_pool_huge_page could run in parallel with adjust_pool_surplus which may result in the variable next_nid_to_alloc becoming invalid for the caller and pages being allocated on the wrong node. Both alloc_pool_huge_page and adjust_pool_surplus are only called from the routine set_max_huge_pages after boot. set_max_huge_pages is only called as the reusult of a user writing to the proc/sysfs nr_hugepages, or nr_hugepages_mempolicy file to adjust the number of hugetlb pages. It makes little sense to allow multiple adjustment to the number of hugetlb pages in parallel. Add a mutex to the hstate and use it to only allow one hugetlb page adjustment at a time. This will synchronize modifications to the next_nid_to_alloc variable. Link: https://lkml.kernel.org/r/20210409205254.242291-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0f5813522224..628639422c5d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -559,6 +559,7 @@ HPAGEFLAG(Freed, freed) #define HSTATE_NAME_LEN 32 /* Defines one hugetlb page size */ struct hstate { + struct mutex resize_lock; int next_nid_to_alloc; int next_nid_to_free; unsigned int order; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a5636e1593c5..067fd29a9d51 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2621,6 +2621,11 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, else return -ENOMEM; + /* + * resize_lock mutex prevents concurrent adjustments to number of + * pages in hstate via the proc/sysfs interfaces. + */ + mutex_lock(&h->resize_lock); spin_lock(&hugetlb_lock); /* @@ -2653,6 +2658,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { spin_unlock(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } @@ -2727,6 +2733,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, out: h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); @@ -3214,6 +3221,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order == 0); h = &hstates[hugetlb_max_hstate++]; + mutex_init(&h->resize_lock); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) From 6eb4e88a6d27022ea8aff424d47a0a5dfc9fcb34 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:55 -0700 Subject: [PATCH 1109/1379] hugetlb: create remove_hugetlb_page() to separate functionality The new remove_hugetlb_page() routine is designed to remove a hugetlb page from hugetlbfs processing. It will remove the page from the active or free list, update global counters and set the compound page destructor to NULL so that PageHuge() will return false for the 'page'. After this call, the 'page' can be treated as a normal compound page or a collection of base size pages. update_and_free_page no longer decrements h->nr_huge_pages{_node} as this is performed in remove_hugetlb_page. The only functionality performed by update_and_free_page is to free the base pages to the lower level allocators. update_and_free_page is typically called after remove_hugetlb_page. remove_hugetlb_page is to be called with the hugetlb_lock held. Creating this routine and separating functionality is in preparation for restructuring code to reduce lock hold times. This commit should not introduce any changes to functionality. Link: https://lkml.kernel.org/r/20210409205254.242291-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 65 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 067fd29a9d51..9a263a1c200e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1333,6 +1333,41 @@ static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } #endif +/* + * Remove hugetlb page from lists, and update dtor so that page appears + * as just a compound page. A reference is held on the page. + * + * Must be called with hugetlb lock held. + */ +static void remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return; + + list_del(&page->lru); + + if (HPageFreed(page)) { + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + } + if (adjust_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + } + + set_page_refcounted(page); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + + h->nr_huge_pages--; + h->nr_huge_pages_node[nid]--; +} + static void update_and_free_page(struct hstate *h, struct page *page) { int i; @@ -1341,8 +1376,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - h->nr_huge_pages--; - h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1350,10 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_private | 1 << PG_writeback); } - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); - set_page_refcounted(page); if (hstate_is_gigantic(h)) { destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); @@ -1421,15 +1450,12 @@ static void __free_huge_page(struct page *page) h->resv_huge_pages++; if (HPageTemporary(page)) { - list_del(&page->lru); - ClearHPageTemporary(page); + remove_hugetlb_page(h, page, false); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - list_del(&page->lru); + remove_hugetlb_page(h, page, true); update_and_free_page(h, page); - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); @@ -1714,13 +1740,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, struct page *page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[node]--; - if (acct_surplus) { - h->surplus_huge_pages--; - h->surplus_huge_pages_node[node]--; - } + remove_hugetlb_page(h, page, acct_surplus); update_and_free_page(h, page); ret = 1; break; @@ -1758,7 +1778,6 @@ retry: if (!page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); - int nid = page_to_nid(head); if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; @@ -1789,9 +1808,7 @@ retry: SetPageHWPoison(page); ClearPageHWPoison(head); } - list_del(&head->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + remove_hugetlb_page(h, page, false); h->max_huge_pages--; update_and_free_page(h, head); rc = 0; @@ -2558,10 +2575,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count, return; if (PageHighMem(page)) continue; - list_del(&page->lru); + remove_hugetlb_page(h, page, false); update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[page_to_nid(page)]--; } } } From 1121828a0c213caa55ddd5ee23ee78e99cbdd33e Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:34:59 -0700 Subject: [PATCH 1110/1379] hugetlb: call update_and_free_page without hugetlb_lock With the introduction of remove_hugetlb_page(), there is no need for update_and_free_page to hold the hugetlb lock. Change all callers to drop the lock before calling. With additional code modifications, this will allow loops which decrease the huge page pool to drop the hugetlb_lock with each page to reduce long hold times. The ugly unlock/lock cycle in free_pool_huge_page will be removed in a subsequent patch which restructures free_pool_huge_page. Link: https://lkml.kernel.org/r/20210409205254.242291-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Muchun Song Reviewed-by: Miaohe Lin Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a263a1c200e..5425936a4590 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1451,16 +1451,18 @@ static void __free_huge_page(struct page *page) if (HPageTemporary(page)) { remove_hugetlb_page(h, page, false); + spin_unlock(&hugetlb_lock); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_page(h, page, true); + spin_unlock(&hugetlb_lock); update_and_free_page(h, page); } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); + spin_unlock(&hugetlb_lock); } - spin_unlock(&hugetlb_lock); } /* @@ -1741,7 +1743,13 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, list_entry(h->hugepage_freelists[node].next, struct page, lru); remove_hugetlb_page(h, page, acct_surplus); + /* + * unlock/lock around update_and_free_page is temporary + * and will be removed with subsequent patch. + */ + spin_unlock(&hugetlb_lock); update_and_free_page(h, page); + spin_lock(&hugetlb_lock); ret = 1; break; } @@ -1810,8 +1818,9 @@ retry: } remove_hugetlb_page(h, page, false); h->max_huge_pages--; + spin_unlock(&hugetlb_lock); update_and_free_page(h, head); - rc = 0; + return 0; } out: spin_unlock(&hugetlb_lock); @@ -2563,22 +2572,34 @@ static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; + struct page *page, *next; + LIST_HEAD(page_list); if (hstate_is_gigantic(h)) return; + /* + * Collect pages to be freed on a list, and free after dropping lock + */ for_each_node_mask(i, *nodes_allowed) { - struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { if (count >= h->nr_huge_pages) - return; + goto out; if (PageHighMem(page)) continue; remove_hugetlb_page(h, page, false); - update_and_free_page(h, page); + list_add(&page->lru, &page_list); } } + +out: + spin_unlock(&hugetlb_lock); + list_for_each_entry_safe(page, next, &page_list, lru) { + update_and_free_page(h, page); + cond_resched(); + } + spin_lock(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, From 10c6ec49802b1779c01fc029cfd92ea20ae33c06 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:35:03 -0700 Subject: [PATCH 1111/1379] hugetlb: change free_pool_huge_page to remove_pool_huge_page free_pool_huge_page was called with hugetlb_lock held. It would remove a hugetlb page, and then free the corresponding pages to the lower level allocators such as buddy. free_pool_huge_page was called in a loop to remove hugetlb pages and these loops could hold the hugetlb_lock for a considerable time. Create new routine remove_pool_huge_page to replace free_pool_huge_page. remove_pool_huge_page will remove the hugetlb page, and it must be called with the hugetlb_lock held. It will return the removed page and it is the responsibility of the caller to free the page to the lower level allocators. The hugetlb_lock is dropped before freeing to these allocators which results in shorter lock hold times. Add new helper routine to call update_and_free_page for a list of pages. Note: Some changes to the routine return_unused_surplus_pages are in need of explanation. Commit e5bbc8a6c992 ("mm/hugetlb.c: fix reservation race when freeing surplus pages") modified this routine to address a race which could occur when dropping the hugetlb_lock in the loop that removes pool pages. Accounting changes introduced in that commit were subtle and took some thought to understand. This commit removes the cond_resched_lock() and the potential race. Therefore, remove the subtle code and restore the more straight forward accounting effectively reverting the commit. Link: https://lkml.kernel.org/r/20210409205254.242291-7-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 93 ++++++++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5425936a4590..d7db93639787 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1211,7 +1211,7 @@ static int hstate_next_node_to_alloc(struct hstate *h, } /* - * helper for free_pool_huge_page() - return the previously saved + * helper for remove_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -1391,6 +1391,16 @@ static void update_and_free_page(struct hstate *h, struct page *page) } } +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +{ + struct page *page, *t_page; + + list_for_each_entry_safe(page, t_page, list, lru) { + update_and_free_page(h, page); + cond_resched(); + } +} + struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; @@ -1721,16 +1731,18 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, } /* - * Free huge page from pool from next node to free. - * Attempt to keep persistent huge pages more or less - * balanced over allowed nodes. + * Remove huge page from pool from next node to free. Attempt to keep + * persistent huge pages more or less balanced over allowed nodes. + * This routine only 'removes' the hugetlb page. The caller must make + * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - bool acct_surplus) +static struct page *remove_pool_huge_page(struct hstate *h, + nodemask_t *nodes_allowed, + bool acct_surplus) { int nr_nodes, node; - int ret = 0; + struct page *page = NULL; for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* @@ -1739,23 +1751,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { - struct page *page = - list_entry(h->hugepage_freelists[node].next, + page = list_entry(h->hugepage_freelists[node].next, struct page, lru); remove_hugetlb_page(h, page, acct_surplus); - /* - * unlock/lock around update_and_free_page is temporary - * and will be removed with subsequent patch. - */ - spin_unlock(&hugetlb_lock); - update_and_free_page(h, page); - spin_lock(&hugetlb_lock); - ret = 1; break; } } - return ret; + return page; } /* @@ -2075,17 +2078,16 @@ free: * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. - * - * Called with hugetlb_lock held. However, the lock could be dropped (and - * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, - * we must make sure nobody else can claim pages we are in the process of - * freeing. Do this by ensuring resv_huge_page always is greater than the - * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; + struct page *page; + LIST_HEAD(page_list); + + /* Uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) @@ -2102,24 +2104,21 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the freed pages across the + * remove_pool_huge_page() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. - * - * Note that we decrement resv_huge_pages as we free the pages. If - * we drop the lock, resv_huge_pages will still be sufficiently large - * to cover subsequent pages we may free. */ while (nr_pages--) { - h->resv_huge_pages--; - unused_resv_pages--; - if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) + page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); + if (!page) goto out; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } out: - /* Fully uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; + spin_unlock(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock(&hugetlb_lock); } @@ -2572,7 +2571,6 @@ static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; - struct page *page, *next; LIST_HEAD(page_list); if (hstate_is_gigantic(h)) @@ -2582,6 +2580,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { + struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { if (count >= h->nr_huge_pages) @@ -2595,10 +2594,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, out: spin_unlock(&hugetlb_lock); - list_for_each_entry_safe(page, next, &page_list, lru) { - update_and_free_page(h, page); - cond_resched(); - } + update_and_free_pages_bulk(h, &page_list); spin_lock(&hugetlb_lock); } #else @@ -2645,6 +2641,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; + struct page *page; + LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* @@ -2757,11 +2755,22 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); + + /* + * Collect pages to be removed on list without dropping lock + */ while (min_count < persistent_huge_pages(h)) { - if (!free_pool_huge_page(h, nodes_allowed, 0)) + page = remove_pool_huge_page(h, nodes_allowed, 0); + if (!page) break; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } + /* free the pages after dropping lock */ + spin_unlock(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock(&hugetlb_lock); + while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; From db71ef79b59bb2e78dc4df83d0e4bf6beaa5c82d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:35:07 -0700 Subject: [PATCH 1112/1379] hugetlb: make free_huge_page irq safe Commit c77c0a8ac4c5 ("mm/hugetlb: defer freeing of huge pages if in non-task context") was added to address the issue of free_huge_page being called from irq context. That commit hands off free_huge_page processing to a workqueue if !in_task. However, this doesn't cover all the cases as pointed out by 0day bot lockdep report [1]. : Possible interrupt unsafe locking scenario: : : CPU0 CPU1 : ---- ---- : lock(hugetlb_lock); : local_irq_disable(); : lock(slock-AF_INET); : lock(hugetlb_lock); : : lock(slock-AF_INET); Shakeel has later explained that this is very likely TCP TX zerocopy from hugetlb pages scenario when the networking code drops a last reference to hugetlb page while having IRQ disabled. Hugetlb freeing path doesn't disable IRQ while holding hugetlb_lock so a lock dependency chain can lead to a deadlock. This commit addresses the issue by doing the following: - Make hugetlb_lock irq safe. This is mostly a simple process of changing spin_*lock calls to spin_*lock_irq* calls. - Make subpool lock irq safe in a similar manner. - Revert the !in_task check and workqueue handoff. [1] https://lore.kernel.org/linux-mm/000000000000f1c03b05bc43aadc@google.com/ Link: https://lkml.kernel.org/r/20210409205254.242291-8-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 169 +++++++++++++++++--------------------------- mm/hugetlb_cgroup.c | 8 +-- 2 files changed, 67 insertions(+), 110 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d7db93639787..47f56e2cc804 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -94,9 +94,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool) return true; } -static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, + unsigned long irq_flags) { - spin_unlock(&spool->lock); + spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and @@ -135,10 +136,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool) { - spin_lock(&spool->lock); + unsigned long flags; + + spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); } /* @@ -157,7 +160,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, if (!spool) return ret; - spin_lock(&spool->lock); + spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) @@ -184,7 +187,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } unlock_ret: - spin_unlock(&spool->lock); + spin_unlock_irq(&spool->lock); return ret; } @@ -198,11 +201,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; + unsigned long flags; if (!spool) return delta; - spin_lock(&spool->lock); + spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; @@ -223,7 +227,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); return ret; } @@ -1412,7 +1416,7 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -static void __free_huge_page(struct page *page) +void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -1422,6 +1426,7 @@ static void __free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = hugetlb_page_subpool(page); bool restore_reserve; + unsigned long flags; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); @@ -1450,7 +1455,7 @@ static void __free_huge_page(struct page *page) restore_reserve = true; } - spin_lock(&hugetlb_lock); + spin_lock_irqsave(&hugetlb_lock, flags); ClearHPageMigratable(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); @@ -1461,68 +1466,20 @@ static void __free_huge_page(struct page *page) if (HPageTemporary(page)) { remove_hugetlb_page(h, page, false); - spin_unlock(&hugetlb_lock); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_page(h, page, true); - spin_unlock(&hugetlb_lock); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); - spin_unlock(&hugetlb_lock); + spin_unlock_irqrestore(&hugetlb_lock, flags); } } -/* - * As free_huge_page() can be called from a non-task context, we have - * to defer the actual freeing in a workqueue to prevent potential - * hugetlb_lock deadlock. - * - * free_hpage_workfn() locklessly retrieves the linked list of pages to - * be freed and frees them one-by-one. As the page->mapping pointer is - * going to be cleared in __free_huge_page() anyway, it is reused as the - * llist_node structure of a lockless linked list of huge pages to be freed. - */ -static LLIST_HEAD(hpage_freelist); - -static void free_hpage_workfn(struct work_struct *work) -{ - struct llist_node *node; - struct page *page; - - node = llist_del_all(&hpage_freelist); - - while (node) { - page = container_of((struct address_space **)node, - struct page, mapping); - node = node->next; - __free_huge_page(page); - } -} -static DECLARE_WORK(free_hpage_work, free_hpage_workfn); - -void free_huge_page(struct page *page) -{ - /* - * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. - */ - if (!in_task()) { - /* - * Only call schedule_work() if hpage_freelist is previously - * empty. Otherwise, schedule_work() had been called but the - * workfn hasn't retrieved the list yet. - */ - if (llist_add((struct llist_node *)&page->mapping, - &hpage_freelist)) - schedule_work(&free_hpage_work); - return; - } - - __free_huge_page(page); -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); @@ -1530,11 +1487,11 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; ClearHPageFreed(page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1780,7 +1737,7 @@ retry: if (!PageHuge(page)) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHuge(page)) { rc = 0; goto out; @@ -1797,7 +1754,7 @@ retry: * when it is dissolved. */ if (unlikely(!HPageFreed(head))) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); cond_resched(); /* @@ -1821,12 +1778,12 @@ retry: } remove_hugetlb_page(h, page, false); h->max_huge_pages--; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); update_and_free_page(h, head); return 0; } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return rc; } @@ -1868,16 +1825,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page @@ -1887,7 +1844,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetHPageTemporary(page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); return NULL; } else { @@ -1896,7 +1853,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, } out_unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } @@ -1946,17 +1903,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); if (page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } @@ -2004,7 +1961,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) ret = -ENOMEM; retry: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); @@ -2021,7 +1978,7 @@ retry: * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. */ - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { @@ -2061,12 +2018,12 @@ retry: enqueue_huge_page(h, page); } free: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) put_page(page); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); return ret; } @@ -2116,9 +2073,9 @@ static void return_unused_surplus_pages(struct hstate *h, } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); } @@ -2352,7 +2309,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (ret) goto out_uncharge_cgroup_reservation; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). gbl_chg == 0 indicates @@ -2360,7 +2317,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, */ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; @@ -2368,7 +2325,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); /* Fall through */ } @@ -2381,7 +2338,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, h_cg, page); } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); hugetlb_set_page_subpool(page, spool); @@ -2593,9 +2550,9 @@ static void try_to_free_low(struct hstate *h, unsigned long count, } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, @@ -2660,7 +2617,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * pages in hstate via the proc/sysfs interfaces. */ mutex_lock(&h->resize_lock); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. @@ -2691,7 +2648,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; @@ -2721,14 +2678,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); ret = alloc_pool_huge_page(h, nodes_allowed, node_alloc_noretry); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!ret) goto out; @@ -2767,9 +2724,9 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, list_add(&page->lru, &page_list); } /* free the pages after dropping lock */ - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); update_and_free_pages_bulk(h, &page_list); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) @@ -2777,7 +2734,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, } out: h->max_huge_pages = persistent_huge_pages(h); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); @@ -2933,9 +2890,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (err) return err; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = input; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return count; } @@ -3522,9 +3479,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, goto out; if (write) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); } out: return ret; @@ -3620,7 +3577,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) if (!delta) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such @@ -3659,7 +3616,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) return_unused_surplus_pages(h, (unsigned long) -delta); out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return ret; } @@ -5687,7 +5644,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHeadHuge(page) || !HPageMigratable(page) || !get_page_unless_zero(page)) { @@ -5697,16 +5654,16 @@ bool isolate_huge_page(struct page *page, struct list_head *list) ClearHPageMigratable(page); list_move_tail(&page->lru, list); unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return ret; } void putback_active_hugepage(struct page *page) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); SetHPageMigratable(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); } @@ -5740,12 +5697,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) */ if (new_nid == old_nid) return; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); } } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 726b85f4f303..5383023d0cca 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -204,11 +204,11 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) do { idx = 0; for_each_hstate(h) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(idx, h_cg, page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); idx++; } cond_resched(); @@ -784,7 +784,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) if (hugetlb_cgroup_disabled()) return; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); set_hugetlb_cgroup(oldhpage, NULL); @@ -794,7 +794,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) set_hugetlb_cgroup(newhpage, h_cg); set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); list_move(&newhpage->lru, &h->hugepage_activelist); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return; } From 9487ca60fd7fa2c259f0daba8e2e01e51a64da05 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 May 2021 18:35:10 -0700 Subject: [PATCH 1113/1379] hugetlb: add lockdep_assert_held() calls for hugetlb_lock After making hugetlb lock irq safe and separating some functionality done under the lock, add some lockdep_assert_held to help verify locking. Link: https://lkml.kernel.org/r/20210409205254.242291-9-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: Oscar Salvador Cc: "Aneesh Kumar K . V" Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Hillf Danton Cc: HORIGUCHI NAOYA Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mina Almasry Cc: Peter Xu Cc: Peter Zijlstra Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 47f56e2cc804..017cb260354c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1069,6 +1069,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); + + lockdep_assert_held(&hugetlb_lock); list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; @@ -1080,6 +1082,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) struct page *page; bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); + lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { if (nocma && is_migrate_cma_page(page)) continue; @@ -1351,6 +1354,7 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page, VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1701,6 +1705,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, int nr_nodes, node; struct page *page = NULL; + lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine @@ -1950,6 +1955,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) long needed, allocated; bool alloc_ok = true; + lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; @@ -2043,6 +2049,7 @@ static void return_unused_surplus_pages(struct hstate *h, struct page *page; LIST_HEAD(page_list); + lockdep_assert_held(&hugetlb_lock); /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -2530,6 +2537,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, int i; LIST_HEAD(page_list); + lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; @@ -2571,6 +2579,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, { int nr_nodes, node; + lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { From c8e28b47af45c6acfc7a9256848562d4d5ef63a2 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:14 -0700 Subject: [PATCH 1114/1379] mm,page_alloc: bail out earlier on -ENOMEM in alloc_contig_migrate_range Patch series "Make alloc_contig_range handle Hugetlb pages", v10. alloc_contig_range lacks the ability to handle HugeTLB pages. This can be problematic for some users, e.g: CMA and virtio-mem, where those users will fail the call if alloc_contig_range ever sees a HugeTLB page, even when those pages lay in ZONE_MOVABLE and are free. That problem can be easily solved by replacing the page in the free hugepage pool. In-use HugeTLB are no exception though, as those can be isolated and migrated as any other LRU or Movable page. This aims to improve alloc_contig_range->isolate_migratepages_block, so that HugeTLB pages can be recognized and handled. Since we also need to start reporting errors down the chain (e.g: -ENOMEM due to not be able to allocate a new hugetlb page), isolate_migratepages_{range,block} interfaces need to change to start reporting error codes instead of the pfn == 0 vs pfn != 0 scheme it is using right now. From now on, isolate_migratepages_block will not return the next pfn to be scanned anymore, but -EINTR, -ENOMEM or 0, so we the next pfn to be scanned will be recorded in cc->migrate_pfn field (as it is already done in isolate_migratepages_range()). Below is an insight from David (thanks), where the problem can clearly be seen: "Start a VM with 4G. Hotplug 1G via virtio-mem and online it to ZONE_MOVABLE. Allocate 512 huge pages. [root@localhost ~]# cat /proc/meminfo MemTotal: 5061512 kB MemFree: 3319396 kB MemAvailable: 3457144 kB ... HugePages_Total: 512 HugePages_Free: 512 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB The huge pages get partially allocate from ZONE_MOVABLE. Try unplugging 1G via virtio-mem (remember, all ZONE_MOVABLE). Inside the guest: [ 180.058992] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.060531] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.061972] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.063413] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.064838] alloc_contig_range: [1b8000, 1c0000) PFNs busy [ 180.065848] alloc_contig_range: [1bfc00, 1c0000) PFNs busy [ 180.066794] alloc_contig_range: [1bfc00, 1c0000) PFNs busy [ 180.067738] alloc_contig_range: [1bfc00, 1c0000) PFNs busy [ 180.068669] alloc_contig_range: [1bfc00, 1c0000) PFNs busy [ 180.069598] alloc_contig_range: [1bfc00, 1c0000) PFNs busy" And then with this patchset running: "Same experiment with ZONE_MOVABLE: a) Free huge pages: all memory can get unplugged again. b) Allocated/populated but idle huge pages: all memory can get unplugged again. c) Allocated/populated but all 512 huge pages are read/written in a loop: all memory can get unplugged again, but I get a single [ 121.192345] alloc_contig_range: [180000, 188000) PFNs busy Most probably because it happened to try migrating a huge page while it was busy. As virtio-mem retries on ZONE_MOVABLE a couple of times, it can deal with this temporary failure. Last but not least, I did something extreme: # cat /proc/meminfo MemTotal: 5061568 kB MemFree: 186560 kB MemAvailable: 354524 kB ... HugePages_Total: 2048 HugePages_Free: 2048 HugePages_Rsvd: 0 HugePages_Surp: 0 Triggering unplug would require to dissolve+alloc - which now fails when trying to allocate an additional ~512 huge pages (1G). As expected, I can properly see memory unplug not fully succeeding. + I get a fairly continuous stream of [ 226.611584] alloc_contig_range: [19f400, 19f800) PFNs busy ... But more importantly, the hugepage count remains stable, as configured by the admin (me): HugePages_Total: 2048 HugePages_Free: 2048 HugePages_Rsvd: 0 HugePages_Surp: 0" This patch (of 7): Currently, __alloc_contig_migrate_range can generate -EINTR, -ENOMEM or -EBUSY, and report them down the chain. The problem is that when migrate_pages() reports -ENOMEM, we keep going till we exhaust all the try-attempts (5 at the moment) instead of bailing out. migrate_pages() bails out right away on -ENOMEM because it is considered a fatal error. Do the same here instead of keep going and retrying. Note that this is not fixing a real issue, just a cosmetic change. Although we can save some cycles by backing off ealier Link: https://lkml.kernel.org/r/20210419075413.1064-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20210419075413.1064-2-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6b208b1843bf..d9acd86fa761 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8696,7 +8696,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, } tries = 0; } else if (++tries == 5) { - ret = ret < 0 ? ret : -EBUSY; + ret = -EBUSY; break; } @@ -8706,6 +8706,13 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE); + + /* + * On -ENOMEM, migrate_pages() bails out right away. It is pointless + * to retry again over this error, so do the same here. + */ + if (ret == -ENOMEM) + break; } if (ret < 0) { alloc_contig_dump_pages(&cc->migratepages); From c2ad7a1ffeafa32eb3b3b99835f210ad402a86ff Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:17 -0700 Subject: [PATCH 1115/1379] mm,compaction: let isolate_migratepages_{range,block} return error codes Currently, isolate_migratepages_{range,block} and their callers use a pfn == 0 vs pfn != 0 scheme to let the caller know whether there was any error during isolation. This does not work as soon as we need to start reporting different error codes and make sure we pass them down the chain, so they are properly interpreted by functions like e.g: alloc_contig_range. Let us rework isolate_migratepages_{range,block} so we can report error codes. Since isolate_migratepages_block will stop returning the next pfn to be scanned, we reuse the cc->migrate_pfn field to keep track of that. Link: https://lkml.kernel.org/r/20210419075413.1064-3-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Vlastimil Babka Acked-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 52 ++++++++++++++++++++++++------------------------- mm/internal.h | 10 ++++++++-- mm/page_alloc.c | 7 +++---- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e04f4476e68e..c4d8007221b7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -787,15 +787,14 @@ static bool too_many_isolated(pg_data_t *pgdat) * * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). The range is expected to be within same pageblock. - * Returns zero if there is a fatal signal pending, otherwise PFN of the - * first page that was not scanned (which may be both less, equal to or more - * than end_pfn). + * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion, + * or 0. + * cc->migrate_pfn will contain the next pfn to scan. * * The pages are isolated on cc->migratepages list (not required to be empty), - * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field - * is neither read nor updated. + * and cc->nr_migratepages is updated accordingly. */ -static unsigned long +static int isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn, isolate_mode_t isolate_mode) { @@ -809,6 +808,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, bool skip_on_failure = false; unsigned long next_skip_pfn = 0; bool skip_updated = false; + int ret = 0; + + cc->migrate_pfn = low_pfn; /* * Ensure that there are not too many pages isolated from the LRU @@ -818,16 +820,16 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, while (unlikely(too_many_isolated(pgdat))) { /* stop isolation if there are still pages not migrated */ if (cc->nr_migratepages) - return 0; + return -EAGAIN; /* async migration should just abort */ if (cc->mode == MIGRATE_ASYNC) - return 0; + return -EAGAIN; congestion_wait(BLK_RW_ASYNC, HZ/10); if (fatal_signal_pending(current)) - return 0; + return -EINTR; } cond_resched(); @@ -875,8 +877,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (fatal_signal_pending(current)) { cc->contended = true; + ret = -EINTR; - low_pfn = 0; goto fatal_pending; } @@ -1130,7 +1132,9 @@ fatal_pending: if (nr_isolated) count_compact_events(COMPACTISOLATED, nr_isolated); - return low_pfn; + cc->migrate_pfn = low_pfn; + + return ret; } /** @@ -1139,15 +1143,14 @@ fatal_pending: * @start_pfn: The first PFN to start isolating. * @end_pfn: The one-past-last PFN. * - * Returns zero if isolation fails fatally due to e.g. pending signal. - * Otherwise, function returns one-past-the-last PFN of isolated page - * (which may be greater than end_pfn if end fell in a middle of a THP page). + * Returns -EAGAIN when contented, -EINTR in case of a signal pending or 0. */ -unsigned long +int isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn, block_start_pfn, block_end_pfn; + int ret = 0; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; @@ -1166,17 +1169,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, block_end_pfn, cc->zone)) continue; - pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, - ISOLATE_UNEVICTABLE); + ret = isolate_migratepages_block(cc, pfn, block_end_pfn, + ISOLATE_UNEVICTABLE); - if (!pfn) + if (ret) break; if (cc->nr_migratepages >= COMPACT_CLUSTER_MAX) break; } - return pfn; + return ret; } #endif /* CONFIG_COMPACTION || CONFIG_CMA */ @@ -1847,7 +1850,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) */ for (; block_end_pfn <= cc->free_pfn; fast_find_block = false, - low_pfn = block_end_pfn, + cc->migrate_pfn = low_pfn = block_end_pfn, block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -1889,10 +1892,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) } /* Perform the isolation */ - low_pfn = isolate_migratepages_block(cc, low_pfn, - block_end_pfn, isolate_mode); - - if (!low_pfn) + if (isolate_migratepages_block(cc, low_pfn, block_end_pfn, + isolate_mode)) return ISOLATE_ABORT; /* @@ -1903,9 +1904,6 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) break; } - /* Record where migration scanner will be restarted. */ - cc->migrate_pfn = low_pfn; - return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } diff --git a/mm/internal.h b/mm/internal.h index ef5f336f59bd..feeaaf06705d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -244,7 +244,13 @@ struct compact_control { unsigned int nr_freepages; /* Number of isolated free pages */ unsigned int nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ - unsigned long migrate_pfn; /* isolate_migratepages search base */ + /* + * Acts as an in/out parameter to page isolation for migration. + * isolate_migratepages uses it as a search base. + * isolate_migratepages_block will update the value to the next pfn + * after the last isolated one. + */ + unsigned long migrate_pfn; unsigned long fast_start_pfn; /* a pfn to start linear scan from */ struct zone *zone; unsigned long total_migrate_scanned; @@ -280,7 +286,7 @@ struct capture_control { unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn); -unsigned long +int isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); int find_suitable_fallback(struct free_area *area, unsigned int order, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9acd86fa761..656f86a98554 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8689,11 +8689,10 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; - pfn = isolate_migratepages_range(cc, pfn, end); - if (!pfn) { - ret = -EINTR; + ret = isolate_migratepages_range(cc, pfn, end); + if (ret && ret != -EAGAIN) break; - } + pfn = cc->migrate_pfn; tries = 0; } else if (++tries == 5) { ret = -EBUSY; From 9f27b34f234da7a185b4f1a2aa2cea2c47c458bf Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:20 -0700 Subject: [PATCH 1116/1379] mm,hugetlb: drop clearing of flag from prep_new_huge_page Pages allocated via the page allocator or CMA get its private field cleared by means of post_alloc_hook(). Pages allocated during boot, that is directly from the memblock allocator, get cleared by paging_init()-> .. ->memmap_init_zone-> .. ->__init_single_page() before any memblock allocation. Based on this ground, let us remove the clearing of the flag from prep_new_huge_page() as it is not needed. This was a leftover from commit 6c0371490140 ("hugetlb: convert PageHugeFreed to HPageFreed flag"). Previously the explicit clearing was necessary because compound allocations do not get this initialization (see prep_compound_page). Link: https://lkml.kernel.org/r/20210419075413.1064-4-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 017cb260354c..c1539fb95f51 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1494,7 +1494,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) spin_lock_irq(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; - ClearHPageFreed(page); spin_unlock_irq(&hugetlb_lock); } From d3d99fcc4e28f1a613744608c289d4f18b60b12f Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:23 -0700 Subject: [PATCH 1117/1379] mm,hugetlb: split prep_new_huge_page functionality Currently, prep_new_huge_page() performs two functions. It sets the right state for a new hugetlb, and increases the hstate's counters to account for the new page. Let us split its functionality into two separate functions, decoupling the handling of the counters from initializing a hugepage. The outcome is having __prep_new_huge_page(), which only initializes the page , and __prep_account_new_huge_page(), which adds the new page to the hstate's counters. This allows us to be able to set a hugetlb without having to worry about the counter/locking. It will prove useful in the next patch. prep_new_huge_page() still calls both functions. Link: https://lkml.kernel.org/r/20210419075413.1064-5-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c1539fb95f51..63760be2688e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1484,16 +1484,30 @@ void free_huge_page(struct page *page) } } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +/* + * Must be called with the hugetlb lock held + */ +static void __prep_account_new_huge_page(struct hstate *h, int nid) +{ + lockdep_assert_held(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; +} + +static void __prep_new_huge_page(struct page *page) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + __prep_new_huge_page(page); spin_lock_irq(&hugetlb_lock); - h->nr_huge_pages++; - h->nr_huge_pages_node[nid]++; + __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } From 369fa227c21949b22fd7374506c4992a0d7bb580 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:26 -0700 Subject: [PATCH 1118/1379] mm: make alloc_contig_range handle free hugetlb pages alloc_contig_range will fail if it ever sees a HugeTLB page within the range we are trying to allocate, even when that page is free and can be easily reallocated. This has proved to be problematic for some users of alloc_contic_range, e.g: CMA and virtio-mem, where those would fail the call even when those pages lay in ZONE_MOVABLE and are free. We can do better by trying to replace such page. Free hugepages are tricky to handle so as to no userspace application notices disruption, we need to replace the current free hugepage with a new one. In order to do that, a new function called alloc_and_dissolve_huge_page is introduced. This function will first try to get a new fresh hugepage, and if it succeeds, it will replace the old one in the free hugepage pool. The free page replacement is done under hugetlb_lock, so no external users of hugetlb will notice the change. To allocate the new huge page, we use alloc_buddy_huge_page(), so we do not have to deal with any counters, and prep_new_huge_page() is not called. This is valulable because in case we need to free the new page, we only need to call __free_pages(). Once we know that the page to be replaced is a genuine 0-refcounted huge page, we remove the old page from the freelist by remove_hugetlb_page(). Then, we can call __prep_new_huge_page() and __prep_account_new_huge_page() for the new huge page to properly initialize it and increment the hstate->nr_huge_pages counter (previously decremented by remove_hugetlb_page()). Once done, the page is enqueued by enqueue_huge_page() and it is ready to be used. There is one tricky case when page's refcount is 0 because it is in the process of being released. A missing PageHugeFreed bit will tell us that freeing is in flight so we retry after dropping the hugetlb_lock. The race window should be small and the next retry should make a forward progress. E.g: CPU0 CPU1 free_huge_page() isolate_or_dissolve_huge_page PageHuge() == T alloc_and_dissolve_huge_page alloc_buddy_huge_page() spin_lock_irq(hugetlb_lock) // PageHuge() && !PageHugeFreed && // !PageCount() spin_unlock_irq(hugetlb_lock) spin_lock_irq(hugetlb_lock) 1) update_and_free_page PageHuge() == F __free_pages() 2) enqueue_huge_page SetPageHugeFreed() spin_unlock_irq(&hugetlb_lock) spin_lock_irq(hugetlb_lock) 1) PageHuge() == F (freed by case#1 from CPU0) 2) PageHuge() == T PageHugeFreed() == T - proceed with replacing the page In the case above we retry as the window race is quite small and we have high chances to succeed next time. With regard to the allocation, we restrict it to the node the page belongs to with __GFP_THISNODE, meaning we do not fallback on other node's zones. Note that gigantic hugetlb pages are fenced off since there is a cyclic dependency between them and alloc_contig_range. Link: https://lkml.kernel.org/r/20210419075413.1064-6-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++ mm/compaction.c | 33 ++++++++++-- mm/hugetlb.c | 116 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 628639422c5d..ec6a10b8860a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -588,6 +588,7 @@ struct huge_bootmem_page { struct hstate *hstate; }; +int isolate_or_dissolve_huge_page(struct page *page); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, @@ -870,6 +871,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline int isolate_or_dissolve_huge_page(struct page *page) +{ + return -ENOMEM; +} + static inline struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) diff --git a/mm/compaction.c b/mm/compaction.c index c4d8007221b7..b77e1382307f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -788,7 +788,7 @@ static bool too_many_isolated(pg_data_t *pgdat) * Isolate all pages that can be migrated from the range specified by * [low_pfn, end_pfn). The range is expected to be within same pageblock. * Returns errno, like -EAGAIN or -EINTR in case e.g signal pending or congestion, - * or 0. + * -ENOMEM in case we could not allocate a page, or 0. * cc->migrate_pfn will contain the next pfn to scan. * * The pages are isolated on cc->migratepages list (not required to be empty), @@ -906,6 +906,29 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, valid_page = page; } + if (PageHuge(page) && cc->alloc_contig) { + ret = isolate_or_dissolve_huge_page(page); + + /* + * Fail isolation in case isolate_or_dissolve_huge_page() + * reports an error. In case of -ENOMEM, abort right away. + */ + if (ret < 0) { + /* Do not report -EBUSY down the chain */ + if (ret == -EBUSY) + ret = 0; + low_pfn += (1UL << compound_order(page)) - 1; + goto isolate_fail; + } + + /* + * Ok, the hugepage was dissolved. Now these pages are + * Buddy and cannot be re-allocated because they are + * isolated. Fall-through as the check below handles + * Buddy pages. + */ + } + /* * Skip if free. We read page order here without zone lock * which is generally unsafe, but the race window is small and @@ -1065,7 +1088,7 @@ isolate_fail_put: put_page(page); isolate_fail: - if (!skip_on_failure) + if (!skip_on_failure && ret != -ENOMEM) continue; /* @@ -1091,6 +1114,9 @@ isolate_fail: */ next_skip_pfn += 1UL << cc->order; } + + if (ret == -ENOMEM) + break; } /* @@ -1143,7 +1169,8 @@ fatal_pending: * @start_pfn: The first PFN to start isolating. * @end_pfn: The one-past-last PFN. * - * Returns -EAGAIN when contented, -EINTR in case of a signal pending or 0. + * Returns -EAGAIN when contented, -EINTR in case of a signal pending, -ENOMEM + * in case we could not allocate a page, or 0. */ int isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 63760be2688e..92f3cd08946f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2267,6 +2267,122 @@ static void restore_reserve_on_error(struct hstate *h, } } +/* + * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * @h: struct hstate old page belongs to + * @old_page: Old page to dissolve + * Returns 0 on success, otherwise negated error. + */ +static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page) +{ + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nid = page_to_nid(old_page); + struct page *new_page; + int ret = 0; + + /* + * Before dissolving the page, we need to allocate a new one for the + * pool to remain stable. Using alloc_buddy_huge_page() allows us to + * not having to deal with prep_new_huge_page() and avoids dealing of any + * counters. This simplifies and let us do the whole thing under the + * lock. + */ + new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); + if (!new_page) + return -ENOMEM; + +retry: + spin_lock_irq(&hugetlb_lock); + if (!PageHuge(old_page)) { + /* + * Freed from under us. Drop new_page too. + */ + goto free_new; + } else if (page_count(old_page)) { + /* + * Someone has grabbed the page, fail for now. + */ + ret = -EBUSY; + goto free_new; + } else if (!HPageFreed(old_page)) { + /* + * Page's refcount is 0 but it has not been enqueued in the + * freelist yet. Race window is small, so we can succeed here if + * we retry. + */ + spin_unlock_irq(&hugetlb_lock); + cond_resched(); + goto retry; + } else { + /* + * Ok, old_page is still a genuine free hugepage. Remove it from + * the freelist and decrease the counters. These will be + * incremented again when calling __prep_account_new_huge_page() + * and enqueue_huge_page() for new_page. The counters will remain + * stable since this happens under the lock. + */ + remove_hugetlb_page(h, old_page, false); + + /* + * new_page needs to be initialized with the standard hugetlb + * state. This is normally done by prep_new_huge_page() but + * that takes hugetlb_lock which is already held so we need to + * open code it here. + * Reference count trick is needed because allocator gives us + * referenced page but the pool requires pages with 0 refcount. + */ + __prep_new_huge_page(new_page); + __prep_account_new_huge_page(h, nid); + page_ref_dec(new_page); + enqueue_huge_page(h, new_page); + + /* + * Pages have been replaced, we can safely free the old one. + */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_page(h, old_page); + } + + return ret; + +free_new: + spin_unlock_irq(&hugetlb_lock); + __free_pages(new_page, huge_page_order(h)); + + return ret; +} + +int isolate_or_dissolve_huge_page(struct page *page) +{ + struct hstate *h; + struct page *head; + + /* + * The page might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + * Return success when racing as if we dissolved the page ourselves. + */ + spin_lock_irq(&hugetlb_lock); + if (PageHuge(page)) { + head = compound_head(page); + h = page_hstate(head); + } else { + spin_unlock_irq(&hugetlb_lock); + return 0; + } + spin_unlock_irq(&hugetlb_lock); + + /* + * Fence off gigantic pages as there is a cyclic dependency between + * alloc_contig_range and them. Return -ENOMEM as this has the effect + * of bailing out right away without further retrying. + */ + if (hstate_is_gigantic(h)) + return -ENOMEM; + + return alloc_and_dissolve_huge_page(h, head); +} + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { From ae37c7ff79f1f030e28ec76c46ee032f8fd07607 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:29 -0700 Subject: [PATCH 1119/1379] mm: make alloc_contig_range handle in-use hugetlb pages alloc_contig_range() will fail if it finds a HugeTLB page within the range, without a chance to handle them. Since HugeTLB pages can be migrated as any LRU or Movable page, it does not make sense to bail out without trying. Enable the interface to recognize in-use HugeTLB pages so we can migrate them, and have much better chances to succeed the call. Link: https://lkml.kernel.org/r/20210419075413.1064-7-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Mike Kravetz Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- mm/compaction.c | 12 +++++++++++- mm/hugetlb.c | 22 +++++++++++++++++----- mm/vmscan.c | 5 +++-- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec6a10b8860a..d0f310ae3f82 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -588,7 +588,7 @@ struct huge_bootmem_page { struct hstate *hstate; }; -int isolate_or_dissolve_huge_page(struct page *page); +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, @@ -871,7 +871,8 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; -static inline int isolate_or_dissolve_huge_page(struct page *page) +static inline int isolate_or_dissolve_huge_page(struct page *page, + struct list_head *list) { return -ENOMEM; } diff --git a/mm/compaction.c b/mm/compaction.c index b77e1382307f..335862f1661c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -907,7 +907,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } if (PageHuge(page) && cc->alloc_contig) { - ret = isolate_or_dissolve_huge_page(page); + ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); /* * Fail isolation in case isolate_or_dissolve_huge_page() @@ -921,6 +921,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail; } + if (PageHuge(page)) { + /* + * Hugepage was successfully isolated and placed + * on the cc->migratepages list. + */ + low_pfn += compound_nr(page) - 1; + goto isolate_success_no_list; + } + /* * Ok, the hugepage was dissolved. Now these pages are * Buddy and cannot be re-allocated because they are @@ -1062,6 +1071,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_success: list_add(&page->lru, &cc->migratepages); +isolate_success_no_list: cc->nr_migratepages += compound_nr(page); nr_isolated += compound_nr(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92f3cd08946f..b5977d9709ad 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2271,9 +2271,11 @@ static void restore_reserve_on_error(struct hstate *h, * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one * @h: struct hstate old page belongs to * @old_page: Old page to dissolve + * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page) +static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, + struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = page_to_nid(old_page); @@ -2300,9 +2302,13 @@ retry: goto free_new; } else if (page_count(old_page)) { /* - * Someone has grabbed the page, fail for now. + * Someone has grabbed the page, try to isolate it here. + * Fail with -EBUSY if not possible. */ - ret = -EBUSY; + spin_unlock_irq(&hugetlb_lock); + if (!isolate_huge_page(old_page, list)) + ret = -EBUSY; + spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!HPageFreed(old_page)) { /* @@ -2352,10 +2358,11 @@ free_new: return ret; } -int isolate_or_dissolve_huge_page(struct page *page) +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { struct hstate *h; struct page *head; + int ret = -EBUSY; /* * The page might have been dissolved from under our feet, so make sure @@ -2380,7 +2387,12 @@ int isolate_or_dissolve_huge_page(struct page *page) if (hstate_is_gigantic(h)) return -ENOMEM; - return alloc_and_dissolve_huge_page(h, head); + if (page_count(head) && isolate_huge_page(head, list)) + ret = 0; + else if (!page_count(head)) + ret = alloc_and_dissolve_huge_page(h, head, list); + + return ret; } struct page *alloc_huge_page(struct vm_area_struct *vma, diff --git a/mm/vmscan.c b/mm/vmscan.c index 562e87cbd7a1..42aaef30633e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1507,8 +1507,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_lru(page) && !PageDirty(page) && - !__PageMovable(page) && !PageUnevictable(page)) { + if (!PageHuge(page) && page_is_file_lru(page) && + !PageDirty(page) && !__PageMovable(page) && + !PageUnevictable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } From eb14d4eefdc4f0051a63973124f431798e16a8b2 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:35:33 -0700 Subject: [PATCH 1120/1379] mm,page_alloc: drop unnecessary checks from pfn_range_valid_contig pfn_range_valid_contig() bails out when it finds an in-use page or a hugetlb page, among other things. We can drop the in-use page check since __alloc_contig_pages can migrate away those pages, and the hugetlb page check can go too since isolate_migratepages_range is now capable of dealing with hugetlb pages. Either way, those checks are racy so let the end function handle it when the time comes. Link: https://lkml.kernel.org/r/20210419075413.1064-8-osalvador@suse.de Signed-off-by: Oscar Salvador Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Acked-by: Mike Kravetz Acked-by: Michal Hocko Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 656f86a98554..80fa6e0f9ed9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8898,12 +8898,6 @@ static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, if (PageReserved(page)) return false; - - if (page_count(page) > 0) - return false; - - if (PageHuge(page)) - return false; } return true; } From 7677f7fd8be76659cd2d0db8ff4093bbb51c20e5 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:36 -0700 Subject: [PATCH 1121/1379] userfaultfd: add minor fault registration mode Patch series "userfaultfd: add minor fault handling", v9. Overview ======== This series adds a new userfaultfd feature, UFFD_FEATURE_MINOR_HUGETLBFS. When enabled (via the UFFDIO_API ioctl), this feature means that any hugetlbfs VMAs registered with UFFDIO_REGISTER_MODE_MISSING will *also* get events for "minor" faults. By "minor" fault, I mean the following situation: Let there exist two mappings (i.e., VMAs) to the same page(s) (shared memory). One of the mappings is registered with userfaultfd (in minor mode), and the other is not. Via the non-UFFD mapping, the underlying pages have already been allocated & filled with some contents. The UFFD mapping has not yet been faulted in; when it is touched for the first time, this results in what I'm calling a "minor" fault. As a concrete example, when working with hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing page. We also add a new ioctl to resolve such faults: UFFDIO_CONTINUE. The idea is, userspace resolves the fault by either a) doing nothing if the contents are already correct, or b) updating the underlying contents using the second, non-UFFD mapping (via memcpy/memset or similar, or something fancier like RDMA, or etc...). In either case, userspace issues UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". Use Case ======== Consider the use case of VM live migration (e.g. under QEMU/KVM): 1. While a VM is still running, we copy the contents of its memory to a target machine. The pages are populated on the target by writing to the non-UFFD mapping, using the setup described above. The VM is still running (and therefore its memory is likely changing), so this may be repeated several times, until we decide the target is "up to date enough". 2. We pause the VM on the source, and start executing on the target machine. During this gap, the VM's user(s) will *see* a pause, so it is desirable to minimize this window. 3. Between the last time any page was copied from the source to the target, and when the VM was paused, the contents of that page may have changed - and therefore the copy we have on the target machine is out of date. Although we can keep track of which pages are out of date, for VMs with large amounts of memory, it is "slow" to transfer this information to the target machine. We want to resume execution before such a transfer would complete. 4. So, the guest begins executing on the target machine. The first time it touches its memory (via the UFFD-registered mapping), userspace wants to intercept this fault. Userspace checks whether or not the page is up to date, and if not, copies the updated page from the source machine, via the non-UFFD mapping. Finally, whether a copy was performed or not, userspace issues a UFFDIO_CONTINUE ioctl to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". We don't have to do all of the final updates on-demand. The userfaultfd manager can, in the background, also copy over updated pages once it receives the map of which pages are up-to-date or not. Interaction with Existing APIs ============================== Because this is a feature, a registered VMA could potentially receive both missing and minor faults. I spent some time thinking through how the existing API interacts with the new feature: UFFDIO_CONTINUE cannot be used to resolve non-minor faults, as it does not allocate a new page. If UFFDIO_CONTINUE is used on a non-minor fault: - For non-shared memory or shmem, -EINVAL is returned. - For hugetlb, -EFAULT is returned. UFFDIO_COPY and UFFDIO_ZEROPAGE cannot be used to resolve minor faults. Without modifications, the existing codepath assumes a new page needs to be allocated. This is okay, since userspace must have a second non-UFFD-registered mapping anyway, thus there isn't much reason to want to use these in any case (just memcpy or memset or similar). - If UFFDIO_COPY is used on a minor fault, -EEXIST is returned. - If UFFDIO_ZEROPAGE is used on a minor fault, -EEXIST is returned (or -EINVAL in the case of hugetlb, as UFFDIO_ZEROPAGE is unsupported in any case). - UFFDIO_WRITEPROTECT simply doesn't work with shared memory, and returns -ENOENT in that case (regardless of the kind of fault). Future Work =========== This series only supports hugetlbfs. I have a second series in flight to support shmem as well, extending the functionality. This series is more mature than the shmem support at this point, and the functionality works fully on hugetlbfs, so this series can be merged first and then shmem support will follow. This patch (of 6): This feature allows userspace to intercept "minor" faults. By "minor" faults, I mean the following situation: Let there exist two mappings (i.e., VMAs) to the same page(s). One of the mappings is registered with userfaultfd (in minor mode), and the other is not. Via the non-UFFD mapping, the underlying pages have already been allocated & filled with some contents. The UFFD mapping has not yet been faulted in; when it is touched for the first time, this results in what I'm calling a "minor" fault. As a concrete example, when working with hugetlbfs, we have huge_pte_none(), but find_lock_page() finds an existing page. This commit adds the new registration mode, and sets the relevant flag on the VMAs being registered. In the hugetlb fault path, if we find that we have huge_pte_none(), but find_lock_page() does indeed find an existing page, then we have a "minor" fault, and if the VMA has the userfaultfd registration flag, we call into userfaultfd to handle it. This is implemented as a new registration mode, instead of an API feature. This is because the alternative implementation has significant drawbacks [1]. However, doing it this was requires we allocate a VM_* flag for the new registration mode. On 32-bit systems, there are no unused bits, so this feature is only supported on architectures with CONFIG_ARCH_USES_HIGH_VMA_FLAGS. When attempting to register a VMA in MINOR mode on 32-bit architectures, we return -EINVAL. [1] https://lore.kernel.org/patchwork/patch/1380226/ [peterx@redhat.com: fix minor fault page leak] Link: https://lkml.kernel.org/r/20210322175132.36659-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210301222728.176417-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20210301222728.176417-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Chinwen Chang Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Peter Xu Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Rostedt Cc: Steven Price Cc: Vlastimil Babka Cc: Adam Ruprecht Cc: Axel Rasmussen Cc: Cannon Matthews Cc: "Dr . David Alan Gilbert" Cc: David Rientjes Cc: Mina Almasry Cc: Oliver Upton Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 1 + arch/x86/Kconfig | 1 + fs/proc/task_mmu.c | 3 ++ fs/userfaultfd.c | 78 ++++++++++++++++++------------- include/linux/mm.h | 7 +++ include/linux/userfaultfd_k.h | 15 +++++- include/trace/events/mmflags.h | 7 +++ include/uapi/linux/userfaultfd.h | 15 +++++- init/Kconfig | 5 ++ mm/hugetlb.c | 80 +++++++++++++++++++++----------- 10 files changed, 150 insertions(+), 62 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7f2a80091337..04c69f606537 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -213,6 +213,7 @@ config ARM64 select SWIOTLB select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK + select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD help ARM 64-bit (AArch64) Linux support. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dac15f646f79..1c350e8782ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -165,6 +165,7 @@ config X86 select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD + select HAVE_ARCH_USERFAULTFD_MINOR if X86_64 && USERFAULTFD select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_WITHIN_STACK_FRAMES diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e862cab69583..fc9784544b24 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT4)] = "", #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + [ilog2(VM_UFFD_MINOR)] = "ui", +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ }; size_t i; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e5ce3b4e6c3d..ba35cafa8b0d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -197,24 +197,21 @@ static inline struct uffd_msg userfault_msg(unsigned long address, msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; msg.arg.pagefault.address = address; + /* + * These flags indicate why the userfault occurred: + * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. + * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault. + * - Neither of these flags being set indicates a MISSING fault. + * + * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write + * fault. Otherwise, it was a read fault. + */ if (flags & FAULT_FLAG_WRITE) - /* - * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the - * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE - * was not set in a UFFD_EVENT_PAGEFAULT, it means it - * was a read fault, otherwise if set it means it's - * a write fault. - */ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; if (reason & VM_UFFD_WP) - /* - * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the - * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was - * not set in a UFFD_EVENT_PAGEFAULT, it means it was - * a missing fault, otherwise if set it means it's a - * write protect fault. - */ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + if (reason & VM_UFFD_MINOR) + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; if (features & UFFD_FEATURE_THREAD_ID) msg.arg.pagefault.feat.ptid = task_pid_vnr(current); return msg; @@ -401,8 +398,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) BUG_ON(ctx->mm != mm); - VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); - VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + /* Any unrecognized flag is a bug. */ + VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); + /* 0 or > 1 flags set is a bug; we expect exactly 1. */ + VM_BUG_ON(!reason || (reason & (reason - 1))); if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; @@ -612,7 +611,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + vma->vm_flags &= ~__VM_UFFD_FLAGS; } mmap_write_unlock(mm); @@ -644,7 +643,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + vma->vm_flags &= ~__VM_UFFD_FLAGS; return 0; } @@ -726,7 +725,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + vma->vm_flags &= ~__VM_UFFD_FLAGS; } } @@ -867,12 +866,12 @@ static int userfaultfd_release(struct inode *inode, struct file *file) for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ - !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + !!(vma->vm_flags & __VM_UFFD_FLAGS)); if (vma->vm_userfaultfd_ctx.ctx != ctx) { prev = vma; continue; } - new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, @@ -1262,9 +1261,19 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, unsigned long vm_flags) { /* FIXME: add WP support to hugetlbfs and shmem */ - return vma_is_anonymous(vma) || - ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) && - !(vm_flags & VM_UFFD_WP)); + if (vm_flags & VM_UFFD_WP) { + if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) + return false; + } + + if (vm_flags & VM_UFFD_MINOR) { + /* FIXME: Add minor fault interception for shmem. */ + if (!is_vm_hugetlb_page(vma)) + return false; + } + + return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || + vma_is_shmem(vma); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1290,14 +1299,19 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (!uffdio_register.mode) goto out; - if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| - UFFDIO_REGISTER_MODE_WP)) + if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) goto out; vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) vm_flags |= VM_UFFD_WP; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + goto out; +#endif + vm_flags |= VM_UFFD_MINOR; + } ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); @@ -1341,7 +1355,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* check not compatible vmas */ ret = -EINVAL; @@ -1421,8 +1435,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & - ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags; + new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1544,7 +1557,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ - !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + !!(cur->vm_flags & __VM_UFFD_FLAGS)); /* * Check not compatible vmas, not strictly required @@ -1595,7 +1608,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } - new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), @@ -1863,6 +1876,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, goto err_out; /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR + uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS; +#endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) diff --git a/include/linux/mm.h b/include/linux/mm.h index 011f43605807..1dbb53c44243 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -372,6 +372,13 @@ extern unsigned int kobjsize(const void *objp); # define VM_GROWSUP VM_NONE #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define VM_UFFD_MINOR_BIT 37 +# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ +#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +# define VM_UFFD_MINOR VM_NONE +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c63ccdae3eab..0390e5ac63b3 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -17,6 +17,9 @@ #include #include +/* The set of all possible UFFD-related VM flags. */ +#define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) + /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining * new flags, since they might collide with O_* ones. We want @@ -71,6 +74,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return vma->vm_flags & VM_UFFD_WP; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_MINOR; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { @@ -85,7 +93,7 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, static inline bool userfaultfd_armed(struct vm_area_struct *vma) { - return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); + return vma->vm_flags & __VM_UFFD_FLAGS; } extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); @@ -132,6 +140,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_minor(struct vm_area_struct *vma) +{ + return false; +} + static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, pte_t pte) { diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 67018d367b9f..629c7a0eaff2 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -137,6 +137,12 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) #define IF_HAVE_VM_SOFTDIRTY(flag,name) #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +# define IF_HAVE_UFFD_MINOR(flag, name) {flag, name}, +#else +# define IF_HAVE_UFFD_MINOR(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -148,6 +154,7 @@ IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) {VM_MAYSHARE, "mayshare" }, \ {VM_GROWSDOWN, "growsdown" }, \ {VM_UFFD_MISSING, "uffd_missing" }, \ +IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ {VM_DENYWRITE, "denywrite" }, \ {VM_UFFD_WP, "uffd_wp" }, \ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 5f2d88212f7c..f24dd4fcbad9 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -19,15 +19,19 @@ * means the userland is reading). */ #define UFFD_API ((__u64)0xAA) +#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \ + UFFDIO_REGISTER_MODE_WP | \ + UFFDIO_REGISTER_MODE_MINOR) #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ UFFD_FEATURE_EVENT_FORK | \ UFFD_FEATURE_EVENT_REMAP | \ - UFFD_FEATURE_EVENT_REMOVE | \ + UFFD_FEATURE_EVENT_REMOVE | \ UFFD_FEATURE_EVENT_UNMAP | \ UFFD_FEATURE_MISSING_HUGETLBFS | \ UFFD_FEATURE_MISSING_SHMEM | \ UFFD_FEATURE_SIGBUS | \ - UFFD_FEATURE_THREAD_ID) + UFFD_FEATURE_THREAD_ID | \ + UFFD_FEATURE_MINOR_HUGETLBFS) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -127,6 +131,7 @@ struct uffd_msg { /* flags for UFFD_EVENT_PAGEFAULT */ #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ +#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -171,6 +176,10 @@ struct uffdio_api { * * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will * be returned, if feature is not requested 0 will be returned. + * + * UFFD_FEATURE_MINOR_HUGETLBFS indicates that minor faults + * can be intercepted (via REGISTER_MODE_MINOR) for + * hugetlbfs-backed pages. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -181,6 +190,7 @@ struct uffdio_api { #define UFFD_FEATURE_EVENT_UNMAP (1<<6) #define UFFD_FEATURE_SIGBUS (1<<7) #define UFFD_FEATURE_THREAD_ID (1<<8) +#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9) __u64 features; __u64 ioctls; @@ -195,6 +205,7 @@ struct uffdio_register { struct uffdio_range range; #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) +#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) __u64 mode; /* diff --git a/init/Kconfig b/init/Kconfig index 9acb7762e971..1413413fcb9f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1644,6 +1644,11 @@ config HAVE_ARCH_USERFAULTFD_WP help Arch has userfaultfd write protection support +config HAVE_ARCH_USERFAULTFD_MINOR + bool + help + Arch has userfaultfd minor fault support + config MEMBARRIER bool "Enable membarrier() system call" if EXPERT default y diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b5977d9709ad..84530876b2ae 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4469,6 +4469,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return 0; } +static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, + unsigned int flags, + unsigned long haddr, + unsigned long reason) +{ + vm_fault_t ret; + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .flags = flags, + + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... same as in + * memory.c + */ + }; + + /* + * hugetlb_fault_mutex and i_mmap_rwsem must be + * dropped before handling userfault. Reacquire + * after handling fault to make calling code simpler. + */ + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + ret = handle_userfault(&vmf, reason); + i_mmap_lock_read(mapping); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + return ret; +} + static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -4507,35 +4545,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, retry: page = find_lock_page(mapping, idx); if (!page) { - /* - * Check for page in userfault range - */ + /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { - u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .flags = flags, - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; - - /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. - */ - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, VM_UFFD_MISSING); - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MISSING); goto out; } @@ -4591,6 +4605,16 @@ retry: VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } + + /* Check for page in userfault range. */ + if (userfaultfd_minor(vma)) { + unlock_page(page); + put_page(page); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MINOR); + goto out; + } } /* From 0d9cadabd193c6008d256533f544de8206fd3a80 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:40 -0700 Subject: [PATCH 1122/1379] userfaultfd: disable huge PMD sharing for MINOR registered VMAs As the comment says: for the MINOR fault use case, although the page might be present and populated in the other (non-UFFD-registered) half of the mapping, it may be out of date, and we explicitly want userspace to get a minor fault so it can check and potentially update the page's contents. Huge PMD sharing would prevent these faults from occurring for suitably aligned areas, so disable it upon UFFD registration. Link: https://lkml.kernel.org/r/20210301222728.176417-3-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 0390e5ac63b3..e060d5f77cc5 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -56,12 +56,19 @@ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, } /* - * Never enable huge pmd sharing on uffd-wp registered vmas, because uffd-wp - * protect information is per pgtable entry. + * Never enable huge pmd sharing on some uffd registered vmas: + * + * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. + * + * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for + * VMAs which share huge pmds. (If you have two mappings to the same + * underlying pages, and fault in the non-UFFD-registered one with a write, + * with huge pmd sharing this would *also* setup the second UFFD-registered + * mapping, and we'd not get minor faults.) */ static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) { - return vma->vm_flags & VM_UFFD_WP; + return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); } static inline bool userfaultfd_missing(struct vm_area_struct *vma) From 714c189108244f1df579689061db1d785d92e7e2 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:45 -0700 Subject: [PATCH 1123/1379] userfaultfd: hugetlbfs: only compile UFFD helpers if config enabled For background, mm/userfaultfd.c provides a general mcopy_atomic implementation. But some types of memory (i.e., hugetlb and shmem) need a slightly different implementation, so they provide their own helpers for this. In other words, userfaultfd is the only caller of these functions. This patch achieves two things: 1. Don't spend time compiling code which will end up never being referenced anyway (a small build time optimization). 2. In patches later in this series, we extend the signature of these helpers with UFFD-specific state (a mode enumeration). Once this happens, we *have to* either not compile the helpers, or unconditionally define the UFFD-only state (which seems messier to me). This includes the declarations in the headers, as otherwise they'd yield warnings about implicitly defining the type of those arguments. Link: https://lkml.kernel.org/r/20210301222728.176417-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Mike Kravetz Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ mm/hugetlb.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d0f310ae3f82..a1dbe4568707 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -134,11 +134,13 @@ void hugetlb_show_meminfo(void); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags); +#ifdef CONFIG_USERFAULTFD int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, struct page **pagep); +#endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); @@ -310,6 +312,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, BUG(); } +#ifdef CONFIG_USERFAULTFD static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, @@ -320,6 +323,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, BUG(); return 0; } +#endif /* CONFIG_USERFAULTFD */ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 84530876b2ae..b105a455124d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4855,6 +4855,7 @@ out_mutex: return ret; } +#ifdef CONFIG_USERFAULTFD /* * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with * modifications for huge pages. @@ -4985,6 +4986,7 @@ out_release_nounlock: put_page(page); goto out; } +#endif /* CONFIG_USERFAULTFD */ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, int refs, struct page **pages, From f619147104c8ea71e120e4936d2b68ec11a1e527 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:49 -0700 Subject: [PATCH 1124/1379] userfaultfd: add UFFDIO_CONTINUE ioctl This ioctl is how userspace ought to resolve "minor" userfaults. The idea is, userspace is notified that a minor fault has occurred. It might change the contents of the page using its second non-UFFD mapping, or not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in the minor fault case, we already have some pre-existing underlying page. Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping. We'd just use memcpy() or similar instead. It turns out hugetlb_mcopy_atomic_pte() already does very close to what we want, if an existing page is provided via `struct page **pagep`. We already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so just extend that design: add an enum for the three modes of operation, and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE case. (Basically, look up the existing page, and avoid adding the existing page to the page cache or calling set_page_huge_active() on it.) Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 67 ++++++++++++++++++++++++++++++++ include/linux/hugetlb.h | 3 ++ include/linux/userfaultfd_k.h | 18 +++++++++ include/uapi/linux/userfaultfd.h | 21 +++++++++- mm/hugetlb.c | 40 ++++++++++++------- mm/userfaultfd.c | 37 +++++++++++------- 6 files changed, 156 insertions(+), 30 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ba35cafa8b0d..14f92285d04f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1487,6 +1487,10 @@ out_unlock: if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); + /* CONTINUE ioctl is only supported for MINOR ranges. */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR)) + ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE); + /* * Now that we scanned all vmas we can already tell * userland which ioctls methods are guaranteed to @@ -1840,6 +1844,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, return ret; } +static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_continue uffdio_continue; + struct uffdio_continue __user *user_uffdio_continue; + struct userfaultfd_wake_range range; + + user_uffdio_continue = (struct uffdio_continue __user *)arg; + + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + + ret = -EFAULT; + if (copy_from_user(&uffdio_continue, user_uffdio_continue, + /* don't copy the output fields */ + sizeof(uffdio_continue) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, &uffdio_continue.range.start, + uffdio_continue.range.len); + if (ret) + goto out; + + ret = -EINVAL; + /* double check for wraparound just in case. */ + if (uffdio_continue.range.start + uffdio_continue.range.len <= + uffdio_continue.range.start) { + goto out; + } + if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mcopy_continue(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len, + &ctx->mmap_changing); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) { + range.start = uffdio_continue.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN; + +out: + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -1927,6 +1991,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_WRITEPROTECT: ret = userfaultfd_writeprotect(ctx, arg); break; + case UFFDIO_CONTINUE: + ret = userfaultfd_continue(ctx, arg); + break; } return ret; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a1dbe4568707..b92f25ccef58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,7 @@ #include #include #include +#include struct ctl_table; struct user_struct; @@ -139,6 +140,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep); #endif /* CONFIG_USERFAULTFD */ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, @@ -318,6 +320,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { BUG(); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e060d5f77cc5..794d1538b8ba 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -37,6 +37,22 @@ extern int sysctl_unprivileged_userfaultfd; extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); +/* + * The mode of operation for __mcopy_atomic and its helpers. + * + * This is almost an implementation detail (mcopy_atomic below doesn't take this + * as a parameter), but it's exposed here because memory-kind-specific + * implementations (e.g. hugetlbfs) need to know the mode of operation. + */ +enum mcopy_atomic_mode { + /* A normal copy_from_user into the destination range. */ + MCOPY_ATOMIC_NORMAL, + /* Don't copy; map the destination range to the zero page. */ + MCOPY_ATOMIC_ZEROPAGE, + /* Just install pte(s) with the existing page(s) in the page cache. */ + MCOPY_ATOMIC_CONTINUE, +}; + extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool *mmap_changing, __u64 mode); @@ -44,6 +60,8 @@ extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, bool *mmap_changing); +extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long len, bool *mmap_changing); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, bool *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index f24dd4fcbad9..bafbeb1a2624 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -40,10 +40,12 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE | \ - (__u64)1 << _UFFDIO_WRITEPROTECT) + (__u64)1 << _UFFDIO_WRITEPROTECT | \ + (__u64)1 << _UFFDIO_CONTINUE) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ - (__u64)1 << _UFFDIO_COPY) + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_CONTINUE) /* * Valid ioctl command number range with this API is from 0x00 to @@ -59,6 +61,7 @@ #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_WRITEPROTECT (0x06) +#define _UFFDIO_CONTINUE (0x07) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -77,6 +80,8 @@ struct uffdio_zeropage) #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ struct uffdio_writeprotect) +#define UFFDIO_CONTINUE _IOR(UFFDIO, _UFFDIO_CONTINUE, \ + struct uffdio_continue) /* read() structure */ struct uffd_msg { @@ -268,6 +273,18 @@ struct uffdio_writeprotect { __u64 mode; }; +struct uffdio_continue { + struct uffdio_range range; +#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 mapped; +}; + /* * Flags for the userfaultfd(2) system call itself. */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b105a455124d..533e5a26e437 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include "internal.h" @@ -4865,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct address_space *mapping; pgoff_t idx; unsigned long size; @@ -4876,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spinlock_t *ptl; int ret; struct page *page; + int writable; - if (!*pagep) { + mapping = dst_vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, idx); + if (!page) + goto out; + } else if (!*pagep) { ret = -ENOMEM; page = alloc_huge_page(dst_vma, dst_addr, 0); if (IS_ERR(page)) @@ -4906,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, */ __SetPageUptodate(page); - mapping = dst_vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, dst_vma, dst_addr); - - /* - * If shared, add to page cache - */ - if (vm_shared) { + /* Add shared, newly allocated pages to the page cache. */ + if (vm_shared && !is_continue) { size = i_size_read(mapping->host) >> huge_page_shift(h); ret = -EFAULT; if (idx >= size) @@ -4957,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } - _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); - if (dst_vma->vm_flags & VM_WRITE) + /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */ + if (is_continue && !vm_shared) + writable = 0; + else + writable = dst_vma->vm_flags & VM_WRITE; + + _dst_pte = make_huge_pte(dst_vma, page, writable); + if (writable) _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); @@ -4972,15 +4983,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); - SetHPageMigratable(page); - if (vm_shared) + if (!is_continue) + SetHPageMigratable(page); + if (vm_shared || is_continue) unlock_page(page); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); - if (vm_shared) + if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: put_page(page); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 063cbb17e8d8..e14b3820c6a8 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage) + enum mcopy_atomic_mode mode) { int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED; int vm_shared = dst_vma->vm_flags & VM_SHARED; @@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ - if (zeropage) { + if (mode == MCOPY_ATOMIC_ZEROPAGE) { mmap_read_unlock(dst_mm); return -EINVAL; } @@ -273,8 +273,6 @@ retry: } while (src_addr < src_start + len) { - pte_t dst_pteval; - BUG_ON(dst_addr >= dst_start + len); /* @@ -297,16 +295,16 @@ retry: goto out_unlock; } - err = -EEXIST; - dst_pteval = huge_ptep_get(dst_pte); - if (!huge_pte_none(dst_pteval)) { + if (mode != MCOPY_ATOMIC_CONTINUE && + !huge_pte_none(huge_ptep_get(dst_pte))) { + err = -EEXIST; mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); goto out_unlock; } err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, - dst_addr, src_addr, &page); + dst_addr, src_addr, mode, &page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_unlock_read(mapping); @@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage); + enum mcopy_atomic_mode mode); #endif /* CONFIG_HUGETLB_PAGE */ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, @@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage, + enum mcopy_atomic_mode mcopy_mode, bool *mmap_changing, __u64 mode) { @@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, long copied; struct page *page; bool wp_copy; + bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE); /* * Sanitize the command parameters: @@ -527,10 +526,12 @@ retry: */ if (is_vm_hugetlb_page(dst_vma)) return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, - src_start, len, zeropage); + src_start, len, mcopy_mode); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; + if (mcopy_mode == MCOPY_ATOMIC_CONTINUE) + goto out_unlock; /* * Ensure the dst_vma has a anon_vma or this page @@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, bool *mmap_changing, __u64 mode) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, - mmap_changing, mode); + return __mcopy_atomic(dst_mm, dst_start, src_start, len, + MCOPY_ATOMIC_NORMAL, mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, + mmap_changing, 0); +} + +ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, bool *mmap_changing) +{ + return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, + mmap_changing, 0); } int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, From b8da5cd4e5f1ce1274140e200a9116b7fe61dd87 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:53 -0700 Subject: [PATCH 1125/1379] userfaultfd: update documentation to describe minor fault handling Reword / reorganize things a little bit into "lists", so new features / modes / ioctls can sort of just be appended. Describe how UFFDIO_REGISTER_MODE_MINOR and UFFDIO_CONTINUE can be used to intercept and resolve minor faults. Make it clear that COPY and ZEROPAGE are used for MISSING faults, whereas CONTINUE is used for MINOR faults. Link: https://lkml.kernel.org/r/20210301222728.176417-6-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/userfaultfd.rst | 99 ++++++++++++-------- 1 file changed, 62 insertions(+), 37 deletions(-) diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 65eefa66c0ba..3aa38e8b8361 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -63,36 +63,36 @@ the generic ioctl available. The ``uffdio_api.features`` bitmask returned by the ``UFFDIO_API`` ioctl defines what memory types are supported by the ``userfaultfd`` and what -events, except page fault notifications, may be generated. +events, except page fault notifications, may be generated: -If the kernel supports registering ``userfaultfd`` ranges on hugetlbfs -virtual memory areas, ``UFFD_FEATURE_MISSING_HUGETLBFS`` will be set in -``uffdio_api.features``. Similarly, ``UFFD_FEATURE_MISSING_SHMEM`` will be -set if the kernel supports registering ``userfaultfd`` ranges on shared -memory (covering all shmem APIs, i.e. tmpfs, ``IPCSHM``, ``/dev/zero``, -``MAP_SHARED``, ``memfd_create``, etc). +- The ``UFFD_FEATURE_EVENT_*`` flags indicate that various other events + other than page faults are supported. These events are described in more + detail below in the `Non-cooperative userfaultfd`_ section. -The userland application that wants to use ``userfaultfd`` with hugetlbfs -or shared memory need to set the corresponding flag in -``uffdio_api.features`` to enable those features. +- ``UFFD_FEATURE_MISSING_HUGETLBFS`` and ``UFFD_FEATURE_MISSING_SHMEM`` + indicate that the kernel supports ``UFFDIO_REGISTER_MODE_MISSING`` + registrations for hugetlbfs and shared memory (covering all shmem APIs, + i.e. tmpfs, ``IPCSHM``, ``/dev/zero``, ``MAP_SHARED``, ``memfd_create``, + etc) virtual memory areas, respectively. -If the userland desires to receive notifications for events other than -page faults, it has to verify that ``uffdio_api.features`` has appropriate -``UFFD_FEATURE_EVENT_*`` bits set. These events are described in more -detail below in `Non-cooperative userfaultfd`_ section. +- ``UFFD_FEATURE_MINOR_HUGETLBFS`` indicates that the kernel supports + ``UFFDIO_REGISTER_MODE_MINOR`` registration for hugetlbfs virtual memory + areas. -Once the ``userfaultfd`` has been enabled the ``UFFDIO_REGISTER`` ioctl should -be invoked (if present in the returned ``uffdio_api.ioctls`` bitmask) to -register a memory range in the ``userfaultfd`` by setting the +The userland application should set the feature flags it intends to use +when invoking the ``UFFDIO_API`` ioctl, to request that those features be +enabled if supported. + +Once the ``userfaultfd`` API has been enabled the ``UFFDIO_REGISTER`` +ioctl should be invoked (if present in the returned ``uffdio_api.ioctls`` +bitmask) to register a memory range in the ``userfaultfd`` by setting the uffdio_register structure accordingly. The ``uffdio_register.mode`` bitmask will specify to the kernel which kind of faults to track for -the range (``UFFDIO_REGISTER_MODE_MISSING`` would track missing -pages). The ``UFFDIO_REGISTER`` ioctl will return the +the range. The ``UFFDIO_REGISTER`` ioctl will return the ``uffdio_register.ioctls`` bitmask of ioctls that are suitable to resolve userfaults on the range registered. Not all ioctls will necessarily be -supported for all memory types depending on the underlying virtual -memory backend (anonymous memory vs tmpfs vs real filebacked -mappings). +supported for all memory types (e.g. anonymous memory vs. shmem vs. +hugetlbfs), or all types of intercepted faults. Userland can use the ``uffdio_register.ioctls`` to manage the virtual address space in the background (to add or potentially also remove @@ -100,21 +100,46 @@ memory from the ``userfaultfd`` registered range). This means a userfault could be triggering just before userland maps in the background the user-faulted page. -The primary ioctl to resolve userfaults is ``UFFDIO_COPY``. That -atomically copies a page into the userfault registered range and wakes -up the blocked userfaults -(unless ``uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE`` is set). -Other ioctl works similarly to ``UFFDIO_COPY``. They're atomic as in -guaranteeing that nothing can see an half copied page since it'll -keep userfaulting until the copy has finished. +Resolving Userfaults +-------------------- + +There are three basic ways to resolve userfaults: + +- ``UFFDIO_COPY`` atomically copies some existing page contents from + userspace. + +- ``UFFDIO_ZEROPAGE`` atomically zeros the new page. + +- ``UFFDIO_CONTINUE`` maps an existing, previously-populated page. + +These operations are atomic in the sense that they guarantee nothing can +see a half-populated page, since readers will keep userfaulting until the +operation has finished. + +By default, these wake up userfaults blocked on the range in question. +They support a ``UFFDIO_*_MODE_DONTWAKE`` ``mode`` flag, which indicates +that waking will be done separately at some later time. + +Which ioctl to choose depends on the kind of page fault, and what we'd +like to do to resolve it: + +- For ``UFFDIO_REGISTER_MODE_MISSING`` faults, the fault needs to be + resolved by either providing a new page (``UFFDIO_COPY``), or mapping + the zero page (``UFFDIO_ZEROPAGE``). By default, the kernel would map + the zero page for a missing fault. With userfaultfd, userspace can + decide what content to provide before the faulting thread continues. + +- For ``UFFDIO_REGISTER_MODE_MINOR`` faults, there is an existing page (in + the page cache). Userspace has the option of modifying the page's + contents before resolving the fault. Once the contents are correct + (modified or not), userspace asks the kernel to map the page and let the + faulting thread continue with ``UFFDIO_CONTINUE``. Notes: -- If you requested ``UFFDIO_REGISTER_MODE_MISSING`` when registering then - you must provide some kind of page in your thread after reading from - the uffd. You must provide either ``UFFDIO_COPY`` or ``UFFDIO_ZEROPAGE``. - The normal behavior of the OS automatically providing a zero page on - an anonymous mmaping is not in place. +- You can tell which kind of fault occurred by examining + ``pagefault.flags`` within the ``uffd_msg``, checking for the + ``UFFD_PAGEFAULT_FLAG_*`` flags. - None of the page-delivering ioctls default to the range that you registered with. You must fill in all fields for the appropriate @@ -122,9 +147,9 @@ Notes: - You get the address of the access that triggered the missing page event out of a struct uffd_msg that you read in the thread from the - uffd. You can supply as many pages as you want with ``UFFDIO_COPY`` or - ``UFFDIO_ZEROPAGE``. Keep in mind that unless you used DONTWAKE then - the first of any of those IOCTLs wakes up the faulting thread. + uffd. You can supply as many pages as you want with these IOCTLs. + Keep in mind that unless you used DONTWAKE then the first of any of + those IOCTLs wakes up the faulting thread. - Be sure to test for all errors including (``pollfd[0].revents & POLLERR``). This can happen, e.g. when ranges From f0fa94330919be8ec5620382b50f1c72844c9224 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Tue, 4 May 2021 18:35:57 -0700 Subject: [PATCH 1126/1379] userfaultfd/selftests: add test exercising minor fault handling Fix a dormant bug in userfaultfd_events_test(), where we did `return faulting_process(0)` instead of `exit(faulting_process(0))`. This caused the forked process to keep running, trying to execute any further test cases after the events test in parallel with the "real" process. Add a simple test case which exercises minor faults. In short, it does the following: 1. "Sets up" an area (area_dst) and a second shared mapping to the same underlying pages (area_dst_alias). 2. Register one of these areas with userfaultfd, in minor fault mode. 3. Start a second thread to handle any minor faults. 4. Populate the underlying pages with the non-UFFD-registered side of the mapping. Basically, memset() each page with some arbitrary contents. 5. Then, using the UFFD-registered mapping, read all of the page contents, asserting that the contents match expectations (we expect the minor fault handling thread can modify the page contents before resolving the fault). The minor fault handling thread, upon receiving an event, flips all the bits (~) in that page, just to prove that it can modify it in some arbitrary way. Then it issues a UFFDIO_CONTINUE ioctl, to setup the mapping and resolve the fault. The reading thread should wake up and see this modification. Currently the minor fault test is only enabled in hugetlb_shared mode, as this is the only configuration the kernel feature supports. Link: https://lkml.kernel.org/r/20210301222728.176417-7-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Adam Ruprecht Cc: Alexander Viro Cc: Alexey Dobriyan Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Cannon Matthews Cc: Catalin Marinas Cc: Chinwen Chang Cc: David Rientjes Cc: "Dr . David Alan Gilbert" Cc: Huang Ying Cc: Ingo Molnar Cc: Jann Horn Cc: Jerome Glisse Cc: Kirill A. Shutemov Cc: Lokesh Gidra Cc: "Matthew Wilcox (Oracle)" Cc: Michael Ellerman Cc: "Michal Koutn" Cc: Michel Lespinasse Cc: Mike Kravetz Cc: Mike Rapoport Cc: Mina Almasry Cc: Nicholas Piggin Cc: Oliver Upton Cc: Shaohua Li Cc: Shawn Anastasio Cc: Steven Price Cc: Steven Rostedt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/userfaultfd.c | 164 ++++++++++++++++++++++- 1 file changed, 158 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 92b8ec423201..f5ab5e0312e7 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -81,6 +81,8 @@ static volatile bool test_uffdio_copy_eexist = true; static volatile bool test_uffdio_zeropage_eexist = true; /* Whether to test uffd write-protection */ static bool test_uffdio_wp = false; +/* Whether to test uffd minor faults */ +static bool test_uffdio_minor = false; static bool map_shared; static int huge_fd; @@ -96,6 +98,7 @@ struct uffd_stats { int cpu; unsigned long missing_faults; unsigned long wp_faults; + unsigned long minor_faults; }; /* pthread_mutex_t starts at page offset 0 */ @@ -153,17 +156,19 @@ static void uffd_stats_reset(struct uffd_stats *uffd_stats, uffd_stats[i].cpu = i; uffd_stats[i].missing_faults = 0; uffd_stats[i].wp_faults = 0; + uffd_stats[i].minor_faults = 0; } } static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) { int i; - unsigned long long miss_total = 0, wp_total = 0; + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; for (i = 0; i < n_cpus; i++) { miss_total += stats[i].missing_faults; wp_total += stats[i].wp_faults; + minor_total += stats[i].minor_faults; } printf("userfaults: %llu missing (", miss_total); @@ -172,6 +177,9 @@ static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) printf("\b), %llu wp (", wp_total); for (i = 0; i < n_cpus; i++) printf("%lu+", stats[i].wp_faults); + printf("\b), %llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].minor_faults); printf("\b)\n"); } @@ -328,7 +336,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = { }; static struct uffd_test_ops hugetlb_uffd_test_ops = { - .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC, + .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE), .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, @@ -362,6 +370,22 @@ static void wp_range(int ufd, __u64 start, __u64 len, bool wp) } } +static void continue_range(int ufd, __u64 start, __u64 len) +{ + struct uffdio_continue req; + + req.range.start = start; + req.range.len = len; + req.mode = 0; + + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) { + fprintf(stderr, + "UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n", + (uint64_t)start); + exit(1); + } +} + static void *locking_thread(void *arg) { unsigned long cpu = (unsigned long) arg; @@ -569,8 +593,32 @@ static void uffd_handle_page_fault(struct uffd_msg *msg, } if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write protect page faults */ wp_range(uffd, msg->arg.pagefault.address, page_size, false); stats->wp_faults++; + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { + uint8_t *area; + int b; + + /* + * Minor page faults + * + * To prove we can modify the original range for testing + * purposes, we're going to bit flip this range before + * continuing. + * + * Note that this requires all minor page fault tests operate on + * area_dst (non-UFFD-registered) and area_dst_alias + * (UFFD-registered). + */ + + area = (uint8_t *)(area_dst + + ((char *)msg->arg.pagefault.address - + area_dst_alias)); + for (b = 0; b < page_size; ++b) + area[b] = ~area[b]; + continue_range(uffd, msg->arg.pagefault.address, page_size); + stats->minor_faults++; } else { /* Missing page faults */ if (bounces & BOUNCE_VERIFY && @@ -779,7 +827,7 @@ static int stress(struct uffd_stats *uffd_stats) return 0; } -static int userfaultfd_open(int features) +static int userfaultfd_open_ext(uint64_t *features) { struct uffdio_api uffdio_api; @@ -792,7 +840,7 @@ static int userfaultfd_open(int features) uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; - uffdio_api.features = features; + uffdio_api.features = *features; if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to " "run with either root or ptrace capability.\n"); @@ -804,9 +852,15 @@ static int userfaultfd_open(int features) return 1; } + *features = uffdio_api.features; return 0; } +static int userfaultfd_open(uint64_t features) +{ + return userfaultfd_open_ext(&features); +} + sigjmp_buf jbuf, *sigbuf; static void sighndl(int sig, siginfo_t *siginfo, void *ptr) @@ -1112,7 +1166,7 @@ static int userfaultfd_events_test(void) } if (!pid) - return faulting_process(0); + exit(faulting_process(0)); waitpid(pid, &err, 0); if (err) { @@ -1215,6 +1269,102 @@ static int userfaultfd_sig_test(void) return userfaults != 0; } +static int userfaultfd_minor_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long expected_ioctls; + unsigned long p; + pthread_t uffd_mon; + uint8_t expected_byte; + void *expected_page; + char c; + struct uffd_stats stats = { 0 }; + uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS; + + if (!test_uffdio_minor) + return 0; + + printf("testing minor faults: "); + fflush(stdout); + + if (uffd_test_ops->release_pages(area_dst)) + return 1; + + if (userfaultfd_open_ext(&features)) + return 1; + /* If kernel reports the feature isn't supported, skip the test. */ + if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) { + printf("skipping test due to lack of feature support\n"); + fflush(stdout); + return 0; + } + + uffdio_register.range.start = (unsigned long)area_dst_alias; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + fprintf(stderr, "register failure\n"); + exit(1); + } + + expected_ioctls = uffd_test_ops->expected_ioctls; + expected_ioctls |= 1 << _UFFDIO_CONTINUE; + if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) { + fprintf(stderr, "unexpected missing ioctl(s)\n"); + exit(1); + } + + /* + * After registering with UFFD, populate the non-UFFD-registered side of + * the shared mapping. This should *not* trigger any UFFD minor faults. + */ + for (p = 0; p < nr_pages; ++p) { + memset(area_dst + (p * page_size), p % ((uint8_t)-1), + page_size); + } + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) { + perror("uffd_poll_thread create"); + exit(1); + } + + /* + * Read each of the pages back using the UFFD-registered mapping. We + * expect that the first time we touch a page, it will result in a minor + * fault. uffd_poll_thread will resolve the fault by bit-flipping the + * page's contents, and then issuing a CONTINUE ioctl. + */ + + if (posix_memalign(&expected_page, page_size, page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + + for (p = 0; p < nr_pages; ++p) { + expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, area_dst_alias + (p * page_size), + page_size)) { + fprintf(stderr, + "unexpected page contents after minor fault\n"); + exit(1); + } + } + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) { + perror("pipe write"); + exit(1); + } + if (pthread_join(uffd_mon, NULL)) + return 1; + + close(uffd); + + uffd_stats_report(&stats, 1); + + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; +} + static int userfaultfd_stress(void) { void *area; @@ -1413,7 +1563,7 @@ static int userfaultfd_stress(void) close(uffd); return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test(); + || userfaultfd_events_test() || userfaultfd_minor_test(); } /* @@ -1454,6 +1604,8 @@ static void set_test_type(const char *type) map_shared = true; test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; + /* Minor faults require shared hugetlb; only enable here. */ + test_uffdio_minor = true; } else if (!strcmp(type, "shmem")) { map_shared = true; test_type = TEST_SHMEM; From b6676de8d7b48724d4cd3a3742c62fa525baa904 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 4 May 2021 18:36:01 -0700 Subject: [PATCH 1127/1379] mm/vmscan: move RECLAIM* bits to uapi header It is currently not obvious that the RECLAIM_* bits are part of the uapi since they are defined in vmscan.c. Move them to a uapi header to make it obvious. This should have no functional impact. Link: https://lkml.kernel.org/r/20210219172557.08074910@viggo.jf.intel.com Signed-off-by: Dave Hansen Reviewed-by: Ben Widawsky Reviewed-by: Oscar Salvador Acked-by: David Rientjes Acked-by: Christoph Lameter Cc: Alex Shi Cc: Daniel Wagner Cc: "Tobin C. Harding" Cc: Christoph Lameter Cc: Huang Ying Cc: Dan Williams Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mempolicy.h | 7 +++++++ mm/vmscan.c | 8 -------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 8948467b3992..4832fd0b5642 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -64,5 +64,12 @@ enum { #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ +/* + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl + * ABI. New bits are OK, but existing bits can never change. + */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 42aaef30633e..671143dbf809 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4086,14 +4086,6 @@ module_init(kswapd_init) */ int node_reclaim_mode __read_mostly; -/* - * These bit locations are exposed in the vm.zone_reclaim_mode sysctl - * ABI. New bits are OK, but existing bits can never change. - */ -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ -#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ - /* * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of From 202e35db5e719ee8af6028183403f475e243f82d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 4 May 2021 18:36:04 -0700 Subject: [PATCH 1128/1379] mm/vmscan: replace implicit RECLAIM_ZONE checks with explicit checks RECLAIM_ZONE was assumed to be unused because it was never explicitly used in the kernel. However, there were a number of places where it was checked implicitly by checking 'node_reclaim_mode' for a zero value. These zero checks are not great because it is not obvious what a zero mode *means* in the code. Replace them with a helper which makes it more obvious: node_reclaim_enabled(). This helper also provides a handy place to explicitly check the RECLAIM_ZONE bit itself. Check it explicitly there to make it more obvious where the bit can affect behavior. This should have no functional impact. Link: https://lkml.kernel.org/r/20210219172559.BF589C44@viggo.jf.intel.com Signed-off-by: Dave Hansen Reviewed-by: Ben Widawsky Reviewed-by: Oscar Salvador Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Alex Shi Cc: "Tobin C. Harding" Cc: Huang Ying Cc: Dan Williams Cc: Qian Cai Cc: Daniel Wagner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 7 +++++++ mm/khugepaged.c | 2 +- mm/page_alloc.c | 2 +- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4cc6ec3bf0ab..42191da1bdc9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -12,6 +12,7 @@ #include #include #include +#include #include struct notifier_block; @@ -378,6 +379,12 @@ extern int sysctl_min_slab_ratio; #define node_reclaim_mode 0 #endif +static inline bool node_reclaim_enabled(void) +{ + /* Is any node_reclaim_mode bit set? */ + return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP); +} + extern void check_move_unevictable_pages(struct pagevec *pvec); extern int kswapd_run(int nid); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a03569eda183..ea74da3232ab 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -809,7 +809,7 @@ static bool khugepaged_scan_abort(int nid) * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. */ - if (!node_reclaim_mode) + if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 80fa6e0f9ed9..19cdd8a829dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3968,7 +3968,7 @@ retry: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (node_reclaim_mode == 0 || + if (!node_reclaim_enabled() || !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; From 8efb4b596df05f004e847d6bfadad3492b766ab3 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:08 -0700 Subject: [PATCH 1129/1379] mm: vmscan: use nid from shrink_control for tracepoint Patch series "Make shrinker's nr_deferred memcg aware", v10. Recently huge amount one-off slab drop was seen on some vfs metadata heavy workloads, it turned out there were huge amount accumulated nr_deferred objects seen by the shrinker. On our production machine, I saw absurd number of nr_deferred shown as the below tracing result: <...>-48776 [032] .... 27970562.458916: mm_shrink_slab_start: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 0 objects to shrink 2531805877005 gfp_flags GFP_HIGHUSER_MOVABLE pgs_scanned 32 lru_pgs 9300 cache items 1667 delta 11 total_scan 833 There are 2.5 trillion deferred objects on one node, assuming all of them are dentry (192 bytes per object), so the total size of deferred on one node is ~480TB. It is definitely ridiculous. I managed to reproduce this problem with kernel build workload plus negative dentry generator. First step, run the below kernel build test script: NR_CPUS=`cat /proc/cpuinfo | grep -e processor | wc -l` cd /root/Buildarea/linux-stable for i in `seq 1500`; do cgcreate -g memory:kern_build echo 4G > /sys/fs/cgroup/memory/kern_build/memory.limit_in_bytes echo 3 > /proc/sys/vm/drop_caches cgexec -g memory:kern_build make clean > /dev/null 2>&1 cgexec -g memory:kern_build make -j$NR_CPUS > /dev/null 2>&1 cgdelete -g memory:kern_build done Then run the below negative dentry generator script: NR_CPUS=`cat /proc/cpuinfo | grep -e processor | wc -l` mkdir /sys/fs/cgroup/memory/test echo $$ > /sys/fs/cgroup/memory/test/tasks for i in `seq $NR_CPUS`; do while true; do FILE=`head /dev/urandom | tr -dc A-Za-z0-9 | head -c 64` cat $FILE 2>/dev/null done & done Then kswapd will shrink half of dentry cache in just one loop as the below tracing result showed: kswapd0-475 [028] .... 305968.252561: mm_shrink_slab_start: super_cache_scan+0x0/0x190 0000000024acf00c: nid: 0 objects to shrink 4994376020 gfp_flags GFP_KERNEL cache items 93689873 delta 45746 total_scan 46844936 priority 12 kswapd0-475 [021] .... 306013.099399: mm_shrink_slab_end: super_cache_scan+0x0/0x190 0000000024acf00c: nid: 0 unused scan count 4994376020 new scan count 4947576838 total_scan 8 last shrinker return val 46844928 There were huge number of deferred objects before the shrinker was called, the behavior does match the code but it might be not desirable from the user's stand of point. The excessive amount of nr_deferred might be accumulated due to various reasons, for example: * GFP_NOFS allocation * Significant times of small amount scan (< scan_batch, 1024 for vfs metadata) However the LRUs of slabs are per memcg (memcg-aware shrinkers) but the deferred objects is per shrinker, this may have some bad effects: * Poor isolation among memcgs. Some memcgs which happen to have frequent limit reclaim may get nr_deferred accumulated to a huge number, then other innocent memcgs may take the fall. In our case the main workload was hit. * Unbounded deferred objects. There is no cap for deferred objects, it can outgrow ridiculously as the tracing result showed. * Easy to get out of control. Although shrinkers take into account deferred objects, but it can go out of control easily. One misconfigured memcg could incur absurd amount of deferred objects in a period of time. * Sort of reclaim problems, i.e. over reclaim, long reclaim latency, etc. There may be hundred GB slab caches for vfe metadata heavy workload, shrink half of them may take minutes. We observed latency spike due to the prolonged reclaim. These issues also have been discussed in https://lore.kernel.org/linux-mm/20200916185823.5347-1-shy828301@gmail.com/. The patchset is the outcome of that discussion. So this patchset makes nr_deferred per-memcg to tackle the problem. It does: * Have memcg_shrinker_deferred per memcg per node, just like what shrinker_map does. Instead it is an atomic_long_t array, each element represent one shrinker even though the shrinker is not memcg aware, this simplifies the implementation. For memcg aware shrinkers, the deferred objects are just accumulated to its own memcg. The shrinkers just see nr_deferred from its own memcg. Non memcg aware shrinkers still use global nr_deferred from struct shrinker. * Once the memcg is offlined, its nr_deferred will be reparented to its parent along with LRUs. * The root memcg has memcg_shrinker_deferred array too. It simplifies the handling of reparenting to root memcg. * Cap nr_deferred to 2x of the length of lru. The idea is borrowed from Dave Chinner's series (https://lore.kernel.org/linux-xfs/20191031234618.15403-1-david@fromorbit.com/) The downside is each memcg has to allocate extra memory to store the nr_deferred array. On our production environment, there are typically around 40 shrinkers, so each memcg needs ~320 bytes. 10K memcgs would need ~3.2MB memory. It seems fine. We have been running the patched kernel on some hosts of our fleet (test and production) for months, it works very well. The monitor data shows the working set is sustained as expected. This patch (of 13): The tracepoint's nid should show what node the shrink happens on, the start tracepoint uses nid from shrinkctl, but the nid might be set to 0 before end tracepoint if the shrinker is not NUMA aware, so the tracing log may show the shrink happens on one node but end up on the other node. It seems confusing. And the following patch will remove using nid directly in do_shrink_slab(), this patch also helps cleanup the code. Link: https://lkml.kernel.org/r/20210311190845.9708-1-shy828301@gmail.com Link: https://lkml.kernel.org/r/20210311190845.9708-2-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 671143dbf809..d231af7cae06 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -536,7 +536,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; } From 2bfd36374edd9ed7f2ebf66cacebedf7273901cb Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:11 -0700 Subject: [PATCH 1130/1379] mm: vmscan: consolidate shrinker_maps handling code The shrinker map management is not purely memcg specific, it is at the intersection between memory cgroup and shrinkers. It's allocation and assignment of a structure, and the only memcg bit is the map is being stored in a memcg structure. So move the shrinker_maps handling code into vmscan.c for tighter integration with shrinker code, and remove the "memcg_" prefix. There is no functional change. Link: https://lkml.kernel.org/r/20210311190845.9708-3-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 11 ++-- mm/huge_memory.c | 4 +- mm/list_lru.c | 6 +- mm/memcontrol.c | 130 +----------------------------------- mm/vmscan.c | 132 ++++++++++++++++++++++++++++++++++++- 5 files changed, 142 insertions(+), 141 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5904716f29ba..7dbd2c9bad32 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1610,10 +1610,9 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -extern int memcg_expand_shrinker_maps(int new_id); - -extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id); +int alloc_shrinker_maps(struct mem_cgroup *memcg); +void free_shrinker_maps(struct mem_cgroup *memcg); +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; @@ -1623,8 +1622,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, - int nid, int shrinker_id) +static inline void set_shrinker_bit(struct mem_cgroup *memcg, + int nid, int shrinker_id) { } #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b3788683f7d3..98456017744d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2830,8 +2830,8 @@ void deferred_split_huge_page(struct page *page) ds_queue->split_queue_len++; #ifdef CONFIG_MEMCG if (memcg) - memcg_set_shrinker_bit(memcg, page_to_nid(page), - deferred_split_shrinker.id); + set_shrinker_bit(memcg, page_to_nid(page), + deferred_split_shrinker.id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); diff --git a/mm/list_lru.c b/mm/list_lru.c index 6f067b6b935f..cd58790d0fb3 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -125,8 +125,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) - memcg_set_shrinker_bit(memcg, nid, - lru_shrinker_id(lru)); + set_shrinker_bit(memcg, nid, + lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; @@ -540,7 +540,7 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid, if (src->nr_items) { dst->nr_items += src->nr_items; - memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); + set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c100265dc393..09fd17ba6de2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -400,130 +400,6 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif -static int memcg_shrinker_map_size; -static DEFINE_MUTEX(memcg_shrinker_map_mutex); - -static void memcg_free_shrinker_map_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct memcg_shrinker_map, rcu)); -} - -static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg, - int size, int old_size) -{ - struct memcg_shrinker_map *new, *old; - struct mem_cgroup_per_node *pn; - int nid; - - lockdep_assert_held(&memcg_shrinker_map_mutex); - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - old = rcu_dereference_protected(pn->shrinker_map, true); - /* Not yet online memcg */ - if (!old) - return 0; - - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); - if (!new) - return -ENOMEM; - - /* Set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_size); - memset((void *)new->map + old_size, 0, size - old_size); - - rcu_assign_pointer(pn->shrinker_map, new); - call_rcu(&old->rcu, memcg_free_shrinker_map_rcu); - } - - return 0; -} - -static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *pn; - struct memcg_shrinker_map *map; - int nid; - - if (mem_cgroup_is_root(memcg)) - return; - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - map = rcu_dereference_protected(pn->shrinker_map, true); - kvfree(map); - rcu_assign_pointer(pn->shrinker_map, NULL); - } -} - -static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) -{ - struct memcg_shrinker_map *map; - int nid, size, ret = 0; - - if (mem_cgroup_is_root(memcg)) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - size = memcg_shrinker_map_size; - for_each_node(nid) { - map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); - if (!map) { - memcg_free_shrinker_maps(memcg); - ret = -ENOMEM; - break; - } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); - } - mutex_unlock(&memcg_shrinker_map_mutex); - - return ret; -} - -int memcg_expand_shrinker_maps(int new_id) -{ - int size, old_size, ret = 0; - struct mem_cgroup *memcg; - - size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); - old_size = memcg_shrinker_map_size; - if (size <= old_size) - return 0; - - mutex_lock(&memcg_shrinker_map_mutex); - if (!root_mem_cgroup) - goto unlock; - - for_each_mem_cgroup(memcg) { - if (mem_cgroup_is_root(memcg)) - continue; - ret = memcg_expand_one_shrinker_map(memcg, size, old_size); - if (ret) { - mem_cgroup_iter_break(NULL, memcg); - goto unlock; - } - } -unlock: - if (!ret) - memcg_shrinker_map_size = size; - mutex_unlock(&memcg_shrinker_map_mutex); - return ret; -} - -void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) -{ - if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct memcg_shrinker_map *map; - - rcu_read_lock(); - map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); - /* Pairs with smp mb in shrink_slab() */ - smp_mb__before_atomic(); - set_bit(shrinker_id, map->map); - rcu_read_unlock(); - } -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -5242,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* - * A memcg must be visible for memcg_expand_shrinker_maps() + * A memcg must be visible for expand_shrinker_maps() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. */ - if (memcg_alloc_shrinker_maps(memcg)) { + if (alloc_shrinker_maps(memcg)) { mem_cgroup_id_remove(memcg); return -ENOMEM; } @@ -5310,7 +5186,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); - memcg_free_shrinker_maps(memcg); + free_shrinker_maps(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } diff --git a/mm/vmscan.c b/mm/vmscan.c index d231af7cae06..bead5ae1e7e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -185,6 +185,132 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG + +static int memcg_shrinker_map_size; +static DEFINE_MUTEX(memcg_shrinker_map_mutex); + +static void free_shrinker_map_rcu(struct rcu_head *head) +{ + kvfree(container_of(head, struct memcg_shrinker_map, rcu)); +} + +static int expand_one_shrinker_map(struct mem_cgroup *memcg, + int size, int old_size) +{ + struct memcg_shrinker_map *new, *old; + struct mem_cgroup_per_node *pn; + int nid; + + lockdep_assert_held(&memcg_shrinker_map_mutex); + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + old = rcu_dereference_protected(pn->shrinker_map, true); + /* Not yet online memcg */ + if (!old) + return 0; + + new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + if (!new) + return -ENOMEM; + + /* Set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_size); + memset((void *)new->map + old_size, 0, size - old_size); + + rcu_assign_pointer(pn->shrinker_map, new); + call_rcu(&old->rcu, free_shrinker_map_rcu); + } + + return 0; +} + +void free_shrinker_maps(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct memcg_shrinker_map *map; + int nid; + + if (mem_cgroup_is_root(memcg)) + return; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + map = rcu_dereference_protected(pn->shrinker_map, true); + kvfree(map); + rcu_assign_pointer(pn->shrinker_map, NULL); + } +} + +int alloc_shrinker_maps(struct mem_cgroup *memcg) +{ + struct memcg_shrinker_map *map; + int nid, size, ret = 0; + + if (mem_cgroup_is_root(memcg)) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + size = memcg_shrinker_map_size; + for_each_node(nid) { + map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); + if (!map) { + free_shrinker_maps(memcg); + ret = -ENOMEM; + break; + } + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + } + mutex_unlock(&memcg_shrinker_map_mutex); + + return ret; +} + +static int expand_shrinker_maps(int new_id) +{ + int size, old_size, ret = 0; + struct mem_cgroup *memcg; + + size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); + old_size = memcg_shrinker_map_size; + if (size <= old_size) + return 0; + + mutex_lock(&memcg_shrinker_map_mutex); + if (!root_mem_cgroup) + goto unlock; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + if (mem_cgroup_is_root(memcg)) + continue; + ret = expand_one_shrinker_map(memcg, size, old_size); + if (ret) { + mem_cgroup_iter_break(NULL, memcg); + goto unlock; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); +unlock: + if (!ret) + memcg_shrinker_map_size = size; + mutex_unlock(&memcg_shrinker_map_mutex); + return ret; +} + +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct memcg_shrinker_map *map; + + rcu_read_lock(); + map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, map->map); + rcu_read_unlock(); + } +} + /* * We allow subsystems to populate their shrinker-related * LRU lists before register_shrinker_prepared() is called @@ -212,7 +338,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) goto unlock; if (id >= shrinker_nr_max) { - if (memcg_expand_shrinker_maps(id)) { + if (expand_shrinker_maps(id)) { idr_remove(&shrinker_idr, id); goto unlock; } @@ -590,7 +716,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * case, we invoke the shrinker one more time and reset * the bit if it reports that it is not empty anymore. * The memory barrier here pairs with the barrier in - * memcg_set_shrinker_bit(): + * set_shrinker_bit(): * * list_lru_add() shrink_slab_memcg() * list_add_tail() clear_bit() @@ -602,7 +728,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; else - memcg_set_shrinker_bit(memcg, nid, i); + set_shrinker_bit(memcg, nid, i); } freed += ret; From d27cf2aa0d26a221982d04757cc32db97833ec29 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:14 -0700 Subject: [PATCH 1131/1379] mm: vmscan: use shrinker_rwsem to protect shrinker_maps allocation Since memcg_shrinker_map_size just can be changed under holding shrinker_rwsem exclusively, the read side can be protected by holding read lock, so it sounds superfluous to have a dedicated mutex. Kirill Tkhai suggested use write lock since: * We want the assignment to shrinker_maps is visible for shrink_slab_memcg(). * The rcu_dereference_protected() dereferrencing in shrink_slab_memcg(), but in case of we use READ lock in alloc_shrinker_maps(), the dereferrencing is not actually protected. * READ lock makes alloc_shrinker_info() racy against memory allocation fail. alloc_shrinker_info()->free_shrinker_info() may free memory right after shrink_slab_memcg() dereferenced it. You may say shrink_slab_memcg()->mem_cgroup_online() protects us from it? Yes, sure, but this is not the thing we want to remember in the future, since this spreads modularity. And a test with heavy paging workload didn't show write lock makes things worse. Link: https://lkml.kernel.org/r/20210311190845.9708-4-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index bead5ae1e7e2..d3fba55a0028 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -187,7 +187,6 @@ static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int memcg_shrinker_map_size; -static DEFINE_MUTEX(memcg_shrinker_map_mutex); static void free_shrinker_map_rcu(struct rcu_head *head) { @@ -201,8 +200,6 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg, struct mem_cgroup_per_node *pn; int nid; - lockdep_assert_held(&memcg_shrinker_map_mutex); - for_each_node(nid) { pn = memcg->nodeinfo[nid]; old = rcu_dereference_protected(pn->shrinker_map, true); @@ -250,7 +247,7 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) if (mem_cgroup_is_root(memcg)) return 0; - mutex_lock(&memcg_shrinker_map_mutex); + down_write(&shrinker_rwsem); size = memcg_shrinker_map_size; for_each_node(nid) { map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); @@ -261,7 +258,7 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) } rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); } - mutex_unlock(&memcg_shrinker_map_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -276,9 +273,10 @@ static int expand_shrinker_maps(int new_id) if (size <= old_size) return 0; - mutex_lock(&memcg_shrinker_map_mutex); if (!root_mem_cgroup) - goto unlock; + goto out; + + lockdep_assert_held(&shrinker_rwsem); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { @@ -287,13 +285,13 @@ static int expand_shrinker_maps(int new_id) ret = expand_one_shrinker_map(memcg, size, old_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); - goto unlock; + goto out; } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); -unlock: +out: if (!ret) memcg_shrinker_map_size = size; - mutex_unlock(&memcg_shrinker_map_mutex); + return ret; } From a2fb12619f202dcec83f22accc09d48347fd9138 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:17 -0700 Subject: [PATCH 1132/1379] mm: vmscan: remove memcg_shrinker_map_size Both memcg_shrinker_map_size and shrinker_nr_max is maintained, but actually the map size can be calculated via shrinker_nr_max, so it seems unnecessary to keep both. Remove memcg_shrinker_map_size since shrinker_nr_max is also used by iterating the bit map. Link: https://lkml.kernel.org/r/20210311190845.9708-5-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d3fba55a0028..dff5112dfff7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -185,8 +185,12 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG +static int shrinker_nr_max; -static int memcg_shrinker_map_size; +static inline int shrinker_map_size(int nr_items) +{ + return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); +} static void free_shrinker_map_rcu(struct rcu_head *head) { @@ -248,7 +252,7 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) return 0; down_write(&shrinker_rwsem); - size = memcg_shrinker_map_size; + size = shrinker_map_size(shrinker_nr_max); for_each_node(nid) { map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); if (!map) { @@ -266,12 +270,13 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) static int expand_shrinker_maps(int new_id) { int size, old_size, ret = 0; + int new_nr_max = new_id + 1; struct mem_cgroup *memcg; - size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long); - old_size = memcg_shrinker_map_size; + size = shrinker_map_size(new_nr_max); + old_size = shrinker_map_size(shrinker_nr_max); if (size <= old_size) - return 0; + goto out; if (!root_mem_cgroup) goto out; @@ -290,7 +295,7 @@ static int expand_shrinker_maps(int new_id) } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); out: if (!ret) - memcg_shrinker_map_size = size; + shrinker_nr_max = new_nr_max; return ret; } @@ -323,7 +328,6 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) #define SHRINKER_REGISTERING ((struct shrinker *)~0UL) static DEFINE_IDR(shrinker_idr); -static int shrinker_nr_max; static int prealloc_memcg_shrinker(struct shrinker *shrinker) { @@ -340,8 +344,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); goto unlock; } - - shrinker_nr_max = id + 1; } shrinker->id = id; ret = 0; From 72673e861dd032ccaff533c0d9bb705d508017f7 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:20 -0700 Subject: [PATCH 1133/1379] mm: vmscan: use kvfree_rcu instead of call_rcu Using kvfree_rcu() to free the old shrinker_maps instead of call_rcu(). We don't have to define a dedicated callback for call_rcu() anymore. Link: https://lkml.kernel.org/r/20210311190845.9708-6-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index dff5112dfff7..aa99a835cf89 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -192,11 +192,6 @@ static inline int shrinker_map_size(int nr_items) return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); } -static void free_shrinker_map_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct memcg_shrinker_map, rcu)); -} - static int expand_one_shrinker_map(struct mem_cgroup *memcg, int size, int old_size) { @@ -220,7 +215,7 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg, memset((void *)new->map + old_size, 0, size - old_size); rcu_assign_pointer(pn->shrinker_map, new); - call_rcu(&old->rcu, free_shrinker_map_rcu); + kvfree_rcu(old, rcu); } return 0; From e4262c4f51d6373447c9d89093f49ff6b1e607be Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:23 -0700 Subject: [PATCH 1134/1379] mm: memcontrol: rename shrinker_map to shrinker_info The following patch is going to add nr_deferred into shrinker_map, the change will make shrinker_map not only include map anymore, so rename it to "memcg_shrinker_info". And this should make the patch adding nr_deferred cleaner and readable and make review easier. Also remove the "memcg_" prefix. Link: https://lkml.kernel.org/r/20210311190845.9708-7-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 +++--- mm/memcontrol.c | 6 ++-- mm/vmscan.c | 58 +++++++++++++++++++------------------- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7dbd2c9bad32..6cd800fe9a67 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -117,7 +117,7 @@ struct batched_lruvec_stat { * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, * which have elements charged to this memcg. */ -struct memcg_shrinker_map { +struct shrinker_info { struct rcu_head rcu; unsigned long map[]; }; @@ -145,7 +145,7 @@ struct mem_cgroup_per_node { struct mem_cgroup_reclaim_iter iter; - struct memcg_shrinker_map __rcu *shrinker_map; + struct shrinker_info __rcu *shrinker_info; struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ @@ -1610,8 +1610,8 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) return false; } -int alloc_shrinker_maps(struct mem_cgroup *memcg); -void free_shrinker_maps(struct mem_cgroup *memcg); +int alloc_shrinker_info(struct mem_cgroup *memcg); +void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 09fd17ba6de2..36f31d611dea 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5118,11 +5118,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* - * A memcg must be visible for expand_shrinker_maps() + * A memcg must be visible for expand_shrinker_info() * by the time the maps are allocated. So, we allocate maps * here, when for_each_mem_cgroup() can't skip it. */ - if (alloc_shrinker_maps(memcg)) { + if (alloc_shrinker_info(memcg)) { mem_cgroup_id_remove(memcg); return -ENOMEM; } @@ -5186,7 +5186,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); - free_shrinker_maps(memcg); + free_shrinker_info(memcg); memcg_free_kmem(memcg); mem_cgroup_free(memcg); } diff --git a/mm/vmscan.c b/mm/vmscan.c index aa99a835cf89..518084ce8757 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -192,16 +192,16 @@ static inline int shrinker_map_size(int nr_items) return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); } -static int expand_one_shrinker_map(struct mem_cgroup *memcg, - int size, int old_size) +static int expand_one_shrinker_info(struct mem_cgroup *memcg, + int size, int old_size) { - struct memcg_shrinker_map *new, *old; + struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; for_each_node(nid) { pn = memcg->nodeinfo[nid]; - old = rcu_dereference_protected(pn->shrinker_map, true); + old = rcu_dereference_protected(pn->shrinker_info, true); /* Not yet online memcg */ if (!old) return 0; @@ -214,17 +214,17 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg, memset(new->map, (int)0xff, old_size); memset((void *)new->map + old_size, 0, size - old_size); - rcu_assign_pointer(pn->shrinker_map, new); + rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); } return 0; } -void free_shrinker_maps(struct mem_cgroup *memcg) +void free_shrinker_info(struct mem_cgroup *memcg) { struct mem_cgroup_per_node *pn; - struct memcg_shrinker_map *map; + struct shrinker_info *info; int nid; if (mem_cgroup_is_root(memcg)) @@ -232,15 +232,15 @@ void free_shrinker_maps(struct mem_cgroup *memcg) for_each_node(nid) { pn = memcg->nodeinfo[nid]; - map = rcu_dereference_protected(pn->shrinker_map, true); - kvfree(map); - rcu_assign_pointer(pn->shrinker_map, NULL); + info = rcu_dereference_protected(pn->shrinker_info, true); + kvfree(info); + rcu_assign_pointer(pn->shrinker_info, NULL); } } -int alloc_shrinker_maps(struct mem_cgroup *memcg) +int alloc_shrinker_info(struct mem_cgroup *memcg) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; int nid, size, ret = 0; if (mem_cgroup_is_root(memcg)) @@ -249,20 +249,20 @@ int alloc_shrinker_maps(struct mem_cgroup *memcg) down_write(&shrinker_rwsem); size = shrinker_map_size(shrinker_nr_max); for_each_node(nid) { - map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid); - if (!map) { - free_shrinker_maps(memcg); + info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); + if (!info) { + free_shrinker_info(memcg); ret = -ENOMEM; break; } - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map); + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem); return ret; } -static int expand_shrinker_maps(int new_id) +static int expand_shrinker_info(int new_id) { int size, old_size, ret = 0; int new_nr_max = new_id + 1; @@ -282,7 +282,7 @@ static int expand_shrinker_maps(int new_id) do { if (mem_cgroup_is_root(memcg)) continue; - ret = expand_one_shrinker_map(memcg, size, old_size); + ret = expand_one_shrinker_info(memcg, size, old_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; @@ -298,13 +298,13 @@ out: void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; rcu_read_lock(); - map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); - set_bit(shrinker_id, map->map); + set_bit(shrinker_id, info->map); rcu_read_unlock(); } } @@ -335,7 +335,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) goto unlock; if (id >= shrinker_nr_max) { - if (expand_shrinker_maps(id)) { + if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); goto unlock; } @@ -665,7 +665,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { - struct memcg_shrinker_map *map; + struct shrinker_info *info; unsigned long ret, freed = 0; int i; @@ -675,12 +675,12 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!down_read_trylock(&shrinker_rwsem)) return 0; - map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map, - true); - if (unlikely(!map)) + info = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + true); + if (unlikely(!info)) goto unlock; - for_each_set_bit(i, map->map, shrinker_nr_max) { + for_each_set_bit(i, info->map, shrinker_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -691,7 +691,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, shrinker = idr_find(&shrinker_idr, i); if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { if (!shrinker) - clear_bit(i, map->map); + clear_bit(i, info->map); continue; } @@ -702,7 +702,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { - clear_bit(i, map->map); + clear_bit(i, info->map); /* * After the shrinker reported that it had no objects to * free, but before we cleared the corresponding bit in From 468ab8437a97a953895856c3709e48b3067da13c Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:26 -0700 Subject: [PATCH 1135/1379] mm: vmscan: add shrinker_info_protected() helper The shrinker_info is dereferenced in a couple of places via rcu_dereference_protected with different calling conventions, for example, using mem_cgroup_nodeinfo helper or dereferencing memcg->nodeinfo[nid]->shrinker_info. And the later patch will add more dereference places. So extract the dereference into a helper to make the code more readable. No functional change. [akpm@linux-foundation.org: retain rcu_dereference_protected() in free_shrinker_info(), per Hugh] Link: https://lkml.kernel.org/r/20210311190845.9708-8-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Acked-by: Vlastimil Babka Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 518084ce8757..400f4a657b27 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -192,6 +192,13 @@ static inline int shrinker_map_size(int nr_items) return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); } +static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, + int nid) +{ + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); +} + static int expand_one_shrinker_info(struct mem_cgroup *memcg, int size, int old_size) { @@ -201,7 +208,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, for_each_node(nid) { pn = memcg->nodeinfo[nid]; - old = rcu_dereference_protected(pn->shrinker_info, true); + old = shrinker_info_protected(memcg, nid); /* Not yet online memcg */ if (!old) return 0; @@ -675,8 +682,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!down_read_trylock(&shrinker_rwsem)) return 0; - info = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - true); + info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; From 41ca668a71e7b03743369a2c6d8b8edc1e943dc8 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:29 -0700 Subject: [PATCH 1136/1379] mm: vmscan: use a new flag to indicate shrinker is registered Currently registered shrinker is indicated by non-NULL shrinker->nr_deferred. This approach is fine with nr_deferred at the shrinker level, but the following patches will move MEMCG_AWARE shrinkers' nr_deferred to memcg level, so their shrinker->nr_deferred would always be NULL. This would prevent the shrinkers from unregistering correctly. Remove SHRINKER_REGISTERING since we could check if shrinker is registered successfully by the new flag. Link: https://lkml.kernel.org/r/20210311190845.9708-9-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Kirill Tkhai Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 7 ++++--- mm/vmscan.c | 40 +++++++++++++++------------------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 0f80123650e2..1eac79ce57d4 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -79,13 +79,14 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ /* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) -#define SHRINKER_MEMCG_AWARE (1 << 1) +#define SHRINKER_REGISTERED (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 1) +#define SHRINKER_MEMCG_AWARE (1 << 2) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ -#define SHRINKER_NONSLAB (1 << 2) +#define SHRINKER_NONSLAB (1 << 3) extern int prealloc_shrinker(struct shrinker *shrinker); extern void register_shrinker_prepared(struct shrinker *shrinker); diff --git a/mm/vmscan.c b/mm/vmscan.c index 400f4a657b27..d1601163d895 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -316,19 +316,6 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) } } -/* - * We allow subsystems to populate their shrinker-related - * LRU lists before register_shrinker_prepared() is called - * for the shrinker, since we don't want to impose - * restrictions on their internal registration order. - * In this case shrink_slab_memcg() may find corresponding - * bit is set in the shrinkers map. - * - * This value is used by the function to detect registering - * shrinkers and to skip do_shrink_slab() calls for them. - */ -#define SHRINKER_REGISTERING ((struct shrinker *)~0UL) - static DEFINE_IDR(shrinker_idr); static int prealloc_memcg_shrinker(struct shrinker *shrinker) @@ -337,7 +324,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL); + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -360,9 +347,9 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - down_write(&shrinker_rwsem); + lockdep_assert_held(&shrinker_rwsem); + idr_remove(&shrinker_idr, id); - up_write(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) @@ -490,8 +477,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + } kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; @@ -501,10 +491,7 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); -#ifdef CONFIG_MEMCG - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - idr_replace(&shrinker_idr, shrinker, shrinker->id); -#endif + shrinker->flags |= SHRINKER_REGISTERED; up_write(&shrinker_rwsem); } @@ -524,13 +511,16 @@ EXPORT_SYMBOL(register_shrinker); */ void unregister_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) + if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); + down_write(&shrinker_rwsem); list_del(&shrinker->list); + shrinker->flags &= ~SHRINKER_REGISTERED; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; } @@ -695,7 +685,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker; shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) { + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { if (!shrinker) clear_bit(i, info->map); continue; From 3c6f17e6c5d048c8029578c475dd037dd5db58af Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:33 -0700 Subject: [PATCH 1137/1379] mm: vmscan: add per memcg shrinker nr_deferred Currently the number of deferred objects are per shrinker, but some slabs, for example, vfs inode/dentry cache are per memcg, this would result in poor isolation among memcgs. The deferred objects typically are generated by __GFP_NOFS allocations, one memcg with excessive __GFP_NOFS allocations may blow up deferred objects, then other innocent memcgs may suffer from over shrink, excessive reclaim latency, etc. For example, two workloads run in memcgA and memcgB respectively, workload in B is vfs heavy workload. Workload in A generates excessive deferred objects, then B's vfs cache might be hit heavily (drop half of caches) by B's limit reclaim or global reclaim. We observed this hit in our production environment which was running vfs heavy workload shown as the below tracing log: <...>-409454 [016] .... 28286961.747146: mm_shrink_slab_start: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 objects to shrink 3641681686040 gfp_flags GFP_HIGHUSER_MOVABLE|__GFP_ZERO pgs_scanned 1 lru_pgs 15721 cache items 246404277 delta 31345 total_scan 123202138 <...>-409454 [022] .... 28287105.928018: mm_shrink_slab_end: super_cache_scan+0x0/0x1a0 ffff9a83046f3458: nid: 1 unused scan count 3641681686040 new scan count 3641798379189 total_scan 602 last shrinker return val 123186855 The vfs cache and page cache ratio was 10:1 on this machine, and half of caches were dropped. This also resulted in significant amount of page caches were dropped due to inodes eviction. Make nr_deferred per memcg for memcg aware shrinkers would solve the unfairness and bring better isolation. The following patch will add nr_deferred to parent memcg when memcg offline. To preserve nr_deferred when reparenting memcgs to root, root memcg needs shrinker_info allocated too. When memcg is not enabled (!CONFIG_MEMCG or memcg disabled), the shrinker's nr_deferred would be used. And non memcg aware shrinkers use shrinker's nr_deferred all the time. Link: https://lkml.kernel.org/r/20210311190845.9708-10-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 +++-- mm/vmscan.c | 60 ++++++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6cd800fe9a67..32bd62047238 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -114,12 +114,13 @@ struct batched_lruvec_stat { }; /* - * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, - * which have elements charged to this memcg. + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware + * shrinkers, which have elements charged to this memcg. */ struct shrinker_info { struct rcu_head rcu; - unsigned long map[]; + atomic_long_t *nr_deferred; + unsigned long *map; }; /* diff --git a/mm/vmscan.c b/mm/vmscan.c index d1601163d895..db668c4c78f4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -187,11 +187,17 @@ static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; +/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ static inline int shrinker_map_size(int nr_items) { return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); } +static inline int shrinker_defer_size(int nr_items) +{ + return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); +} + static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { @@ -200,11 +206,13 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, } static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int size, int old_size) + int map_size, int defer_size, + int old_map_size, int old_defer_size) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; + int size = map_size + defer_size; for_each_node(nid) { pn = memcg->nodeinfo[nid]; @@ -217,9 +225,16 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, if (!new) return -ENOMEM; - /* Set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_size); - memset((void *)new->map + old_size, 0, size - old_size); + new->nr_deferred = (atomic_long_t *)(new + 1); + new->map = (void *)new->nr_deferred + defer_size; + + /* map: set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_map_size); + memset((void *)new->map + old_map_size, 0, map_size - old_map_size); + /* nr_deferred: copy old values, clear all new values */ + memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); + memset((void *)new->nr_deferred + old_defer_size, 0, + defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); @@ -234,9 +249,6 @@ void free_shrinker_info(struct mem_cgroup *memcg) struct shrinker_info *info; int nid; - if (mem_cgroup_is_root(memcg)) - return; - for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); @@ -249,12 +261,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) { struct shrinker_info *info; int nid, size, ret = 0; - - if (mem_cgroup_is_root(memcg)) - return 0; + int map_size, defer_size = 0; down_write(&shrinker_rwsem); - size = shrinker_map_size(shrinker_nr_max); + map_size = shrinker_map_size(shrinker_nr_max); + defer_size = shrinker_defer_size(shrinker_nr_max); + size = map_size + defer_size; for_each_node(nid) { info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); if (!info) { @@ -262,6 +274,8 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) ret = -ENOMEM; break; } + info->nr_deferred = (atomic_long_t *)(info + 1); + info->map = (void *)info->nr_deferred + defer_size; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem); @@ -269,15 +283,21 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) return ret; } +static inline bool need_expand(int nr_max) +{ + return round_up(nr_max, BITS_PER_LONG) > + round_up(shrinker_nr_max, BITS_PER_LONG); +} + static int expand_shrinker_info(int new_id) { - int size, old_size, ret = 0; + int ret = 0; int new_nr_max = new_id + 1; + int map_size, defer_size = 0; + int old_map_size, old_defer_size = 0; struct mem_cgroup *memcg; - size = shrinker_map_size(new_nr_max); - old_size = shrinker_map_size(shrinker_nr_max); - if (size <= old_size) + if (!need_expand(new_nr_max)) goto out; if (!root_mem_cgroup) @@ -285,11 +305,15 @@ static int expand_shrinker_info(int new_id) lockdep_assert_held(&shrinker_rwsem); + map_size = shrinker_map_size(new_nr_max); + defer_size = shrinker_defer_size(new_nr_max); + old_map_size = shrinker_map_size(shrinker_nr_max); + old_defer_size = shrinker_defer_size(shrinker_nr_max); + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - if (mem_cgroup_is_root(memcg)) - continue; - ret = expand_one_shrinker_info(memcg, size, old_size); + ret = expand_one_shrinker_info(memcg, map_size, defer_size, + old_map_size, old_defer_size); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; From 86750830468506dc27fa99c644534a7189be7975 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:36 -0700 Subject: [PATCH 1138/1379] mm: vmscan: use per memcg nr_deferred of shrinker Use per memcg's nr_deferred for memcg aware shrinkers. The shrinker's nr_deferred will be used in the following cases: 1. Non memcg aware shrinkers 2. !CONFIG_MEMCG 3. memcg is disabled by boot parameter Link: https://lkml.kernel.org/r/20210311190845.9708-11-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Roman Gushchin Acked-by: Kirill Tkhai Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 78 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index db668c4c78f4..19b134d985c5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -376,6 +376,24 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) idr_remove(&shrinker_idr, id); } +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); +} + static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; @@ -414,6 +432,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) { } +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + static bool cgroup_reclaim(struct scan_control *sc) { return false; @@ -425,6 +455,39 @@ static bool writeback_throttling_sane(struct scan_control *sc) } #endif +static long xchg_nr_deferred(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return xchg_nr_deferred_memcg(nid, shrinker, + sc->memcg); + + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); +} + + +static long add_nr_deferred(long nr, struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return add_nr_deferred_memcg(nr, nid, shrinker, + sc->memcg); + + return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); +} + /* * This misses isolated pages which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is @@ -561,14 +624,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, long freeable; long nr; long new_nr; - int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred; - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; @@ -578,7 +637,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + nr = xchg_nr_deferred(shrinker, shrinkctl); total_scan = nr; if (shrinker->seeks) { @@ -669,14 +728,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, next_deferred = 0; /* * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. + * manner that handles concurrent updates. */ - if (next_deferred > 0) - new_nr = atomic_long_add_return(next_deferred, - &shrinker->nr_deferred[nid]); - else - new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; From 476b30a0949aec865dcc64d4c14f621b1a8afd12 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:39 -0700 Subject: [PATCH 1139/1379] mm: vmscan: don't need allocate shrinker->nr_deferred for memcg aware shrinkers Now nr_deferred is available on per memcg level for memcg aware shrinkers, so don't need allocate shrinker->nr_deferred for such shrinkers anymore. The prealloc_memcg_shrinker() would return -ENOSYS if !CONFIG_MEMCG or memcg is disabled by kernel command line, then shrinker's SHRINKER_MEMCG_AWARE flag would be cleared. This makes the implementation of this patch simpler. Link: https://lkml.kernel.org/r/20210311190845.9708-12-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Reviewed-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 19b134d985c5..9617c5ff3e98 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -346,6 +346,9 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) { int id, ret = -ENOMEM; + if (mem_cgroup_disabled()) + return -ENOSYS; + down_write(&shrinker_rwsem); /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); @@ -425,7 +428,7 @@ static bool writeback_throttling_sane(struct scan_control *sc) #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { - return 0; + return -ENOSYS; } static void unregister_memcg_shrinker(struct shrinker *shrinker) @@ -537,8 +540,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, */ int prealloc_shrinker(struct shrinker *shrinker) { - unsigned int size = sizeof(*shrinker->nr_deferred); + unsigned int size; + int err; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err != -ENOSYS) + return err; + + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + } + + size = sizeof(*shrinker->nr_deferred); if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; @@ -546,28 +559,16 @@ int prealloc_shrinker(struct shrinker *shrinker) if (!shrinker->nr_deferred) return -ENOMEM; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - if (prealloc_memcg_shrinker(shrinker)) - goto free_deferred; - } - return 0; - -free_deferred: - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; - return -ENOMEM; } void free_prealloced_shrinker(struct shrinker *shrinker) { - if (!shrinker->nr_deferred) - return; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); up_write(&shrinker_rwsem); + return; } kfree(shrinker->nr_deferred); From a178015cde69981cdcd8f109c5abc98703fead62 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:42 -0700 Subject: [PATCH 1140/1379] mm: memcontrol: reparent nr_deferred when memcg offline Now shrinker's nr_deferred is per memcg for memcg aware shrinkers, add to parent's corresponding nr_deferred when memcg offline. Link: https://lkml.kernel.org/r/20210311190845.9708-13-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Acked-by: Kirill Tkhai Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Dave Chinner Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 + mm/memcontrol.c | 1 + mm/vmscan.c | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 32bd62047238..c193be760709 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1614,6 +1614,7 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); +void reparent_shrinker_deferred(struct mem_cgroup *memcg); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36f31d611dea..3004afb6d090 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5154,6 +5154,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); + reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); drain_all_stock(memcg); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9617c5ff3e98..8c2d2003acbe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -397,6 +397,30 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } +void reparent_shrinker_deferred(struct mem_cgroup *memcg) +{ + int i, nid; + long nr; + struct mem_cgroup *parent; + struct shrinker_info *child_info, *parent_info; + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* Prevent from concurrent shrinker_info expand */ + down_read(&shrinker_rwsem); + for_each_node(nid) { + child_info = shrinker_info_protected(memcg, nid); + parent_info = shrinker_info_protected(parent, nid); + for (i = 0; i < shrinker_nr_max; i++) { + nr = atomic_long_read(&child_info->nr_deferred[i]); + atomic_long_add(nr, &parent_info->nr_deferred[i]); + } + } + up_read(&shrinker_rwsem); +} + static bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; From 18bb473e5031213ebfa9a622c0b0f8cdcb8a5371 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 4 May 2021 18:36:45 -0700 Subject: [PATCH 1141/1379] mm: vmscan: shrink deferred objects proportional to priority The number of deferred objects might get windup to an absurd number, and it results in clamp of slab objects. It is undesirable for sustaining workingset. So shrink deferred objects proportional to priority and cap nr_deferred to twice of cache items. The idea is borrowed from Dave Chinner's patch: https://lore.kernel.org/linux-xfs/20191031234618.15403-13-david@fromorbit.com/ Tested with kernel build and vfs metadata heavy workload in our production environment, no regression is spotted so far. Link: https://lkml.kernel.org/r/20210311190845.9708-14-shy828301@gmail.com Signed-off-by: Yang Shi Cc: Johannes Weiner Cc: Kirill Tkhai Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 46 +++++++++++----------------------------------- 1 file changed, 11 insertions(+), 35 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8c2d2003acbe..44c49acf10c4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -664,7 +664,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, */ nr = xchg_nr_deferred(shrinker, shrinkctl); - total_scan = nr; if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; @@ -678,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable / 2; } + total_scan = nr >> priority; total_scan += delta; - if (total_scan < 0) { - pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n", - shrinker->scan_objects, total_scan); - total_scan = freeable; - next_deferred = nr; - } else - next_deferred = total_scan; - - /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * freeable. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. - */ - if (delta < freeable / 4) - total_scan = min(total_scan, freeable / 2); - - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > freeable * 2) - total_scan = freeable * 2; + total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); @@ -747,10 +718,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, cond_resched(); } - if (next_deferred >= scanned) - next_deferred -= scanned; - else - next_deferred = 0; + /* + * The deferred work is increased by any new work (delta) that wasn't + * done, decreased by old deferred work that was done now. + * + * And it is capped to two times of the freeable items. + */ + next_deferred = max_t(long, (nr + delta - scanned), 0); + next_deferred = min(next_deferred, (2 * freeable)); + /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. From ef4984384172e93cc95e0e8cd102536d67e8a787 Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Tue, 4 May 2021 18:36:48 -0700 Subject: [PATCH 1142/1379] mm/compaction: remove unused variable sysctl_compact_memory The sysctl_compact_memory is mostly unused in mm/compaction.c It just acts as a place holder for sysctl to store .data. But the .data itself is not needed here. So we can get ride of this variable completely and make .data as NULL. This will also eliminate the extern declaration from header file. No functionality is broken or changed this way. Link: https://lkml.kernel.org/r/1614852224-14671-1-git-send-email-pintu@codeaurora.org Signed-off-by: Pintu Kumar Signed-off-by: Pintu Agarwal Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 1 - kernel/sysctl.c | 2 +- mm/compaction.c | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index ed4070ed41ef..4221888bdcd6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -81,7 +81,6 @@ static inline unsigned long compact_gap(unsigned int order) } #ifdef CONFIG_COMPACTION -extern int sysctl_compact_memory; extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f91d327273c1..14edf84cc571 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2830,7 +2830,7 @@ static struct ctl_table vm_table[] = { #ifdef CONFIG_COMPACTION { .procname = "compact_memory", - .data = &sysctl_compact_memory, + .data = NULL, .maxlen = sizeof(int), .mode = 0200, .proc_handler = sysctl_compaction_handler, diff --git a/mm/compaction.c b/mm/compaction.c index 335862f1661c..027eb794e747 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2692,9 +2692,6 @@ static void compact_nodes(void) compact_node(nid); } -/* The written value is actually unused, all memory is compacted */ -int sysctl_compact_memory; - /* * Tunable for proactive compaction. It determines how * aggressively the kernel should compact memory in the From 06dac2f467fe9269a433aa5056dd2ee1d20475e9 Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Tue, 4 May 2021 18:36:51 -0700 Subject: [PATCH 1143/1379] mm: compaction: update the COMPACT[STALL|FAIL] events properly By definition, COMPACT[STALL|FAIL] events needs to be counted when there is 'At least in one zone compaction wasn't deferred or skipped from the direct compaction'. And when compaction is skipped or deferred, COMPACT_SKIPPED will be returned but it will still go and update these compaction events which is wrong in the sense that COMPACT[STALL|FAIL] is counted without even trying the compaction. Correct this by skipping the counting of these events when COMPACT_SKIPPED is returned for compaction. This indirectly also avoid the unnecessary try into the get_page_from_freelist() when compaction is not even tried. There is a corner case where compaction is skipped but still count COMPACTSTALL event, which is that IRQ came and freed the page and the same is captured in capture_control. Link: https://lkml.kernel.org/r/1613151184-21213-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Acked-by: Vlastimil Babka Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 8 ++++++++ mm/page_alloc.c | 2 ++ 2 files changed, 10 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 027eb794e747..1be7928af62b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2529,6 +2529,14 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, */ WRITE_ONCE(current->capture_control, NULL); *capture = READ_ONCE(capc.page); + /* + * Technically, it is also possible that compaction is skipped but + * the page is still captured out of luck(IRQ came and freed the page). + * Returning COMPACT_SUCCESS in such cases helps in properly accounting + * the COMPACT[STALL|FAIL] when compaction is skipped. + */ + if (*capture) + ret = COMPACT_SUCCESS; return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19cdd8a829dc..64d4aae2a78a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4204,6 +4204,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); + if (*compact_result == COMPACT_SKIPPED) + return NULL; /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall From d479960e44f27e0e52ba31b21740b703c538027c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:36:54 -0700 Subject: [PATCH 1144/1379] mm: disable LRU pagevec during the migration temporarily LRU pagevec holds refcount of pages until the pagevec are drained. It could prevent migration since the refcount of the page is greater than the expection in migration logic. To mitigate the issue, callers of migrate_pages drains LRU pagevec via migrate_prep or lru_add_drain_all before migrate_pages call. However, it's not enough because pages coming into pagevec after the draining call still could stay at the pagevec so it could keep preventing page migration. Since some callers of migrate_pages have retrial logic with LRU draining, the page would migrate at next trail but it is still fragile in that it doesn't close the fundamental race between upcoming LRU pages into pagvec and migration so the migration failure could cause contiguous memory allocation failure in the end. To close the race, this patch disables lru caches(i.e, pagevec) during ongoing migration until migrate is done. Since it's really hard to reproduce, I measured how many times migrate_pages retried with force mode(it is about a fallback to a sync migration) with below debug code. int migrate_pages(struct list_head *from, new_page_t get_new_page, .. .. if (rc && reason == MR_CONTIG_RANGE && pass > 2) { printk(KERN_ERR, "pfn 0x%lx reason %d", page_to_pfn(page), rc); dump_page(page, "fail to migrate"); } The test was repeating android apps launching with cma allocation in background every five seconds. Total cma allocation count was about 500 during the testing. With this patch, the dump_page count was reduced from 400 to 30. The new interface is also useful for memory hotplug which currently drains lru pcp caches after each migration failure. This is rather suboptimal as it has to disrupt others running during the operation. With the new interface the operation happens only once. This is also in line with pcp allocator cache which are disabled for the offlining as well. Link: https://lkml.kernel.org/r/20210319175127.886124-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Chris Goldsworthy Acked-by: Michal Hocko Cc: John Dias Cc: Suren Baghdasaryan Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Vlastimil Babka Cc: Oliver Sang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 ++ include/linux/swap.h | 14 +++++++++ mm/memory_hotplug.c | 3 +- mm/mempolicy.c | 4 +++ mm/migrate.c | 11 ++++--- mm/page_alloc.c | 2 ++ mm/swap.c | 64 +++++++++++++++++++++++++++++++++++------ 7 files changed, 86 insertions(+), 14 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3a389633b68f..9e4a2dc8622c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -46,6 +46,7 @@ extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); extern void migrate_prep(void); +extern void migrate_finish(void); extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); @@ -67,6 +68,7 @@ static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } static inline int migrate_prep(void) { return -ENOSYS; } +static inline int migrate_finish(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } static inline void migrate_page_states(struct page *newpage, struct page *page) diff --git a/include/linux/swap.h b/include/linux/swap.h index 42191da1bdc9..f69e0f67651d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -340,6 +340,20 @@ extern void lru_note_cost(struct lruvec *lruvec, bool file, extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); extern void mark_page_accessed(struct page *); + +extern atomic_t lru_disable_count; + +static inline bool lru_cache_disabled(void) +{ + return atomic_read(&lru_disable_count); +} + +static inline void lru_cache_enable(void) +{ + atomic_dec(&lru_disable_count); +} + +extern void lru_cache_disable(void); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0cdbbfbc5757..729fba144c71 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1611,6 +1611,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) * in a way that pages from isolated pageblock are left on pcplists. */ zone_pcp_disable(zone); + lru_cache_disable(); /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, @@ -1642,7 +1643,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) } cond_resched(); - lru_add_drain_all(); ret = scan_movable_pages(pfn, end_pfn, &pfn); if (!ret) { @@ -1687,6 +1687,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); + lru_cache_enable(); zone_pcp_enable(zone); /* removal success */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index cd0295567a04..3b95e169e97d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1208,6 +1208,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, break; } mmap_read_unlock(mm); + + migrate_finish(); if (err < 0) return err; return busy; @@ -1371,6 +1373,8 @@ up_out: mmap_write_unlock(mm); mpol_out: mpol_put(new); + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_finish(); return err; } diff --git a/mm/migrate.c b/mm/migrate.c index 47df0df8f21a..5b09567dc293 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -66,11 +66,13 @@ void migrate_prep(void) { /* * Clear the LRU lists so pages can be isolated. - * Note that pages may be moved off the LRU after we have - * drained them. Those pages will fail to migrate like other - * pages that may be busy. */ - lru_add_drain_all(); + lru_cache_disable(); +} + +void migrate_finish(void) +{ + lru_cache_enable(); } /* Do the necessary work of migrate_prep but not if it involves other CPUs */ @@ -1838,6 +1840,7 @@ out_flush: if (err >= 0) err = err1; out: + migrate_finish(); return err; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 64d4aae2a78a..2cefb634e0d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8715,6 +8715,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (ret == -ENOMEM) break; } + + migrate_finish(); if (ret < 0) { alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); diff --git a/mm/swap.c b/mm/swap.c index 31b844d4ed94..c94f55e7b649 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -235,6 +235,18 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) } } +/* return true if pagevec needs to drain */ +static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) +{ + bool ret = false; + + if (!pagevec_add(pvec, page) || PageCompound(page) || + lru_cache_disabled()) + ret = true; + + return ret; +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the @@ -252,7 +264,7 @@ void rotate_reclaimable_page(struct page *page) get_page(page); local_lock_irqsave(&lru_rotate.lock, flags); pvec = this_cpu_ptr(&lru_rotate.pvec); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } @@ -343,7 +355,7 @@ static void activate_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.activate_page); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page); local_unlock(&lru_pvecs.lock); } @@ -458,7 +470,7 @@ void lru_cache_add(struct page *page) get_page(page); local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_add); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) __pagevec_lru_add(pvec); local_unlock(&lru_pvecs.lock); } @@ -654,7 +666,7 @@ void deactivate_file_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); local_unlock(&lru_pvecs.lock); } @@ -676,7 +688,7 @@ void deactivate_page(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn); local_unlock(&lru_pvecs.lock); } @@ -698,7 +710,7 @@ void mark_page_lazyfree(struct page *page) local_lock(&lru_pvecs.lock); pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); - if (!pagevec_add(pvec, page) || PageCompound(page)) + if (pagevec_add_and_need_flush(pvec, page)) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); local_unlock(&lru_pvecs.lock); } @@ -735,7 +747,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ -void lru_add_drain_all(void) +inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number @@ -780,7 +792,7 @@ void lru_add_drain_all(void) * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ - if (unlikely(this_gen != lru_drain_gen)) + if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* @@ -810,7 +822,8 @@ void lru_add_drain_all(void) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + if (force_all_cpus || + pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || @@ -828,6 +841,11 @@ void lru_add_drain_all(void) done: mutex_unlock(&lock); } + +void lru_add_drain_all(void) +{ + __lru_add_drain_all(false); +} #else void lru_add_drain_all(void) { @@ -835,6 +853,34 @@ void lru_add_drain_all(void) } #endif /* CONFIG_SMP */ +atomic_t lru_disable_count = ATOMIC_INIT(0); + +/* + * lru_cache_disable() needs to be called before we start compiling + * a list of pages to be migrated using isolate_lru_page(). + * It drains pages on LRU cache and then disable on all cpus until + * lru_cache_enable is called. + * + * Must be paired with a call to lru_cache_enable(). + */ +void lru_cache_disable(void) +{ + atomic_inc(&lru_disable_count); +#ifdef CONFIG_SMP + /* + * lru_add_drain_all in the force mode will schedule draining on + * all online CPUs so any calls of lru_cache_disabled wrapped by + * local_lock or preemption disabled would be ordered by that. + * The atomic operation doesn't need to have stronger ordering + * requirements because that is enforeced by the scheduling + * guarantees. + */ + __lru_add_drain_all(true); +#else + lru_add_drain(); +#endif +} + /** * release_pages - batched put_page() * @pages: array of pages to release From 361a2a229fa31ab7f2b236b5946e434964d00762 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:36:57 -0700 Subject: [PATCH 1145/1379] mm: replace migrate_[prep|finish] with lru_cache_[disable|enable] Currently, migrate_[prep|finish] is merely a wrapper of lru_cache_[disable|enable]. There is not much to gain from having additional abstraction. Use lru_cache_[disable|enable] instead of migrate_[prep|finish], which would be more descriptive. note: migrate_prep_local in compaction.c changed into lru_add_drain to avoid CPU schedule cost with involving many other CPUs to keep old behavior. Link: https://lkml.kernel.org/r/20210319175127.886124-2-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Chris Goldsworthy Cc: John Dias Cc: Matthew Wilcox Cc: Oliver Sang Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 7 ------- mm/compaction.c | 3 ++- mm/mempolicy.c | 8 ++++---- mm/migrate.c | 28 ++-------------------------- mm/page_alloc.c | 4 ++-- 5 files changed, 10 insertions(+), 40 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9e4a2dc8622c..6155d97ec76c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -45,9 +45,6 @@ extern struct page *alloc_migration_target(struct page *page, unsigned long priv extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void putback_movable_page(struct page *page); -extern void migrate_prep(void); -extern void migrate_finish(void); -extern void migrate_prep_local(void); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, @@ -67,10 +64,6 @@ static inline struct page *alloc_migration_target(struct page *page, static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } -static inline int migrate_prep(void) { return -ENOSYS; } -static inline int migrate_finish(void) { return -ENOSYS; } -static inline int migrate_prep_local(void) { return -ENOSYS; } - static inline void migrate_page_states(struct page *newpage, struct page *page) { } diff --git a/mm/compaction.c b/mm/compaction.c index 1be7928af62b..598dffbd5c8e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2354,7 +2354,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); - migrate_prep_local(); + /* lru_add_drain_all could be expensive with involving other CPUs */ + lru_add_drain(); while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3b95e169e97d..c0343c742bed 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1124,7 +1124,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, int err = 0; nodemask_t tmp; - migrate_prep(); + lru_cache_disable(); mmap_read_lock(mm); @@ -1209,7 +1209,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, } mmap_read_unlock(mm); - migrate_finish(); + lru_cache_enable(); if (err < 0) return err; return busy; @@ -1325,7 +1325,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - migrate_prep(); + lru_cache_disable(); } { NODEMASK_SCRATCH(scratch); @@ -1374,7 +1374,7 @@ up_out: mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_finish(); + lru_cache_enable(); return err; } diff --git a/mm/migrate.c b/mm/migrate.c index 5b09567dc293..b5fbeb4bf49a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -57,30 +57,6 @@ #include "internal.h" -/* - * migrate_prep() needs to be called before we start compiling a list of pages - * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is - * undesirable, use migrate_prep_local() - */ -void migrate_prep(void) -{ - /* - * Clear the LRU lists so pages can be isolated. - */ - lru_cache_disable(); -} - -void migrate_finish(void) -{ - lru_cache_enable(); -} - -/* Do the necessary work of migrate_prep but not if it involves other CPUs */ -void migrate_prep_local(void) -{ - lru_add_drain(); -} - int isolate_movable_page(struct page *page, isolate_mode_t mode) { struct address_space *mapping; @@ -1771,7 +1747,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, int start, i; int err = 0, err1; - migrate_prep(); + lru_cache_disable(); for (i = start = 0; i < nr_pages; i++) { const void __user *p; @@ -1840,7 +1816,7 @@ out_flush: if (err >= 0) err = err1; out: - migrate_finish(); + lru_cache_enable(); return err; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2cefb634e0d6..80754fc1f1ff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8681,7 +8681,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; - migrate_prep(); + lru_cache_disable(); while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { @@ -8716,7 +8716,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, break; } - migrate_finish(); + lru_cache_enable(); if (ret < 0) { alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); From 8cc621d2f45ddd3dc664024a647ee7adf48d79a5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:00 -0700 Subject: [PATCH 1146/1379] mm: fs: invalidate BH LRU during page migration Pages containing buffer_heads that are in one of the per-CPU buffer_head LRU caches will be pinned and thus cannot be migrated. This can prevent CMA allocations from succeeding, which are often used on platforms with co-processors (such as a DSP) that can only use physically contiguous memory. It can also prevent memory hot-unplugging from succeeding, which involves migrating at least MIN_MEMORY_BLOCK_SIZE bytes of memory, which ranges from 8 MiB to 1 GiB based on the architecture in use. Correspondingly, invalidate the BH LRU caches before a migration starts and stop any buffer_head from being cached in the LRU caches, until migration has finished. Link: https://lkml.kernel.org/r/20210319175127.886124-3-minchan@kernel.org Signed-off-by: Minchan Kim Reported-by: Chris Goldsworthy Reported-by: Laura Abbott Tested-by: Oliver Sang Cc: David Hildenbrand Cc: John Dias Cc: Matthew Wilcox Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 36 ++++++++++++++++++++++++++++++------ include/linux/buffer_head.h | 4 ++++ mm/swap.c | 5 ++++- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 0cb7ffd4977c..e9872d0dcbf1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1264,6 +1264,15 @@ static void bh_lru_install(struct buffer_head *bh) int i; check_irqs_on(); + /* + * the refcount of buffer_head in bh_lru prevents dropping the + * attached page(i.e., try_to_free_buffers) so it could cause + * failing page migration. + * Skip putting upcoming bh into bh_lru until migration is done. + */ + if (lru_cache_disabled()) + return; + bh_lru_lock(); b = this_cpu_ptr(&bh_lrus); @@ -1404,6 +1413,15 @@ __bread_gfp(struct block_device *bdev, sector_t block, } EXPORT_SYMBOL(__bread_gfp); +static void __invalidate_bh_lrus(struct bh_lru *b) +{ + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } +} /* * invalidate_bh_lrus() is called rarely - but not only at unmount. * This doesn't race because it runs in each cpu either in irq @@ -1412,16 +1430,12 @@ EXPORT_SYMBOL(__bread_gfp); static void invalidate_bh_lru(void *arg) { struct bh_lru *b = &get_cpu_var(bh_lrus); - int i; - for (i = 0; i < BH_LRU_SIZE; i++) { - brelse(b->bhs[i]); - b->bhs[i] = NULL; - } + __invalidate_bh_lrus(b); put_cpu_var(bh_lrus); } -static bool has_bh_in_lru(int cpu, void *dummy) +bool has_bh_in_lru(int cpu, void *dummy) { struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); int i; @@ -1440,6 +1454,16 @@ void invalidate_bh_lrus(void) } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); +void invalidate_bh_lrus_cpu(int cpu) +{ + struct bh_lru *b; + + bh_lru_lock(); + b = per_cpu_ptr(&bh_lrus, cpu); + __invalidate_bh_lrus(b); + bh_lru_unlock(); +} + void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset) { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6b47f94378c5..e7e99da31349 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,6 +194,8 @@ void __breadahead_gfp(struct block_device *, sector_t block, unsigned int size, struct buffer_head *__bread_gfp(struct block_device *, sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); +void invalidate_bh_lrus_cpu(int cpu); +bool has_bh_in_lru(int cpu, void *dummy); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); void unlock_buffer(struct buffer_head *bh); @@ -406,6 +408,8 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } +static inline void invalidate_bh_lrus_cpu(int cpu) {} +static inline bool has_bh_in_lru(int cpu, void *dummy) { return 0; } #define buffer_heads_over_limit 0 #endif /* CONFIG_BLOCK */ diff --git a/mm/swap.c b/mm/swap.c index c94f55e7b649..a75a8265302b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "internal.h" @@ -641,6 +642,7 @@ void lru_add_drain_cpu(int cpu) pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); + invalidate_bh_lrus_cpu(cpu); } /** @@ -828,7 +830,8 @@ inline void __lru_add_drain_all(bool force_all_cpus) pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || - need_activate_page_drain(cpu)) { + need_activate_page_drain(cpu) || + has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); From 606a6f71a25accfc960a5063c23717ff07aa43a3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:04 -0700 Subject: [PATCH 1147/1379] mm/migrate.c: make putback_movable_page() static Patch series "Cleanup and fixup for mm/migrate.c", v3. This series contains cleanups to remove unnecessary VM_BUG_ON_PAGE and rc != MIGRATEPAGE_SUCCESS check. Also use helper function to remove some duplicated codes. What's more, this fixes potential deadlock in NUMA balancing shared exec THP case and so on. More details can be found in the respective changelogs. This patch (of 5): The putback_movable_page() is just called by putback_movable_pages() and we know the page is locked and both PageMovable() and PageIsolated() is checked right before calling putback_movable_page(). So we make it static and remove all the 3 VM_BUG_ON_PAGE(). Link: https://lkml.kernel.org/r/20210325131524.48181-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210325131524.48181-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Jerome Glisse Cc: Rafael Aquini Cc: Alistair Popple Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 1 - mm/migrate.c | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 6155d97ec76c..175ef15ae9e8 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -43,7 +43,6 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); -extern void putback_movable_page(struct page *page); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); diff --git a/mm/migrate.c b/mm/migrate.c index b5fbeb4bf49a..1e1f8324cefe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -118,15 +118,10 @@ out: return -EBUSY; } -/* It should be called on page which is PG_movable */ -void putback_movable_page(struct page *page) +static void putback_movable_page(struct page *page) { struct address_space *mapping; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageMovable(page), page); - VM_BUG_ON_PAGE(!PageIsolated(page), page); - mapping = page_mapping(page); mapping->a_ops->putback_page(page); __ClearPageIsolated(page); From a04840c6841bb266c38f51adc87325308ab8d575 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:07 -0700 Subject: [PATCH 1148/1379] mm/migrate.c: remove unnecessary rc != MIGRATEPAGE_SUCCESS check in 'else' case It's guaranteed that in the 'else' case of the rc == MIGRATEPAGE_SUCCESS check, rc does not equal to MIGRATEPAGE_SUCCESS. Remove this unnecessary check. Link: https://lkml.kernel.org/r/20210325131524.48181-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Alistair Popple Cc: Jerome Glisse Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1e1f8324cefe..896e9cd49ecc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1348,7 +1348,7 @@ out_unlock: out: if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); - else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS) + else if (rc != -EAGAIN) list_move_tail(&hpage->lru, ret); /* From 34f5e9b9d1990d286199084efa752530ee3d8297 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:10 -0700 Subject: [PATCH 1149/1379] mm/migrate.c: fix potential indeterminate pte entry in migrate_vma_insert_page() If the zone device page does not belong to un-addressable device memory, the variable entry will be uninitialized and lead to indeterminate pte entry ultimately. Fix this unexpected case and warn about it. Link: https://lkml.kernel.org/r/20210325131524.48181-4-linmiaohe@huawei.com Fixes: df6ad69838fc ("mm/device-public-memory: device memory cache coherent with CPU") Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Jerome Glisse Cc: Rafael Aquini Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index 896e9cd49ecc..c40075d48ef9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2947,6 +2947,13 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); entry = swp_entry_to_pte(swp_entry); + } else { + /* + * For now we only support migrating to un-addressable + * device memory. + */ + pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); + goto abort; } } else { entry = mk_pte(page, vma->vm_page_prot); From 843e1be108b9130e5ec5a78a14f77dc237c83e1e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:13 -0700 Subject: [PATCH 1150/1379] mm/migrate.c: use helper migrate_vma_collect_skip() in migrate_vma_collect_hole() It's more recommended to use helper function migrate_vma_collect_skip() to skip the unexpected case and it also helps remove some duplicated codes. Move migrate_vma_collect_skip() above migrate_vma_collect_hole() to avoid compiler warning. Link: https://lkml.kernel.org/r/20210325131524.48181-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Jerome Glisse Cc: Rafael Aquini Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index c40075d48ef9..6876696ab302 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2290,34 +2290,6 @@ out: #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEVICE_PRIVATE -static int migrate_vma_collect_hole(unsigned long start, - unsigned long end, - __always_unused int depth, - struct mm_walk *walk) -{ - struct migrate_vma *migrate = walk->private; - unsigned long addr; - - /* Only allow populating anonymous memory. */ - if (!vma_is_anonymous(walk->vma)) { - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = 0; - migrate->dst[migrate->npages] = 0; - migrate->npages++; - } - return 0; - } - - for (addr = start; addr < end; addr += PAGE_SIZE) { - migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; - migrate->dst[migrate->npages] = 0; - migrate->npages++; - migrate->cpages++; - } - - return 0; -} - static int migrate_vma_collect_skip(unsigned long start, unsigned long end, struct mm_walk *walk) @@ -2333,6 +2305,28 @@ static int migrate_vma_collect_skip(unsigned long start, return 0; } +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + __always_unused int depth, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + /* Only allow populating anonymous memory. */ + if (!vma_is_anonymous(walk->vma)) + return migrate_vma_collect_skip(start, end, walk); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->npages++; + migrate->cpages++; + } + + return 0; +} + static int migrate_vma_collect_pmd(pmd_t *pmdp, unsigned long start, unsigned long end, From 7ee820ee72388279a37077f418e32643a298243a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:16 -0700 Subject: [PATCH 1151/1379] Revert "mm: migrate: skip shared exec THP for NUMA balancing" This reverts commit c77c5cbafe549eb330e8909861a3e16cbda2c848. Since commit c77c5cbafe54 ("mm: migrate: skip shared exec THP for NUMA balancing"), the NUMA balancing would skip shared exec transhuge page. But this enhancement is not suitable for transhuge page. Because it's required that page_mapcount() must be 1 due to no migration pte dance is done here. On the other hand, the shared exec transhuge page will leave the migrate_misplaced_page() with pte entry untouched and page locked. Thus pagefault for NUMA will be triggered again and deadlock occurs when we start waiting for the page lock held by ourselves. Yang Shi said: "Thanks for catching this. By relooking the code I think the other important reason for removing this is migrate_misplaced_transhuge_page() actually can't see shared exec file THP at all since page_lock_anon_vma_read() is called before and if page is not anonymous page it will just restore the PMD without migrating anything. The pages for private mapped file vma may be anonymous pages due to COW but they can't be THP so it won't trigger THP numa fault at all. I think this is why no bug was reported. I overlooked this in the first place." Link: https://lkml.kernel.org/r/20210325131524.48181-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Yang Shi Cc: Alistair Popple Cc: David Hildenbrand Cc: Jerome Glisse Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 6876696ab302..30c65c2be30b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2084,17 +2084,6 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } -static inline bool is_shared_exec_page(struct vm_area_struct *vma, - struct page *page) -{ - if (page_mapcount(page) != 1 && - (page_is_file_lru(page) || vma_is_shmem(vma)) && - (vma->vm_flags & VM_EXEC)) - return true; - - return false; -} - /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on @@ -2112,7 +2101,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ - if (is_shared_exec_page(vma, page)) + if (page_mapcount(page) != 1 && page_is_file_lru(page) && + (vma->vm_flags & VM_EXEC)) goto out; /* @@ -2167,9 +2157,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int page_lru = page_is_file_lru(page); unsigned long start = address & HPAGE_PMD_MASK; - if (is_shared_exec_page(vma, page)) - goto out; - new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); @@ -2281,7 +2268,6 @@ out_fail: out_unlock: unlock_page(page); -out: put_page(page); return 0; } From bbb269206f3c914d4f23e023de4ec020abea6d1b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:19 -0700 Subject: [PATCH 1152/1379] mm: vmstat: add cma statistics Since CMA is used more widely, it's worth to have CMA allocation statistics into vmstat. With it, we could know how agressively system uses cma allocation and how often it fails. Link: https://lkml.kernel.org/r/20210302183346.3707237-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: John Hubbard Cc: John Dias Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++++ mm/cma.c | 12 +++++++++--- mm/vmstat.c | 4 ++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 18e75974d4e3..21d7c7f72f1c 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,6 +70,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #endif #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, +#endif +#ifdef CONFIG_CMA + CMA_ALLOC_SUCCESS, + CMA_ALLOC_FAIL, #endif UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ diff --git a/mm/cma.c b/mm/cma.c index acd6991f77a0..b44a71eb3174 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -435,13 +435,13 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, int ret = -ENOMEM; if (!cma || !cma->count || !cma->bitmap) - return NULL; + goto out; pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma, count, align); if (!count) - return NULL; + goto out; mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); @@ -449,7 +449,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, bitmap_count = cma_bitmap_pages_to_bits(cma, count); if (bitmap_count > bitmap_maxno) - return NULL; + goto out; for (;;) { spin_lock_irq(&cma->lock); @@ -506,6 +506,12 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } pr_debug("%s(): returned %p\n", __func__, page); +out: + if (page) + count_vm_event(CMA_ALLOC_SUCCESS); + else + count_vm_event(CMA_ALLOC_FAIL); + return page; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 74b2c374b86c..49a8456ec079 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1312,6 +1312,10 @@ const char * const vmstat_text[] = { #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", +#endif +#ifdef CONFIG_CMA + "cma_alloc_success", + "cma_alloc_fail", #endif "unevictable_pgs_culled", "unevictable_pgs_scanned", From 63f83b31f4f36d933e13bd8b9a25d6d9a0cf89dd Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 4 May 2021 18:37:22 -0700 Subject: [PATCH 1153/1379] mm: cma: use pr_err_ratelimited for CMA warning If we did not reserve extra CMA memory, the log buffer can be easily filled up by CMA failure warning when the devices calling dmam_alloc_coherent() to alloc DMA memory. Thus we can use pr_err_ratelimited() instead to reduce the duplicate CMA warning. Link: https://lkml.kernel.org/r/ce2251ef49e1727a9a40531d1996660b05462bd2.1615279825.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: David Hildenbrand Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index b44a71eb3174..a862ff8c23ce 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -500,8 +500,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", - __func__, cma->name, count, ret); + pr_err_ratelimited("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", + __func__, cma->name, count, ret); cma_debug_show_areas(cma); } From 7bc1aec5e28765ad18742824b3b972471807a632 Mon Sep 17 00:00:00 2001 From: Liam Mark Date: Tue, 4 May 2021 18:37:25 -0700 Subject: [PATCH 1154/1379] mm: cma: add trace events for CMA alloc perf testing Add cma and migrate trace events to enable CMA allocation performance to be measured via ftrace. [georgi.djakov@linaro.org: add the CMA instance name to the cma_alloc_start trace event] Link: https://lkml.kernel.org/r/20210326155414.25006-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210324160740.15901-1-georgi.djakov@linaro.org Signed-off-by: Liam Mark Signed-off-by: Georgi Djakov Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 42 +++++++++++++++++++++++++++++++++- include/trace/events/migrate.h | 22 ++++++++++++++++++ mm/cma.c | 4 ++++ mm/migrate.c | 2 ++ 4 files changed, 69 insertions(+), 1 deletion(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5017a8829270..be1525a10457 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -8,7 +8,7 @@ #include #include -TRACE_EVENT(cma_alloc, +DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), @@ -61,6 +61,46 @@ TRACE_EVENT(cma_release, __entry->count) ); +TRACE_EVENT(cma_alloc_start, + + TP_PROTO(const char *name, unsigned int count, unsigned int align), + + TP_ARGS(name, count, align), + + TP_STRUCT__entry( + __string(name, name) + __field(unsigned int, count) + __field(unsigned int, align) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->count = count; + __entry->align = align; + ), + + TP_printk("name=%s count=%u align=%u", + __get_str(name), + __entry->count, + __entry->align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + +DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, + + TP_PROTO(unsigned long pfn, const struct page *page, + unsigned int count, unsigned int align), + + TP_ARGS(pfn, page, count, align) +); + #endif /* _TRACE_CMA_H */ /* This part must be outside protection */ diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 4d434398d64d..f2c990603888 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -81,6 +81,28 @@ TRACE_EVENT(mm_migrate_pages, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); + +TRACE_EVENT(mm_migrate_pages_start, + + TP_PROTO(enum migrate_mode mode, int reason), + + TP_ARGS(mode, reason), + + TP_STRUCT__entry( + __field(enum migrate_mode, mode) + __field(int, reason) + ), + + TP_fast_assign( + __entry->mode = mode; + __entry->reason = reason; + ), + + TP_printk("mode=%s reason=%s", + __print_symbolic(__entry->mode, MIGRATE_MODE), + __print_symbolic(__entry->reason, MIGRATE_REASON)) +); + #endif /* _TRACE_MIGRATE_H */ /* This part must be outside protection */ diff --git a/mm/cma.c b/mm/cma.c index a862ff8c23ce..d1a4c06b5039 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -443,6 +443,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, if (!count) goto out; + trace_cma_alloc_start(cma->name, count, align); + mask = cma_bitmap_aligned_mask(cma, align); offset = cma_bitmap_aligned_offset(cma, align); bitmap_maxno = cma_bitmap_maxno(cma); @@ -483,6 +485,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); + + trace_cma_alloc_busy_retry(pfn, pfn_to_page(pfn), count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } diff --git a/mm/migrate.c b/mm/migrate.c index 30c65c2be30b..6b37d00890ca 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1418,6 +1418,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int rc, nr_subpages; LIST_HEAD(ret_pages); + trace_mm_migrate_pages_start(mode, reason); + if (!swapwrite) current->flags |= PF_SWAPWRITE; From 43ca106fa8ec7d684776fbe561214d3b2b7cb9cb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:28 -0700 Subject: [PATCH 1155/1379] mm: cma: support sysfs Since CMA is getting used more widely, it's more important to keep monitoring CMA statistics for system health since it's directly related to user experience. This patch introduces sysfs statistics for CMA, in order to provide some basic monitoring of the CMA allocator. * the number of CMA page successful allocations * the number of CMA page allocation failures These two values allow the user to calcuate the allocation failure rate for each CMA area. e.g.) /sys/kernel/mm/cma/WIFI/alloc_pages_[success|fail] /sys/kernel/mm/cma/SENSOR/alloc_pages_[success|fail] /sys/kernel/mm/cma/BLUETOOTH/alloc_pages_[success|fail] The cma_stat was intentionally allocated by dynamic allocation to harmonize with kobject lifetime management. https://lore.kernel.org/linux-mm/YCOAmXqt6dZkCQYs@kroah.com/ Link: https://lkml.kernel.org/r/20210324230759.2213957-1-minchan@kernel.org Link: https://lore.kernel.org/linux-mm/20210316100433.17665-1-colin.king@canonical.com/ Signed-off-by: Minchan Kim Signed-off-by: Colin Ian King Tested-by: Dmitry Osipenko Reviewed-by: Dmitry Osipenko Reviewed-by: Greg Kroah-Hartman Reviewed-by: John Hubbard Tested-by: Anders Roxell Cc: Suren Baghdasaryan Cc: John Dias Cc: Matthew Wilcox (Oracle) Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-kernel-mm-cma | 25 ++++ mm/Kconfig | 7 ++ mm/Makefile | 1 + mm/cma.c | 8 +- mm/cma.h | 23 ++++ mm/cma_sysfs.c | 112 ++++++++++++++++++ 6 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-cma create mode 100644 mm/cma_sysfs.c diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma new file mode 100644 index 000000000000..02b2bb60c296 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma @@ -0,0 +1,25 @@ +What: /sys/kernel/mm/cma/ +Date: Feb 2021 +Contact: Minchan Kim +Description: + /sys/kernel/mm/cma/ contains a subdirectory for each CMA + heap name (also sometimes called CMA areas). + + Each CMA heap subdirectory (that is, each + /sys/kernel/mm/cma/ directory) contains the + following items: + + alloc_pages_success + alloc_pages_fail + +What: /sys/kernel/mm/cma//alloc_pages_success +Date: Feb 2021 +Contact: Minchan Kim +Description: + the number of pages CMA API succeeded to allocate + +What: /sys/kernel/mm/cma//alloc_pages_fail +Date: Feb 2021 +Contact: Minchan Kim +Description: + the number of pages CMA API failed to allocate diff --git a/mm/Kconfig b/mm/Kconfig index f4ceeaedb966..5f7d7b3d8cc2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -518,6 +518,13 @@ config CMA_DEBUGFS help Turns on the DebugFS interface for CMA. +config CMA_SYSFS + bool "CMA information through sysfs interface" + depends on CMA && SYSFS + help + This option exposes some sysfs attributes to get information + from CMA. + config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA diff --git a/mm/Makefile b/mm/Makefile index c0135e385984..809033d77710 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -109,6 +109,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/cma.c b/mm/cma.c index d1a4c06b5039..2380f2571eb5 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -511,10 +511,14 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): returned %p\n", __func__, page); out: - if (page) + if (page) { count_vm_event(CMA_ALLOC_SUCCESS); - else + cma_sysfs_account_success_pages(cma, count); + } else { count_vm_event(CMA_ALLOC_FAIL); + if (cma) + cma_sysfs_account_fail_pages(cma, count); + } return page; } diff --git a/mm/cma.h b/mm/cma.h index c46c5050aaa9..2c775877eae2 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -3,6 +3,12 @@ #define __MM_CMA_H__ #include +#include + +struct cma_kobject { + struct kobject kobj; + struct cma *cma; +}; struct cma { unsigned long base_pfn; @@ -16,6 +22,14 @@ struct cma { struct debugfs_u32_array dfs_bitmap; #endif char name[CMA_MAX_NAME]; +#ifdef CONFIG_CMA_SYSFS + /* the number of CMA page successful allocations */ + atomic64_t nr_pages_succeeded; + /* the number of CMA page allocation failures */ + atomic64_t nr_pages_failed; + /* kobject requires dynamic object */ + struct cma_kobject *cma_kobj; +#endif }; extern struct cma cma_areas[MAX_CMA_AREAS]; @@ -26,4 +40,13 @@ static inline unsigned long cma_bitmap_maxno(struct cma *cma) return cma->count >> cma->order_per_bit; } +#ifdef CONFIG_CMA_SYSFS +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages); +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages); +#else +static inline void cma_sysfs_account_success_pages(struct cma *cma, + unsigned long nr_pages) {}; +static inline void cma_sysfs_account_fail_pages(struct cma *cma, + unsigned long nr_pages) {}; +#endif #endif diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c new file mode 100644 index 000000000000..eb2f39caff59 --- /dev/null +++ b/mm/cma_sysfs.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CMA SysFS Interface + * + * Copyright (c) 2021 Minchan Kim + */ + +#include +#include +#include + +#include "cma.h" + +#define CMA_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +void cma_sysfs_account_success_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_succeeded); +} + +void cma_sysfs_account_fail_pages(struct cma *cma, unsigned long nr_pages) +{ + atomic64_add(nr_pages, &cma->nr_pages_failed); +} + +static inline struct cma *cma_from_kobj(struct kobject *kobj) +{ + return container_of(kobj, struct cma_kobject, kobj)->cma; +} + +static ssize_t alloc_pages_success_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", + atomic64_read(&cma->nr_pages_succeeded)); +} +CMA_ATTR_RO(alloc_pages_success); + +static ssize_t alloc_pages_fail_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct cma *cma = cma_from_kobj(kobj); + + return sysfs_emit(buf, "%llu\n", atomic64_read(&cma->nr_pages_failed)); +} +CMA_ATTR_RO(alloc_pages_fail); + +static void cma_kobj_release(struct kobject *kobj) +{ + struct cma *cma = cma_from_kobj(kobj); + struct cma_kobject *cma_kobj = cma->cma_kobj; + + kfree(cma_kobj); + cma->cma_kobj = NULL; +} + +static struct attribute *cma_attrs[] = { + &alloc_pages_success_attr.attr, + &alloc_pages_fail_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(cma); + +static struct kobj_type cma_ktype = { + .release = cma_kobj_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = cma_groups, +}; + +static int __init cma_sysfs_init(void) +{ + struct kobject *cma_kobj_root; + struct cma_kobject *cma_kobj; + struct cma *cma; + int i, err; + + cma_kobj_root = kobject_create_and_add("cma", mm_kobj); + if (!cma_kobj_root) + return -ENOMEM; + + for (i = 0; i < cma_area_count; i++) { + cma_kobj = kzalloc(sizeof(*cma_kobj), GFP_KERNEL); + if (!cma_kobj) { + err = -ENOMEM; + goto out; + } + + cma = &cma_areas[i]; + cma->cma_kobj = cma_kobj; + cma_kobj->cma = cma; + err = kobject_init_and_add(&cma_kobj->kobj, &cma_ktype, + cma_kobj_root, "%s", cma->name); + if (err) { + kobject_put(&cma_kobj->kobj); + goto out; + } + } + + return 0; +out: + while (--i >= 0) { + cma = &cma_areas[i]; + kobject_put(&cma->cma_kobj->kobj); + } + kobject_put(cma_kobj_root); + + return err; +} +subsys_initcall(cma_sysfs_init); From 3aab8ae7aace3388da319a233edf48f0f5d26a44 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:31 -0700 Subject: [PATCH 1156/1379] mm: cma: add the CMA instance name to cma trace events There were missing places to add cma instance name. To identify each CMA instance, let's add the name for every cma trace. This patch also changes the existing cma_trace_alloc to cma_trace_finish since we have cma_alloc_start[1]. [1] https://lore.kernel.org/linux-mm/20210324160740.15901-1-georgi.djakov@linaro.org Link: https://lkml.kernel.org/r/20210330220237.748899-1-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Liam Mark Cc: Georgi Djakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/cma.h | 28 +++++++++++++++++----------- mm/cma.c | 7 ++++--- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index be1525a10457..5cf385ae7c08 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -10,12 +10,13 @@ DECLARE_EVENT_CLASS(cma_alloc_class, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align), + TP_ARGS(name, pfn, page, count, align), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) @@ -23,13 +24,15 @@ DECLARE_EVENT_CLASS(cma_alloc_class, ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; __entry->align = align; ), - TP_printk("pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count, @@ -38,24 +41,27 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count), - TP_ARGS(pfn, page, count), + TP_ARGS(name, pfn, page, count), TP_STRUCT__entry( + __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) __field(unsigned int, count) ), TP_fast_assign( + __assign_str(name, name); __entry->pfn = pfn; __entry->page = page; __entry->count = count; ), - TP_printk("pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%u", + __get_str(name), __entry->pfn, __entry->page, __entry->count) @@ -85,20 +91,20 @@ TRACE_EVENT(cma_alloc_start, __entry->align) ); -DEFINE_EVENT(cma_alloc_class, cma_alloc, +DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, - TP_PROTO(unsigned long pfn, const struct page *page, + TP_PROTO(const char *name, unsigned long pfn, const struct page *page, unsigned int count, unsigned int align), - TP_ARGS(pfn, page, count, align) + TP_ARGS(name, pfn, page, count, align) ); #endif /* _TRACE_CMA_H */ diff --git a/mm/cma.c b/mm/cma.c index 2380f2571eb5..cdad8c4de921 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -486,12 +486,13 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, pr_debug("%s(): memory range at %p is busy, retrying\n", __func__, pfn_to_page(pfn)); - trace_cma_alloc_busy_retry(pfn, pfn_to_page(pfn), count, align); + trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), + count, align); /* try again with a bit different memory target */ start = bitmap_no + mask + 1; } - trace_cma_alloc(pfn, page, count, align); + trace_cma_alloc_finish(cma->name, pfn, page, count, align); /* * CMA can allocate multiple page blocks, which results in different @@ -551,7 +552,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) free_contig_range(pfn, count); cma_clear_bitmap(cma, pfn, count); - trace_cma_release(pfn, pages, count); + trace_cma_release(cma->name, pfn, pages, count); return true; } From 78fa51503fdbe463c96eef4c3cf69ca54032647a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 4 May 2021 18:37:34 -0700 Subject: [PATCH 1157/1379] mm: use proper type for cma_[alloc|release] size_t in cma_alloc is confusing since it makes people think it's byte count, not pages. Change it to unsigned long[1]. The unsigned int in cma_release is also not right so change it. Since we have unsigned long in cma_release, free_contig_range should also respect it. [1] 67a2e213e7e9, mm: cma: fix incorrect type conversion for size during dma allocation Link: https://lore.kernel.org/linux-mm/20210324043434.GP1719932@casper.infradead.org/ Link: https://lkml.kernel.org/r/20210331164018.710560-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: David Hildenbrand Cc: Matthew Wilcox Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 4 ++-- include/linux/gfp.h | 2 +- include/trace/events/cma.h | 22 +++++++++++----------- mm/cma.c | 17 +++++++++-------- mm/page_alloc.c | 6 +++--- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/linux/cma.h b/include/linux/cma.h index 217999c8a762..53fd8c3cdbd0 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -44,9 +44,9 @@ extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, const char *name, struct cma **res_cma); -extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, +extern struct page *cma_alloc(struct cma *cma, unsigned long count, unsigned int align, bool no_warn); -extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); +extern bool cma_release(struct cma *cma, const struct page *pages, unsigned long count); extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); #endif diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 26f4d907254a..8a5f6c3d7dba 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -657,7 +657,7 @@ extern int alloc_contig_range(unsigned long start, unsigned long end, extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif -void free_contig_range(unsigned long pfn, unsigned int nr_pages); +void free_contig_range(unsigned long pfn, unsigned long nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 5cf385ae7c08..c3d354702cb0 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -11,7 +11,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align), @@ -19,7 +19,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -31,7 +31,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, __entry->align = align; ), - TP_printk("name=%s pfn=%lx page=%p count=%u align=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu align=%u", __get_str(name), __entry->pfn, __entry->page, @@ -42,7 +42,7 @@ DECLARE_EVENT_CLASS(cma_alloc_class, TRACE_EVENT(cma_release, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count), + unsigned long count), TP_ARGS(name, pfn, page, count), @@ -50,7 +50,7 @@ TRACE_EVENT(cma_release, __string(name, name) __field(unsigned long, pfn) __field(const struct page *, page) - __field(unsigned int, count) + __field(unsigned long, count) ), TP_fast_assign( @@ -60,7 +60,7 @@ TRACE_EVENT(cma_release, __entry->count = count; ), - TP_printk("name=%s pfn=%lx page=%p count=%u", + TP_printk("name=%s pfn=%lx page=%p count=%lu", __get_str(name), __entry->pfn, __entry->page, @@ -69,13 +69,13 @@ TRACE_EVENT(cma_release, TRACE_EVENT(cma_alloc_start, - TP_PROTO(const char *name, unsigned int count, unsigned int align), + TP_PROTO(const char *name, unsigned long count, unsigned int align), TP_ARGS(name, count, align), TP_STRUCT__entry( __string(name, name) - __field(unsigned int, count) + __field(unsigned long, count) __field(unsigned int, align) ), @@ -85,7 +85,7 @@ TRACE_EVENT(cma_alloc_start, __entry->align = align; ), - TP_printk("name=%s count=%u align=%u", + TP_printk("name=%s count=%lu align=%u", __get_str(name), __entry->count, __entry->align) @@ -94,7 +94,7 @@ TRACE_EVENT(cma_alloc_start, DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); @@ -102,7 +102,7 @@ DEFINE_EVENT(cma_alloc_class, cma_alloc_finish, DEFINE_EVENT(cma_alloc_class, cma_alloc_busy_retry, TP_PROTO(const char *name, unsigned long pfn, const struct page *page, - unsigned int count, unsigned int align), + unsigned long count, unsigned int align), TP_ARGS(name, pfn, page, count, align) ); diff --git a/mm/cma.c b/mm/cma.c index cdad8c4de921..995e15480937 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -79,7 +79,7 @@ static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, } static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, - unsigned int count) + unsigned long count) { unsigned long bitmap_no, bitmap_count; unsigned long flags; @@ -423,21 +423,21 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, - bool no_warn) +struct page *cma_alloc(struct cma *cma, unsigned long count, + unsigned int align, bool no_warn) { unsigned long mask, offset; unsigned long pfn = -1; unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; - size_t i; + unsigned long i; struct page *page = NULL; int ret = -ENOMEM; if (!cma || !cma->count || !cma->bitmap) goto out; - pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma, + pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, count, align); if (!count) @@ -505,7 +505,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, } if (ret && !no_warn) { - pr_err_ratelimited("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n", + pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n", __func__, cma->name, count, ret); cma_debug_show_areas(cma); } @@ -534,14 +534,15 @@ out: * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ -bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) +bool cma_release(struct cma *cma, const struct page *pages, + unsigned long count) { unsigned long pfn; if (!cma || !pages) return false; - pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count); + pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 80754fc1f1ff..d12299c08b95 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8973,9 +8973,9 @@ struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, } #endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned int nr_pages) +void free_contig_range(unsigned long pfn, unsigned long nr_pages) { - unsigned int count = 0; + unsigned long count = 0; for (; nr_pages--; pfn++) { struct page *page = pfn_to_page(pfn); @@ -8983,7 +8983,7 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) count += page_count(page) != 1; __free_page(page); } - WARN(count != 0, "%d pages are still in use!\n", count); + WARN(count != 0, "%lu pages are still in use!\n", count); } EXPORT_SYMBOL(free_contig_range); From a08e1e11c90f3e6020963b3ad097680768bc8567 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:37 -0700 Subject: [PATCH 1158/1379] ksm: remove redundant VM_BUG_ON_PAGE() on stable_tree_search() Patch series "Cleanup and fixup for ksm". This series contains cleanups to remove unnecessary VM_BUG_ON_PAGE and dedicated macro KSM_FLAG_MASK. Also this fixes potential missing rmap_item for stable_node which would result in failed rmap_walk_ksm(). More details can be found in the respective changelogs. This patch (of 4): The same VM_BUG_ON_PAGE() check is already done in the callee. Remove these extra caller one to simplify code slightly. Link: https://lkml.kernel.org/r/20210330140228.45635-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210330140228.45635-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 9694ee2c71de..7b6bc1036596 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1771,7 +1771,6 @@ chain_append: * stable_node_dup is the dup to replace. */ if (stable_node_dup == stable_node) { - VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, @@ -1785,7 +1784,6 @@ chain_append: * of the current nid for this page * content. */ - VM_BUG_ON(!is_stable_node_chain(stable_node)); VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); From 3e96b6a2e9ad929a3230a22f4d64a74671a0720b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:39 -0700 Subject: [PATCH 1159/1379] ksm: use GET_KSM_PAGE_NOLOCK to get ksm page in remove_rmap_item_from_tree() It's unnecessary to lock the page when get ksm page if we're going to remove the rmap item as page migration is irrelevant in this case. Use GET_KSM_PAGE_NOLOCK instead to save some page lock cycles. Link: https://lkml.kernel.org/r/20210330140228.45635-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 7b6bc1036596..31dcbd4d3ac9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -778,12 +778,11 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); + page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); if (!page) goto out; hlist_del(&rmap_item->hlist); - unlock_page(page); put_page(page); if (!hlist_empty(&stable_node->hlist)) From cd7fae26024690c772ec66719735c58a12034088 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:42 -0700 Subject: [PATCH 1160/1379] ksm: remove dedicated macro KSM_FLAG_MASK The macro KSM_FLAG_MASK is used in rmap_walk_ksm() only. So we can replace ~KSM_FLAG_MASK with PAGE_MASK to remove this dedicated macro and make code more consistent because PAGE_MASK is used elsewhere in this file. Link: https://lkml.kernel.org/r/20210330140228.45635-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 31dcbd4d3ac9..4b25f642f4ec 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -215,8 +215,6 @@ struct rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ -#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG) - /* to mask all the flags */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -2631,7 +2629,7 @@ again: vma = vmac->vma; /* Ignore the stable/unstable/sqnr flags */ - addr = rmap_item->address & ~KSM_FLAG_MASK; + addr = rmap_item->address & PAGE_MASK; if (addr < vma->vm_start || addr >= vma->vm_end) continue; From c89a384e2551c692a9fe60d093fd7080f50afc51 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 May 2021 18:37:45 -0700 Subject: [PATCH 1161/1379] ksm: fix potential missing rmap_item for stable_node When removing rmap_item from stable tree, STABLE_FLAG of rmap_item is cleared with head reserved. So the following scenario might happen: For ksm page with rmap_item1: cmp_and_merge_page stable_node->head = &migrate_nodes; remove_rmap_item_from_tree, but head still equal to stable_node; try_to_merge_with_ksm_page failed; return; For the same ksm page with rmap_item2, stable node migration succeed this time. The stable_node->head does not equal to migrate_nodes now. For ksm page with rmap_item1 again: cmp_and_merge_page stable_node->head != &migrate_nodes && rmap_item->head == stable_node return; We would miss the rmap_item for stable_node and might result in failed rmap_walk_ksm(). Fix this by set rmap_item->head to NULL when rmap_item is removed from stable tree. Link: https://lkml.kernel.org/r/20210330140228.45635-5-linmiaohe@huawei.com Fixes: 4146d2d673e8 ("ksm: make !merge_across_nodes migration safe") Signed-off-by: Miaohe Lin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/ksm.c b/mm/ksm.c index 4b25f642f4ec..e2ba4ebc2500 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -791,6 +791,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); + rmap_item->head = NULL; rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { From 420be4edefe503f8dbd6ab914b11a57a0d339660 Mon Sep 17 00:00:00 2001 From: Chengyang Fan Date: Tue, 4 May 2021 18:37:48 -0700 Subject: [PATCH 1162/1379] mm/ksm: remove unused parameter from remove_trailing_rmap_items() Since commit 6514d511dbe5 ("ksm: singly-linked rmap_list") was merged, remove_trailing_rmap_items() doesn't use the 'mm_slot' parameter. So remove it, and update caller accordingly. Link: https://lkml.kernel.org/r/20210330121320.1693474-1-cy.fan@huawei.com Signed-off-by: Chengyang Fan Reviewed-by: David Hildenbrand Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index e2ba4ebc2500..b321a67ebaa9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -815,8 +815,7 @@ out: cond_resched(); /* we're called from many long loops */ } -static void remove_trailing_rmap_items(struct mm_slot *mm_slot, - struct rmap_item **rmap_list) +static void remove_trailing_rmap_items(struct rmap_item **rmap_list) { while (*rmap_list) { struct rmap_item *rmap_item = *rmap_list; @@ -987,7 +986,7 @@ static int unmerge_and_remove_all_rmap_items(void) goto error; } - remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + remove_trailing_rmap_items(&mm_slot->rmap_list); mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); @@ -2333,7 +2332,7 @@ next_mm: * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ - remove_trailing_rmap_items(slot, ksm_scan.rmap_list); + remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(slot->mm_list.next, From 76d8cc3c8f45cc597726616f11db4180f7e21ce0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:37:51 -0700 Subject: [PATCH 1163/1379] mm: restore node stat checking in /proc/sys/vm/stat_refresh In v4.7 commit 52b6f46bc163 ("mm: /proc/sys/vm/stat_refresh to force vmstat update") introduced vmstat_refresh(), with its vmstat underflow checking; then in v4.8 commit 75ef71840539 ("mm, vmstat: add infrastructure for per-node vmstats") split NR_VM_NODE_STAT_ITEMS out of NR_VM_ZONE_STAT_ITEMS without updating vmstat_refresh(): so it has been missing out much of the vmstat underflow checking ever since. Reinstate it. Thanks to Roman Gushchin for tangentially pointing this out. Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2102251502240.13363@eggly.anvils Signed-off-by: Hugh Dickins Cc: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index 49a8456ec079..14d0ba4fd46b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1875,6 +1875,14 @@ int vmstat_refresh(struct ctl_table *table, int write, } } #endif + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_node_stat[i]); + if (val < 0) { + pr_warn("%s: %s %ld\n", + __func__, node_stat_name(i), val); + err = -EINVAL; + } + } if (err) return err; if (write) From 6d99a4c029c01cd7d075f7f9fa3b8b620e49a9f7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:37:54 -0700 Subject: [PATCH 1164/1379] mm: no more EINVAL from /proc/sys/vm/stat_refresh EINVAL was good for drawing the refresher's attention to a warning in dmesg, but became very tiresome when running test suites scripted with "set -e": an underflow from a bug in one feature would cause unrelated tests much later to fail, just because their /proc/sys/vm/stat_refresh touch failed with that error. Stop doing that. Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2102251510410.13363@eggly.anvils Signed-off-by: Hugh Dickins Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 14d0ba4fd46b..1fb91d5a2b60 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1862,7 +1862,6 @@ int vmstat_refresh(struct ctl_table *table, int write, if (val < 0) { pr_warn("%s: %s %ld\n", __func__, zone_stat_name(i), val); - err = -EINVAL; } } #ifdef CONFIG_NUMA @@ -1871,7 +1870,6 @@ int vmstat_refresh(struct ctl_table *table, int write, if (val < 0) { pr_warn("%s: %s %ld\n", __func__, numa_stat_name(i), val); - err = -EINVAL; } } #endif @@ -1880,11 +1878,8 @@ int vmstat_refresh(struct ctl_table *table, int write, if (val < 0) { pr_warn("%s: %s %ld\n", __func__, node_stat_name(i), val); - err = -EINVAL; } } - if (err) - return err; if (write) *ppos += *lenp; else From 75083aae114c2738af28eef2fb0c2515e818885a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:37:57 -0700 Subject: [PATCH 1165/1379] mm: /proc/sys/vm/stat_refresh skip checking known negative stats vmstat_refresh() can occasionally catch nr_zone_write_pending and nr_writeback when they are transiently negative. The reason is partly that the interrupt which decrements them in test_clear_page_writeback() can come in before __test_set_page_writeback() got to increment them; but transient negatives are still seen even when that is prevented, and I am not yet certain why (but see Roman's note below). Those stats are not buggy, they have never been seen to drift away from 0 permanently: so just avoid the annoyance of showing a warning on them. Similarly avoid showing a warning on nr_free_cma: CMA users have seen that one reported negative from /proc/sys/vm/stat_refresh too, but it does drift away permanently: I believe that's because its incrementation and decrementation are decided by page migratetype, but the migratetype of a pageblock is not guaranteed to be constant. Roman Gushchin points out: "For performance reasons, vmstat counters are incremented and decremented using per-cpu batches. vmstat_refresh() flushes the per-cpu batches on all CPUs, to get values as accurate as possible; but this method is not atomic, so the resulting value is not always precise. As a consequence, for those counters whose actual value is close to 0, a small negative value may occasionally be reported. If the value is small and the state is transient, it is not an indication of an error" Link: https://lore.kernel.org/linux-mm/20200714173747.3315771-1-guro@fb.com/ Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2103012158540.7549@eggly.anvils Signed-off-by: Hugh Dickins Reported-by: Roman Gushchin Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index 1fb91d5a2b60..5437a7861089 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1858,6 +1858,14 @@ int vmstat_refresh(struct ctl_table *table, int write, if (err) return err; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_ZONE_WRITE_PENDING: + case NR_FREE_CMA_PAGES: + continue; + } val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", @@ -1874,6 +1882,13 @@ int vmstat_refresh(struct ctl_table *table, int write, } #endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + /* + * Skip checking stats known to go negative occasionally. + */ + switch (i) { + case NR_WRITEBACK: + continue; + } val = atomic_long_read(&vm_node_stat[i]); if (val < 0) { pr_warn("%s: %s %ld\n", From c675790972916d3722809fcc52c5c4f8421b2e5d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 4 May 2021 18:38:00 -0700 Subject: [PATCH 1166/1379] mm: /proc/sys/vm/stat_refresh stop checking monotonic numa stats All of the VM NUMA stats are event counts, incremented never decremented: it is not very useful for vmstat_refresh() to check them throughout their first aeon, then warn on them throughout their next. Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2102251514110.13363@eggly.anvils Signed-off-by: Hugh Dickins Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 5437a7861089..06cd78dc914d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1872,15 +1872,6 @@ int vmstat_refresh(struct ctl_table *table, int write, __func__, zone_stat_name(i), val); } } -#ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) { - val = atomic_long_read(&vm_numa_stat[i]); - if (val < 0) { - pr_warn("%s: %s %ld\n", - __func__, numa_stat_name(i), val); - } - } -#endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { /* * Skip checking stats known to go negative occasionally. From 575299ea18a8c0575d4c2ef6ad3fa4d41d529d1c Mon Sep 17 00:00:00 2001 From: Saravanan D Date: Tue, 4 May 2021 18:38:03 -0700 Subject: [PATCH 1167/1379] x86/mm: track linear mapping split events To help with debugging the sluggishness caused by TLB miss/reload, we introduce monotonic hugepage [direct mapped] split event counts since system state: SYSTEM_RUNNING to be displayed as part of /proc/vmstat in x86 servers The lifetime split event information will be displayed at the bottom of /proc/vmstat .... swap_ra 0 swap_ra_hit 0 direct_map_level2_splits 94 direct_map_level3_splits 4 nr_unstable 0 .... One of the many lasting sources of direct hugepage splits is kernel tracing (kprobes, tracepoints). Note that the kernel's code segment [512 MB] points to the same physical addresses that have been already mapped in the kernel's direct mapping range. Source : Documentation/x86/x86_64/mm.rst When we enable kernel tracing, the kernel has to modify attributes/permissions of the text segment hugepages that are direct mapped causing them to split. Kernel's direct mapped hugepages do not coalesce back after split and remain in place for the remainder of the lifetime. An instance of direct page splits when we turn on dynamic kernel tracing .... cat /proc/vmstat | grep -i direct_map_level direct_map_level2_splits 784 direct_map_level3_splits 12 bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @ [pid, comm] = count(); }' cat /proc/vmstat | grep -i direct_map_level direct_map_level2_splits 789 direct_map_level3_splits 12 .... Link: https://lkml.kernel.org/r/20210218235744.1040634-1-saravanand@fb.com Signed-off-by: Saravanan D Acked-by: Tejun Heo Acked-by: Johannes Weiner Acked-by: Dave Hansen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pat/set_memory.c | 8 ++++++++ include/linux/vm_event_item.h | 4 ++++ mm/vmstat.c | 4 ++++ 3 files changed, 16 insertions(+) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 427980617557..156cd235659f 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -91,6 +93,12 @@ static void split_page_count(int level) return; direct_pages_count[level]--; + if (system_state == SYSTEM_RUNNING) { + if (level == PG_LEVEL_2M) + count_vm_event(DIRECT_MAP_LEVEL2_SPLIT); + else if (level == PG_LEVEL_1G) + count_vm_event(DIRECT_MAP_LEVEL3_SPLIT); + } direct_pages_count[level - 1] += PTRS_PER_PTE; } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 21d7c7f72f1c..ae0dd1948c2b 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -124,6 +124,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#endif +#ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, + DIRECT_MAP_LEVEL3_SPLIT, #endif NR_VM_EVENT_ITEMS }; diff --git a/mm/vmstat.c b/mm/vmstat.c index 06cd78dc914d..5ba118521ded 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1369,6 +1369,10 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif +#ifdef CONFIG_X86 + "direct_map_level2_splits", + "direct_map_level3_splits", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ From fce000b1bc08c64c0cff4bb705b3970bd6fc1e34 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 4 May 2021 18:38:06 -0700 Subject: [PATCH 1168/1379] mm/mmap.c: don't unlock VMAs in remap_file_pages() Since this call uses MAP_FIXED, do_mmap() will munlock the necessary range. There is also an error in the loop test expression which will evaluate as false and the loop body has never execute. Link: https://lkml.kernel.org/r/20210223235010.2296915-1-Liam.Howlett@Oracle.com Signed-off-by: Liam R. Howlett Acked-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 347ef9b83bb5..c1b848fa7da6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3029,25 +3029,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; - if (vma->vm_flags & VM_LOCKED) { - struct vm_area_struct *tmp; + if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; - /* drop PG_Mlocked flag for over-mapped range */ - for (tmp = vma; tmp->vm_start >= start + size; - tmp = tmp->vm_next) { - /* - * Split pmd and munlock page on the border - * of the range. - */ - vma_adjust_trans_huge(tmp, start, start + size, 0); - - munlock_vma_pages_range(tmp, - max(tmp->vm_start, start), - min(tmp->vm_end, start + size)); - } - } - file = get_file(vma->vm_file); ret = do_mmap(vma->vm_file, start, size, prot, flags, pgoff, &populate, NULL); From c2280be81de404e99f66c7249496b0355406ed94 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:09 -0700 Subject: [PATCH 1169/1379] mm: generalize ARCH_HAS_CACHE_LINE_SIZE Patch series "mm: some config cleanups", v2. This series contains config cleanup patches which reduces code duplication across platforms and also improves maintainability. There is no functional change intended with this series. This patch (of 6): ARCH_HAS_CACHE_LINE_SIZE config has duplicate definitions on platforms that subscribe it. Instead, just make it a generic option which can be selected on applicable platforms. This change reduces code duplication and makes it cleaner. Link: https://lkml.kernel.org/r/1617259448-22529-1-git-send-email-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/1617259448-22529-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Acked-by: Vineet Gupta [arc] Cc: Will Deacon Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Albert Ou Cc: Alexander Viro Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/Kconfig | 4 +--- arch/arm64/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- mm/Kconfig | 3 +++ 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index bc8d6aecfbbd..fab05f7189c0 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_PTE_SPECIAL @@ -48,9 +49,6 @@ config ARC select HAVE_ARCH_JUMP_LABEL if ISA_ARCV2 && !CPU_ENDIAN_BE32 select SET_FS -config ARCH_HAS_CACHE_LINE_SIZE - def_bool y - config TRACE_IRQFLAGS_SUPPORT def_bool y diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 04c69f606537..86dfaac79e49 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,6 +11,7 @@ config ARM64 select ACPI_PPTT if ACPI select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE + select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT @@ -1074,9 +1075,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_HAS_CACHE_LINE_SIZE - def_bool y - config ARCH_HAS_FILTER_PGPROT def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1c350e8782ed..78c475d8c409 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,6 +61,7 @@ config X86 select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE select ARCH_HAS_DEVMEM_IS_ALLOWED @@ -316,9 +317,6 @@ config GENERIC_CALIBRATE_DELAY config ARCH_HAS_CPU_RELAX def_bool y -config ARCH_HAS_CACHE_LINE_SIZE - def_bool y - config ARCH_HAS_FILTER_PGPROT def_bool y diff --git a/mm/Kconfig b/mm/Kconfig index 5f7d7b3d8cc2..8d6fb99aeded 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -772,6 +772,9 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. +config ARCH_HAS_CACHE_LINE_SIZE + bool + config ARCH_HAS_PTE_DEVMAP bool From 855f9a8e87fe3912a1c00eb63f36880d1ad32e40 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:13 -0700 Subject: [PATCH 1170/1379] mm: generalize SYS_SUPPORTS_HUGETLBFS (rename as ARCH_SUPPORTS_HUGETLBFS) SYS_SUPPORTS_HUGETLBFS config has duplicate definitions on platforms that subscribe it. Instead, just make it a generic option which can be selected on applicable platforms. Also rename it as ARCH_SUPPORTS_HUGETLBFS instead. This reduces code duplication and makes it cleaner. Link: https://lkml.kernel.org/r/1617259448-22529-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Acked-by: Palmer Dabbelt [riscv] Acked-by: Michael Ellerman [powerpc] Cc: Russell King Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Albert Ou Cc: Yoshinori Sato Cc: Rich Felker Cc: Alexander Viro Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 5 +---- arch/arm64/Kconfig | 4 +--- arch/mips/Kconfig | 6 +----- arch/parisc/Kconfig | 5 +---- arch/powerpc/Kconfig | 3 --- arch/powerpc/platforms/Kconfig.cputype | 6 +++--- arch/riscv/Kconfig | 5 +---- arch/sh/Kconfig | 5 +---- fs/Kconfig | 5 ++++- 9 files changed, 13 insertions(+), 31 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 085c830d344b..13fa7ecf195c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -31,6 +31,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST @@ -1511,10 +1512,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config SYS_SUPPORTS_HUGETLBFS - def_bool y - depends on ARM_LPAE - config HAVE_ARCH_TRANSPARENT_HUGEPAGE def_bool y depends on ARM_LPAE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 86dfaac79e49..0fd5d373d7cf 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -73,6 +73,7 @@ config ARM64 select ARCH_USE_QUEUED_SPINLOCKS select ARCH_USE_SYM_ANNOTATIONS select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_SUPPORTS_SHADOW_CALL_STACK if CC_HAVE_SHADOW_CALL_STACK select ARCH_SUPPORTS_LTO_CLANG if CPU_LITTLE_ENDIAN @@ -1072,9 +1073,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config SYS_SUPPORTS_HUGETLBFS - def_bool y - config ARCH_HAS_FILTER_PGPROT def_bool y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 49a3c9cd1cb2..ed51970c08e7 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -19,6 +19,7 @@ config MIPS select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_SUPPORTS_HUGETLBFS if CPU_SUPPORTS_HUGEPAGES select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_LD_ORPHAN_WARN @@ -1287,11 +1288,6 @@ config SYS_SUPPORTS_BIG_ENDIAN config SYS_SUPPORTS_LITTLE_ENDIAN bool -config SYS_SUPPORTS_HUGETLBFS - bool - depends on CPU_SUPPORTS_HUGEPAGES - default y - config MIPS_HUGE_TLB_SUPPORT def_bool HUGETLB_PAGE || TRANSPARENT_HUGEPAGE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index afc3b8d03572..bde9907bc5b2 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -12,6 +12,7 @@ config PARISC select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_NO_SG_CHAIN + select ARCH_SUPPORTS_HUGETLBFS if PA20 select ARCH_SUPPORTS_MEMORY_FAILURE select DMA_OPS select RTC_CLASS @@ -138,10 +139,6 @@ config PGTABLE_LEVELS default 3 if 64BIT && PARISC_PAGE_SIZE_4KB default 2 -config SYS_SUPPORTS_HUGETLBFS - def_bool y if PA20 - - menu "Processor type and features" choice diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 965278498dd7..3a2b90e21e69 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -697,9 +697,6 @@ config ARCH_SPARSEMEM_DEFAULT def_bool y depends on PPC_BOOK3S_64 -config SYS_SUPPORTS_HUGETLBFS - bool - config ILLEGAL_POINTER_VALUE hex # This is roughly half way between the top of user space and the bottom diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3ce907523b1e..cec1017813f8 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -40,8 +40,8 @@ config PPC_85xx config PPC_8xx bool "Freescale 8xx" + select ARCH_SUPPORTS_HUGETLBFS select FSL_SOC - select SYS_SUPPORTS_HUGETLBFS select PPC_HAVE_KUEP select PPC_HAVE_KUAP select HAVE_ARCH_VMAP_STACK @@ -95,9 +95,9 @@ config PPC_BOOK3S_64 bool "Server processors" select PPC_FPU select PPC_HAVE_PMU_SUPPORT - select SYS_SUPPORTS_HUGETLBFS select HAVE_ARCH_TRANSPARENT_HUGEPAGE select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE + select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_NUMA_BALANCING select IRQ_WORK select PPC_MM_SLICES @@ -278,9 +278,9 @@ config FSL_BOOKE # this is for common code between PPC32 & PPC64 FSL BOOKE config PPC_FSL_BOOK3E bool + select ARCH_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64 select FSL_EMB_PERFMON select PPC_SMP_MUXED_IPI - select SYS_SUPPORTS_HUGETLBFS if PHYS_64BIT || PPC64 select PPC_DOORBELL default y if FSL_BOOKE diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4515a10c5d22..2a0fc12b8d45 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -30,6 +30,7 @@ config RISCV select ARCH_HAS_STRICT_KERNEL_RWX if MMU select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT + select ARCH_SUPPORTS_HUGETLBFS if MMU select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if 64BIT @@ -165,10 +166,6 @@ config ARCH_WANT_GENERAL_HUGETLB config ARCH_SUPPORTS_UPROBES def_bool y -config SYS_SUPPORTS_HUGETLBFS - depends on MMU - def_bool y - config STACKTRACE_SUPPORT def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index e798e55915c2..a54b0c5de37b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -101,9 +101,6 @@ config SYS_SUPPORTS_APM_EMULATION bool select ARCH_SUSPEND_POSSIBLE -config SYS_SUPPORTS_HUGETLBFS - bool - config SYS_SUPPORTS_SMP bool @@ -175,12 +172,12 @@ config CPU_SH3 config CPU_SH4 bool + select ARCH_SUPPORTS_HUGETLBFS if MMU select CPU_HAS_INTEVT select CPU_HAS_SR_RB select CPU_HAS_FPU if !CPU_SH4AL_DSP select SH_INTC select SYS_SUPPORTS_SH_TMU - select SYS_SUPPORTS_HUGETLBFS if MMU config CPU_SH4A bool diff --git a/fs/Kconfig b/fs/Kconfig index 97e7b77c9309..89a750d292ba 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -223,10 +223,13 @@ config TMPFS_INODE64 If unsure, say N. +config ARCH_SUPPORTS_HUGETLBFS + def_bool n + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ - SYS_SUPPORTS_HUGETLBFS || BROKEN + ARCH_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read From 91024b3ce247213ee43103dffd629623537a569e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:17 -0700 Subject: [PATCH 1171/1379] mm: generalize ARCH_ENABLE_MEMORY_[HOTPLUG|HOTREMOVE] ARCH_ENABLE_MEMORY_[HOTPLUG|HOTREMOVE] configs have duplicate definitions on platforms that subscribe them. Instead, just make them generic options which can be selected on applicable platforms. Link: https://lkml.kernel.org/r/1617259448-22529-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Acked-by: Heiko Carstens [s390] Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Albert Ou Cc: Alexander Viro Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 8 ++------ arch/ia64/Kconfig | 8 ++------ arch/powerpc/Kconfig | 8 ++------ arch/s390/Kconfig | 8 ++------ arch/sh/Kconfig | 2 ++ arch/sh/mm/Kconfig | 8 -------- arch/x86/Kconfig | 10 ++-------- mm/Kconfig | 6 ++++++ 8 files changed, 18 insertions(+), 40 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 0fd5d373d7cf..c0746b2d3fff 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,6 +11,8 @@ config ARM64 select ACPI_PPTT if ACPI select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE + select ARCH_ENABLE_MEMORY_HOTPLUG + select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -311,12 +313,6 @@ config ZONE_DMA32 bool "Support DMA32 zone" if EXPERT default y -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - config SMP def_bool y diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index fd2cfc65fdc9..279252e3e0f7 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -13,6 +13,8 @@ config IA64 select ARCH_MIGHT_HAVE_PC_SERIO select ACPI select ACPI_NUMA if NUMA + select ARCH_ENABLE_MEMORY_HOTPLUG + select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_SUPPORTS_ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI @@ -246,12 +248,6 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - config SCHED_SMT bool "SMT scheduler support" depends on SMP diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3a2b90e21e69..d4333049b813 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -118,6 +118,8 @@ config PPC # Please keep this list sorted alphabetically. # select ARCH_32BIT_OFF_T if PPC32 + select ARCH_ENABLE_MEMORY_HOTPLUG + select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE @@ -512,12 +514,6 @@ config ARCH_CPU_PROBE_RELEASE def_bool y depends on HOTPLUG_CPU -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - config PPC64_SUPPORTS_MEMORY_FAILURE bool "Add support for memory hwpoison" depends on PPC_BOOK3S_64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c1ff874e6c2e..f8b356550daa 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -60,6 +60,8 @@ config S390 imply IMA_SECURE_AND_OR_TRUSTED_BOOT select ARCH_32BIT_USTAT_F_TINODE select ARCH_BINFMT_ELF_STATE + select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM + select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEVMEM_IS_ALLOWED @@ -626,12 +628,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool y -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y if SPARSEMEM - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index a54b0c5de37b..68129537e350 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -2,6 +2,8 @@ config SUPERH def_bool y select ARCH_32BIT_OFF_T + select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM && MMU + select ARCH_ENABLE_MEMORY_HOTREMOVE if SPARSEMEM && MMU select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select ARCH_HAS_BINFMT_FLAT if !MMU diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 77aa2f802d8d..d551a9cac41e 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -136,14 +136,6 @@ config ARCH_SPARSEMEM_DEFAULT config ARCH_SELECT_MEMORY_MODEL def_bool y -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - depends on SPARSEMEM && MMU - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - depends on SPARSEMEM && MMU - config ARCH_MEMORY_PROBE def_bool y depends on MEMORY_HOTPLUG diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 78c475d8c409..306f76fea837 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,8 @@ config X86 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT + select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) + select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL @@ -2427,14 +2429,6 @@ config ARCH_HAS_ADD_PAGES def_bool y depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG -config ARCH_ENABLE_MEMORY_HOTPLUG - def_bool y - depends on X86_64 || (X86_32 && HIGHMEM) - -config ARCH_ENABLE_MEMORY_HOTREMOVE - def_bool y - depends on MEMORY_HOTPLUG - config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA diff --git a/mm/Kconfig b/mm/Kconfig index 8d6fb99aeded..fe4897c3c81b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -148,6 +148,9 @@ config MEMORY_ISOLATION config HAVE_BOOTMEM_INFO_NODE def_bool n +config ARCH_ENABLE_MEMORY_HOTPLUG + bool + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" @@ -176,6 +179,9 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE Say N here if you want the default policy to keep all hot-plugged memory blocks in 'offline' state. +config ARCH_ENABLE_MEMORY_HOTREMOVE + bool + config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) From 1e866974a15be8921fb01f8c4efa93a5157ef690 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:21 -0700 Subject: [PATCH 1172/1379] mm: drop redundant ARCH_ENABLE_[HUGEPAGE|THP]_MIGRATION ARCH_ENABLE_[HUGEPAGE|THP]_MIGRATION configs have duplicate definitions on platforms that subscribe them. Drop these reduntant definitions and instead just select them appropriately. [akpm@linux-foundation.org: s/x86_64/X86_64/, per Oscar] Link: https://lkml.kernel.org/r/1617259448-22529-5-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Albert Ou Cc: Alexander Viro Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Rich Felker Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 10 ++-------- arch/powerpc/platforms/Kconfig.cputype | 5 +---- arch/x86/Kconfig | 10 ++-------- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c0746b2d3fff..4297fbca0faa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,8 +11,10 @@ config ARM64 select ACPI_PPTT if ACPI select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE + select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE + select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE @@ -1916,14 +1918,6 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC -config ARCH_ENABLE_HUGEPAGE_MIGRATION - def_bool y - depends on HUGETLB_PAGE && MIGRATION - -config ARCH_ENABLE_THP_MIGRATION - def_bool y - depends on TRANSPARENT_HUGEPAGE - menu "Power management options" source "kernel/power/Kconfig" diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index cec1017813f8..4465b71b2bff 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -96,6 +96,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_NUMA_BALANCING @@ -420,10 +421,6 @@ config PPC_PKEY depends on PPC_BOOK3S_64 depends on PPC_MEM_KEYS || PPC_KUAP || PPC_KUEP -config ARCH_ENABLE_HUGEPAGE_MIGRATION - def_bool y - depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION - config PPC_MMU_NOHASH def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 306f76fea837..ab581af756cd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,8 +60,10 @@ config X86 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT + select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG + select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL @@ -2437,14 +2439,6 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y depends on X86_64 || X86_PAE -config ARCH_ENABLE_HUGEPAGE_MIGRATION - def_bool y - depends on X86_64 && HUGETLB_PAGE && MIGRATION - -config ARCH_ENABLE_THP_MIGRATION - def_bool y - depends on X86_64 && TRANSPARENT_HUGEPAGE - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER From 66f24fa766e3a5a194a85af98ff454d8d94b59cf Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:25 -0700 Subject: [PATCH 1173/1379] mm: drop redundant ARCH_ENABLE_SPLIT_PMD_PTLOCK ARCH_ENABLE_SPLIT_PMD_PTLOCKS has duplicate definitions on platforms that subscribe it. Drop these redundant definitions and instead just select it on applicable platforms. Link: https://lkml.kernel.org/r/1617259448-22529-6-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Acked-by: Heiko Carstens [s390] Cc: Will Deacon Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Albert Ou Cc: Alexander Viro Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King Cc: Thomas Bogendoerfer Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 4 +--- arch/powerpc/platforms/Kconfig.cputype | 5 +---- arch/s390/Kconfig | 4 +--- arch/x86/Kconfig | 5 +---- 4 files changed, 4 insertions(+), 14 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4297fbca0faa..e12fad2e2a15 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -14,6 +14,7 @@ config ARM64 select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE + select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VIRTUAL @@ -1074,9 +1075,6 @@ config HW_PERF_EVENTS config ARCH_HAS_FILTER_PGPROT def_bool y -config ARCH_ENABLE_SPLIT_PMD_PTLOCK - def_bool y if PGTABLE_LEVELS > 2 - # Supported by clang >= 7.0 config CC_HAVE_SHADOW_CALL_STACK def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 4465b71b2bff..be0e29f18dd4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -97,6 +97,7 @@ config PPC_BOOK3S_64 select PPC_HAVE_PMU_SUPPORT select HAVE_ARCH_TRANSPARENT_HUGEPAGE select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION + select ARCH_ENABLE_PMD_SPLIT_PTLOCK select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_HUGETLBFS select ARCH_SUPPORTS_NUMA_BALANCING @@ -356,10 +357,6 @@ config SPE If in doubt, say Y here. -config ARCH_ENABLE_SPLIT_PMD_PTLOCK - def_bool y - depends on PPC_BOOK3S_64 - config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f8b356550daa..d72989591223 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -62,6 +62,7 @@ config S390 select ARCH_BINFMT_ELF_STATE select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM select ARCH_ENABLE_MEMORY_HOTREMOVE + select ARCH_ENABLE_SPLIT_PMD_PTLOCK select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEVMEM_IS_ALLOWED @@ -628,9 +629,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool y -config ARCH_ENABLE_SPLIT_PMD_PTLOCK - def_bool y - config MAX_PHYSMEM_BITS int "Maximum size of supported physical memory in bits (42-53)" range 42 53 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab581af756cd..0bbc157e4bc7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -63,6 +63,7 @@ config X86 select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG + select ARCH_ENABLE_SPLIT_PMD_PTLOCK if X86_64 || X86_PAE select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE @@ -2435,10 +2436,6 @@ config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA -config ARCH_ENABLE_SPLIT_PMD_PTLOCK - def_bool y - depends on X86_64 || X86_PAE - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER From e8003bf66a7a66d8ae3db2c40b2dca180bf942bb Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 4 May 2021 18:38:29 -0700 Subject: [PATCH 1174/1379] mm: drop redundant HAVE_ARCH_TRANSPARENT_HUGEPAGE HAVE_ARCH_TRANSPARENT_HUGEPAGE has duplicate definitions on platforms that subscribe it. Drop these reduntant definitions and instead just select it on applicable platforms. Link: https://lkml.kernel.org/r/1617259448-22529-7-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Arnd Bergmann Acked-by: Vineet Gupta [arc] Cc: Russell King Cc: Albert Ou Cc: Alexander Viro Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Rich Felker Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/Kconfig | 5 +---- arch/arm/Kconfig | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index fab05f7189c0..2d98501c0897 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -29,6 +29,7 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4 select HAVE_DEBUG_STACKOVERFLOW select HAVE_DEBUG_KMEMLEAK select HAVE_FUTEX_CMPXCHG if FUTEX @@ -84,10 +85,6 @@ config STACKTRACE_SUPPORT def_bool y select STACKTRACE -config HAVE_ARCH_TRANSPARENT_HUGEPAGE - def_bool y - depends on ARC_MMU_V4 - menu "ARC Architecture Configuration" menu "ARC Platform/SoC/Board" diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 13fa7ecf195c..24804f11302d 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -78,6 +78,7 @@ config ARM select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK + select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARM_LPAE select HAVE_ARM_SMCCC if CPU_V7 select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32 select HAVE_CONTEXT_TRACKING @@ -1512,10 +1513,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config HAVE_ARCH_TRANSPARENT_HUGEPAGE - def_bool y - depends on ARM_LPAE - config ARCH_WANT_GENERAL_HUGETLB def_bool y From 2521781c1ebc6d26b7fbe9b7e9614fd2f38affb5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 4 May 2021 18:38:32 -0700 Subject: [PATCH 1175/1379] mm/util.c: reduce mem_dump_obj() object size Simplify the code by using a temporary and reduce the object size by using a single call to pr_cont(). Reverse a test and unindent a block too. $ size mm/util.o* (defconfig x86-64) text data bss dec hex filename 7419 372 40 7831 1e97 mm/util.o.new 7477 372 40 7889 1ed1 mm/util.o.old Link: https://lkml.kernel.org/r/a6e105886338f68afd35f7a13d73bcf06b0cc732.camel@perches.com Signed-off-by: Joe Perches Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/mm/util.c b/mm/util.c index 083c5c417cfc..972e7a0cda5e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -987,22 +987,26 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) */ void mem_dump_obj(void *object) { + const char *type; + if (kmem_valid_obj(object)) { kmem_dump_obj(object); return; } + if (vmalloc_dump_obj(object)) return; - if (!virt_addr_valid(object)) { - if (object == NULL) - pr_cont(" NULL pointer.\n"); - else if (object == ZERO_SIZE_PTR) - pr_cont(" zero-size pointer.\n"); - else - pr_cont(" non-paged memory.\n"); - return; - } - pr_cont(" non-slab/vmalloc memory.\n"); + + if (virt_addr_valid(object)) + type = "non-slab/vmalloc memory"; + else if (object == NULL) + type = "NULL pointer"; + else if (object == ZERO_SIZE_PTR) + type = "zero-size pointer"; + else + type = "non-paged memory"; + + pr_cont(" %s\n", type); } EXPORT_SYMBOL_GPL(mem_dump_obj); #endif From 31454980b8b55b066ba0d6b8267313fcb94ea816 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Tue, 4 May 2021 18:38:35 -0700 Subject: [PATCH 1176/1379] mm/util.c: fix typo s/condtion/condition/ Link: https://lkml.kernel.org/r/20210317033439.3429411-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index 972e7a0cda5e..a8bf17f18a81 100644 --- a/mm/util.c +++ b/mm/util.c @@ -765,7 +765,7 @@ int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, * The deviation of sync_overcommit_as could be big with loose policy * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply - * with the strict "NEVER", and to avoid possible race condtion (even + * with the strict "NEVER", and to avoid possible race condition (even * though user usually won't too frequently do the switching to policy * OVERCOMMIT_NEVER), the switch is done in the following order: * 1. changing the batch From c991ffef7bce85a5d4ebc503c06dfd6dd8e5dc52 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:38 -0700 Subject: [PATCH 1177/1379] mm/gup: don't pin migrated cma pages in movable zone Patch series "prohibit pinning pages in ZONE_MOVABLE", v11. When page is pinned it cannot be moved and its physical address stays the same until pages is unpinned. This is useful functionality to allows userland to implementation DMA access. For example, it is used by vfio in vfio_pin_pages(). However, this functionality breaks memory hotplug/hotremove assumptions that pages in ZONE_MOVABLE can always be migrated. This patch series fixes this issue by forcing new allocations during page pinning to omit ZONE_MOVABLE, and also to migrate any existing pages from ZONE_MOVABLE during pinning. It uses the same scheme logic that is currently used by CMA, and extends the functionality for all allocations. For more information read the discussion [1] about this problem. [1] https://lore.kernel.org/lkml/CA+CK2bBffHBxjmb9jmSKacm0fJMinyt3Nhk8Nx6iudcQSj80_w@mail.gmail.com This patch (of 14): In order not to fragment CMA the pinned pages are migrated. However, they are migrated to ZONE_MOVABLE, which also should not have pinned pages. Remove __GFP_MOVABLE, so pages can be migrated to zones where pinning is allowed. Link: https://lkml.kernel.org/r/20210215161349.246722-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20210215161349.246722-2-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Michal Hocko Cc: David Hildenbrand Cc: Oscar Salvador Cc: Dan Williams Cc: Sasha Levin Cc: Tyler Hicks Cc: Joonsoo Kim Cc: Mike Kravetz Cc: Steven Rostedt (VMware) Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Peter Zijlstra Cc: Mel Gorman Cc: Matthew Wilcox Cc: David Rientjes Cc: John Hubbard Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 71e546e279fc..b63e2c0df3c7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1616,7 +1616,7 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, long ret = nr_pages; struct migration_target_control mtc = { .nid = NUMA_NO_NODE, - .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN, + .gfp_mask = GFP_USER | __GFP_NOWARN, }; check_again: From 83c02c23d0747a7bdcd71f99a538aacec94b146c Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:42 -0700 Subject: [PATCH 1178/1379] mm/gup: check every subpage of a compound page during isolation When pages are isolated in check_and_migrate_movable_pages() we skip compound number of pages at a time. However, as Jason noted, it is not necessary correct that pages[i] corresponds to the pages that we skipped. This is because it is possible that the addresses in this range had split_huge_pmd()/split_huge_pud(), and these functions do not update the compound page metadata. The problem can be reproduced if something like this occurs: 1. User faulted huge pages. 2. split_huge_pmd() was called for some reason 3. User has unmapped some sub-pages in the range 4. User tries to longterm pin the addresses. The resulting pages[i] might end-up having pages which are not compound size page aligned. Link: https://lkml.kernel.org/r/20210215161349.246722-3-pasha.tatashin@soleen.com Fixes: aa712399c1e8 ("mm/gup: speed up check_and_migrate_cma_pages() on huge page") Signed-off-by: Pavel Tatashin Reported-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index b63e2c0df3c7..16015718c0df 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1609,26 +1609,23 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned int gup_flags) { unsigned long i; - unsigned long step; bool drain_allow = true; bool migrate_allow = true; LIST_HEAD(cma_page_list); long ret = nr_pages; + struct page *prev_head, *head; struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; check_again: - for (i = 0; i < nr_pages;) { - - struct page *head = compound_head(pages[i]); - - /* - * gup may start from a tail page. Advance step by the left - * part. - */ - step = compound_nr(head) - (pages[i] - head); + prev_head = NULL; + for (i = 0; i < nr_pages; i++) { + head = compound_head(pages[i]); + if (head == prev_head) + continue; + prev_head = head; /* * If we get a page from the CMA zone, since we are going to * be pinning these entries, we might as well move them out @@ -1652,8 +1649,6 @@ check_again: } } } - - i += step; } if (!list_empty(&cma_page_list)) { From f0f4463837da17a89d965dcbe4e411629dbcf308 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:46 -0700 Subject: [PATCH 1179/1379] mm/gup: return an error on migration failure When migration failure occurs, we still pin pages, which means that we may pin CMA movable pages which should never be the case. Instead return an error without pinning pages when migration failure happens. No need to retry migrating, because migrate_pages() already retries 10 times. Link: https://lkml.kernel.org/r/20210215161349.246722-4-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: Jason Gunthorpe Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 16015718c0df..da08c582c216 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1610,7 +1610,6 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, { unsigned long i; bool drain_allow = true; - bool migrate_allow = true; LIST_HEAD(cma_page_list); long ret = nr_pages; struct page *prev_head, *head; @@ -1661,17 +1660,15 @@ check_again: for (i = 0; i < nr_pages; i++) put_page(pages[i]); - if (migrate_pages(&cma_page_list, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) { - /* - * some of the pages failed migration. Do get_user_pages - * without migration. - */ - migrate_allow = false; - + ret = migrate_pages(&cma_page_list, alloc_migration_target, + NULL, (unsigned long)&mtc, MIGRATE_SYNC, + MR_CONTIG_RANGE); + if (ret) { if (!list_empty(&cma_page_list)) putback_movable_pages(&cma_page_list); + return ret > 0 ? -ENOMEM : ret; } + /* * We did migrate all the pages, Try to get the page references * again migrating any new CMA pages which we failed to isolate @@ -1681,7 +1678,7 @@ check_again: pages, vmas, NULL, gup_flags); - if ((ret > 0) && migrate_allow) { + if (ret > 0) { nr_pages = ret; drain_allow = true; goto check_again; From 6e7f34ebb8d25d71ce7f4580ba3cbfc10b895580 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:49 -0700 Subject: [PATCH 1180/1379] mm/gup: check for isolation errors It is still possible that we pin movable CMA pages if there are isolation errors and cma_page_list stays empty when we check again. Check for isolation errors, and return success only when there are no isolation errors, and cma_page_list is empty after checking. Because isolation errors are transient, we retry indefinitely. Link: https://lkml.kernel.org/r/20210215161349.246722-5-pasha.tatashin@soleen.com Fixes: 9a4e9f3b2d73 ("mm: update get_user_pages_longterm to migrate pages allocated from CMA region") Signed-off-by: Pavel Tatashin Reviewed-by: Jason Gunthorpe Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 60 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index da08c582c216..3a00f6a8ffd6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1608,8 +1608,8 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - unsigned long i; - bool drain_allow = true; + unsigned long i, isolation_error_count; + bool drain_allow; LIST_HEAD(cma_page_list); long ret = nr_pages; struct page *prev_head, *head; @@ -1620,6 +1620,8 @@ static long check_and_migrate_cma_pages(struct mm_struct *mm, check_again: prev_head = NULL; + isolation_error_count = 0; + drain_allow = true; for (i = 0; i < nr_pages; i++) { head = compound_head(pages[i]); if (head == prev_head) @@ -1631,25 +1633,35 @@ check_again: * of the CMA zone if possible. */ if (is_migrate_cma_page(head)) { - if (PageHuge(head)) - isolate_huge_page(head, &cma_page_list); - else { + if (PageHuge(head)) { + if (!isolate_huge_page(head, &cma_page_list)) + isolation_error_count++; + } else { if (!PageLRU(head) && drain_allow) { lru_add_drain_all(); drain_allow = false; } - if (!isolate_lru_page(head)) { - list_add_tail(&head->lru, &cma_page_list); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + - page_is_file_lru(head), - thp_nr_pages(head)); + if (isolate_lru_page(head)) { + isolation_error_count++; + continue; } + list_add_tail(&head->lru, &cma_page_list); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + + page_is_file_lru(head), + thp_nr_pages(head)); } } } + /* + * If list is empty, and no isolation errors, means that all pages are + * in the correct zone. + */ + if (list_empty(&cma_page_list) && !isolation_error_count) + return ret; + if (!list_empty(&cma_page_list)) { /* * drop the above get_user_pages reference. @@ -1669,23 +1681,19 @@ check_again: return ret > 0 ? -ENOMEM : ret; } - /* - * We did migrate all the pages, Try to get the page references - * again migrating any new CMA pages which we failed to isolate - * earlier. - */ - ret = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, NULL, - gup_flags); - - if (ret > 0) { - nr_pages = ret; - drain_allow = true; - goto check_again; - } + /* We unpinned pages before migration, pin them again */ + ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + if (ret <= 0) + return ret; + nr_pages = ret; } - return ret; + /* + * check again because pages were unpinned, and we also might have + * had isolation errors and need more pages to migrate. + */ + goto check_again; } #else static long check_and_migrate_cma_pages(struct mm_struct *mm, From 1a08ae36cf8b5f26d0c64ebfe46f8eb07ea0b678 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:53 -0700 Subject: [PATCH 1181/1379] mm cma: rename PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN PF_MEMALLOC_NOCMA is used ot guarantee that the allocator will not return pages that might belong to CMA region. This is currently used for long term gup to make sure that such pins are not going to be done on any CMA pages. When PF_MEMALLOC_NOCMA has been introduced we haven't realized that it is focusing on CMA pages too much and that there is larger class of pages that need the same treatment. MOVABLE zone cannot contain any long term pins as well so it makes sense to reuse and redefine this flag for that usecase as well. Rename the flag to PF_MEMALLOC_PIN which defines an allocation context which can only get pages suitable for long-term pins. Also rename: memalloc_nocma_save()/memalloc_nocma_restore to memalloc_pin_save()/memalloc_pin_restore() and make the new functions common. [rppt@linux.ibm.com: fix renaming of PF_MEMALLOC_NOCMA to PF_MEMALLOC_PIN] Link: https://lkml.kernel.org/r/20210331163816.11517-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20210215161349.246722-6-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Acked-by: Michal Hocko Signed-off-by: Mike Rapoport Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- include/linux/sched/mm.h | 21 +++++---------------- mm/gup.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/page_alloc.c | 4 ++-- 5 files changed, 12 insertions(+), 23 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c25c8e67030..d2c881384517 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1583,7 +1583,7 @@ extern struct pid *cad_pid; #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ +#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 90b2a0bce11c..ae654819e8aa 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -271,29 +271,18 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) current->flags = (current->flags & ~PF_MEMALLOC) | flags; } -#ifdef CONFIG_CMA -static inline unsigned int memalloc_nocma_save(void) +static inline unsigned int memalloc_pin_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; + unsigned int flags = current->flags & PF_MEMALLOC_PIN; - current->flags |= PF_MEMALLOC_NOCMA; + current->flags |= PF_MEMALLOC_PIN; return flags; } -static inline void memalloc_nocma_restore(unsigned int flags) +static inline void memalloc_pin_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; + current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; } -#else -static inline unsigned int memalloc_nocma_save(void) -{ - return 0; -} - -static inline void memalloc_nocma_restore(unsigned int flags) -{ -} -#endif #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); diff --git a/mm/gup.c b/mm/gup.c index 3a00f6a8ffd6..a1eff7ad31da 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1722,7 +1722,7 @@ static long __gup_longterm_locked(struct mm_struct *mm, long rc; if (gup_flags & FOLL_LONGTERM) - flags = memalloc_nocma_save(); + flags = memalloc_pin_save(); rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); @@ -1731,7 +1731,7 @@ static long __gup_longterm_locked(struct mm_struct *mm, if (rc > 0) rc = check_and_migrate_cma_pages(mm, start, rc, pages, vmas, gup_flags); - memalloc_nocma_restore(flags); + memalloc_pin_restore(flags); } return rc; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 533e5a26e437..60dc197a4417 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1079,11 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; - bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); + bool pin = !!(current->flags & PF_MEMALLOC_PIN); lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (nocma && is_migrate_cma_page(page)) + if (pin && is_migrate_cma_page(page)) continue; if (PageHWPoison(page)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d12299c08b95..c2fc6a64bef9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3865,8 +3865,8 @@ static inline unsigned int current_alloc_flags(gfp_t gfp_mask, #ifdef CONFIG_CMA unsigned int pflags = current->flags; - if (!(pflags & PF_MEMALLOC_NOCMA) && - gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (!(pflags & PF_MEMALLOC_PIN) && + gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif From da6df1b0fcfa97b2e3394df8622128bb810e1093 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:38:57 -0700 Subject: [PATCH 1182/1379] mm: apply per-task gfp constraints in fast path Function current_gfp_context() is called after fast path. However, soon we will add more constraints which will also limit zones based on context. Move this call into fast path, and apply the correct constraints for all allocations. Also update .reclaim_idx based on value returned by current_gfp_context() because it soon will modify the allowed zones. Note: With this patch we will do one extra current->flags load during fast path, but we already load current->flags in fast-path: __alloc_pages() prepare_alloc_pages() current_alloc_flags(gfp_mask, *alloc_flags); Later, when we add the zone constrain logic to current_gfp_context() we will be able to remove current->flags load from current_alloc_flags, and therefore return fast-path to the current performance level. Link: https://lkml.kernel.org/r/20210215161349.246722-7-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2fc6a64bef9..3c55eaafede1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5180,6 +5180,13 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, } gfp &= gfp_allowed_mask; + /* + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * resp. GFP_NOIO which has to be inherited for all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. + */ + gfp = current_gfp_context(gfp); alloc_gfp = gfp; if (!prepare_alloc_pages(gfp, order, preferred_nid, nodemask, &ac, &alloc_gfp, &alloc_flags)) @@ -5196,13 +5203,7 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, if (likely(page)) goto out; - /* - * Apply scoped allocation constraints. This is mainly about GFP_NOFS - * resp. GFP_NOIO which has to be inherited for all allocation requests - * from a particular context which has been marked by - * memalloc_no{fs,io}_{save,restore}. - */ - alloc_gfp = current_gfp_context(gfp); + alloc_gfp = gfp; ac.spread_dirty_pages = false; /* From 8e3560d963d22ba41857f48e4114ce80373144ea Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:00 -0700 Subject: [PATCH 1183/1379] mm: honor PF_MEMALLOC_PIN for all movable pages PF_MEMALLOC_PIN is only honored for CMA pages, extend this flag to work for any allocations from ZONE_MOVABLE by removing __GFP_MOVABLE from gfp_mask when this flag is passed in the current context. Add is_pinnable_page() to return true if page is in a pinnable page. A pinnable page is not in ZONE_MOVABLE and not of MIGRATE_CMA type. Link: https://lkml.kernel.org/r/20210215161349.246722-8-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 18 ++++++++++++++++++ include/linux/sched/mm.h | 6 +++++- mm/hugetlb.c | 2 +- mm/page_alloc.c | 20 +++++++++----------- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1dbb53c44243..d0e628f511e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1141,6 +1141,11 @@ static inline bool is_zone_device_page(const struct page *page) } #endif +static inline bool is_zone_movable_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_MOVABLE; +} + #ifdef CONFIG_DEV_PAGEMAP_OPS void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); @@ -1550,6 +1555,19 @@ static inline unsigned long page_to_section(const struct page *page) } #endif +/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ +#ifdef CONFIG_MIGRATION +static inline bool is_pinnable_page(struct page *page) +{ + return !is_zone_movable_page(page) && !is_migrate_cma_page(page); +} +#else +static inline bool is_pinnable_page(struct page *page) +{ + return true; +} +#endif + static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index ae654819e8aa..e24b1fe348e3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -151,12 +151,13 @@ static inline bool in_vfork(struct task_struct *tsk) * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS + * PF_MEMALLOC_PIN implies !GFP_MOVABLE */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence @@ -165,6 +166,9 @@ static inline gfp_t current_gfp_context(gfp_t flags) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + + if (pflags & PF_MEMALLOC_PIN) + flags &= ~__GFP_MOVABLE; } return flags; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 60dc197a4417..629aa4c2259c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1083,7 +1083,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (pin && is_migrate_cma_page(page)) + if (pin && !is_pinnable_page(page)) continue; if (PageHWPoison(page)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c55eaafede1..81db38926266 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3859,16 +3859,13 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) return alloc_flags; } -static inline unsigned int current_alloc_flags(gfp_t gfp_mask, - unsigned int alloc_flags) +/* Must be called after current_gfp_context() which can change gfp_mask */ +static inline unsigned int gfp_to_alloc_flags_cma(gfp_t gfp_mask, + unsigned int alloc_flags) { #ifdef CONFIG_CMA - unsigned int pflags = current->flags; - - if (!(pflags & PF_MEMALLOC_PIN) && - gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; - #endif return alloc_flags; } @@ -4526,7 +4523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - alloc_flags = current_alloc_flags(gfp_mask, alloc_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); return alloc_flags; } @@ -4828,7 +4825,7 @@ retry: reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = current_alloc_flags(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); /* * Reset the nodemask and zonelist iterators if memory policies can be @@ -4997,7 +4994,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags); + *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -5184,7 +5181,8 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, * Apply scoped allocation constraints. This is mainly about GFP_NOFS * resp. GFP_NOIO which has to be inherited for all allocation requests * from a particular context which has been marked by - * memalloc_no{fs,io}_{save,restore}. + * memalloc_no{fs,io}_{save,restore}. And PF_MEMALLOC_PIN which ensures + * movable zones are not used during allocation. */ gfp = current_gfp_context(gfp); alloc_gfp = gfp; From 9afaf30f7a1aab2022961715a66f644275b8daec Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:04 -0700 Subject: [PATCH 1184/1379] mm/gup: do not migrate zero page On some platforms ZERO_PAGE(0) might end-up in a movable zone. Do not migrate zero page in gup during longterm pinning as migration of zero page is not allowed. For example, in x86 QEMU with 16G of memory and kernelcore=5G parameter, I see the following: Boot#1: zero_pfn 0x48a8d zero_pfn zone: ZONE_DMA32 Boot#2: zero_pfn 0x20168d zero_pfn zone: ZONE_MOVABLE On x86, empty_zero_page is declared in .bss and depending on the loader may end up in different physical locations during boots. Also, move is_zero_pfn() my_zero_pfn() functions under CONFIG_MMU, because zero_pfn that they are using is declared in memory.c which is compiled with CONFIG_MMU. Link: https://lkml.kernel.org/r/20210215161349.246722-9-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- include/linux/mmzone.h | 4 ++++ include/linux/pgtable.h | 12 ++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d0e628f511e4..76e27ebb28a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1559,7 +1559,8 @@ static inline unsigned long page_to_section(const struct page *page) #ifdef CONFIG_MIGRATION static inline bool is_pinnable_page(struct page *page) { - return !is_zone_movable_page(page) && !is_migrate_cma_page(page); + return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) || + is_zero_pfn(page_to_pfn(page)); } #else static inline bool is_pinnable_page(struct page *page) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3b2205741048..92b44149d5b9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -427,6 +427,10 @@ enum zone_type { * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). + * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create + * situations where ZERO_PAGE(0) which is allocated differently + * on different platforms may end up in a movable zone. ZERO_PAGE(0) + * cannot be migrated. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e772392a379..2194a9cd885c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1111,6 +1111,7 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif +#ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { @@ -1134,6 +1135,17 @@ static inline unsigned long my_zero_pfn(unsigned long addr) return zero_pfn; } #endif +#else +static inline int is_zero_pfn(unsigned long pfn) +{ + return 0; +} + +static inline unsigned long my_zero_pfn(unsigned long addr) +{ + return 0; +} +#endif /* CONFIG_MMU */ #ifdef CONFIG_MMU From d1e153fea2a8940273174fc17733c44323d35cd5 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:08 -0700 Subject: [PATCH 1185/1379] mm/gup: migrate pinned pages out of movable zone We should not pin pages in ZONE_MOVABLE. Currently, we do not pin only movable CMA pages. Generalize the function that migrates CMA pages to migrate all movable pages. Use is_pinnable_page() to check which pages need to be migrated Link: https://lkml.kernel.org/r/20210215161349.246722-10-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 1 + include/linux/mmzone.h | 9 ++++- include/trace/events/migrate.h | 3 +- mm/gup.c | 67 +++++++++++++++++----------------- 4 files changed, 44 insertions(+), 36 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 175ef15ae9e8..4bb4e519e3f5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -27,6 +27,7 @@ enum migrate_reason { MR_MEMPOLICY_MBIND, MR_NUMA_MISPLACED, MR_CONTIG_RANGE, + MR_LONGTERM_PIN, MR_TYPES }; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 92b44149d5b9..e8922a67d1a4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -407,8 +407,13 @@ enum zone_type { * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might - * essentially turn such pages unmovable. Memory offlining might - * retry a long time. + * essentially turn such pages unmovable. Therefore, we do not allow + * pinning long-term pages in ZONE_MOVABLE. When pages are pinned and + * faulted, they come from the right zone right away. However, it is + * still possible that address space already has pages in + * ZONE_MOVABLE at the time when pages are pinned (i.e. user has + * touches that memory before pinning). In such case we migrate them + * to a different zone. When migration fails - pinning fails. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index f2c990603888..9fb2a3bbcdfb 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -20,7 +20,8 @@ EM( MR_SYSCALL, "syscall_or_cpuset") \ EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ EM( MR_NUMA_MISPLACED, "numa_misplaced") \ - EMe(MR_CONTIG_RANGE, "contig_range") + EM( MR_CONTIG_RANGE, "contig_range") \ + EMe(MR_LONGTERM_PIN, "longterm_pin") /* * First define the enums in the above macros to be exported to userspace diff --git a/mm/gup.c b/mm/gup.c index a1eff7ad31da..4bc57420f535 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -87,11 +87,12 @@ __maybe_unused struct page *try_grab_compound_head(struct page *page, int orig_refs = refs; /* - * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast - * path, so fail and let the caller fall back to the slow path. + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. */ - if (unlikely(flags & FOLL_LONGTERM) && - is_migrate_cma_page(page)) + if (unlikely((flags & FOLL_LONGTERM) && + !is_pinnable_page(page))) return NULL; /* @@ -1600,17 +1601,17 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -#ifdef CONFIG_CMA -static long check_and_migrate_cma_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int gup_flags) +#ifdef CONFIG_MIGRATION +static long check_and_migrate_movable_pages(struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { unsigned long i, isolation_error_count; bool drain_allow; - LIST_HEAD(cma_page_list); + LIST_HEAD(movable_page_list); long ret = nr_pages; struct page *prev_head, *head; struct migration_target_control mtc = { @@ -1628,13 +1629,12 @@ check_again: continue; prev_head = head; /* - * If we get a page from the CMA zone, since we are going to - * be pinning these entries, we might as well move them out - * of the CMA zone if possible. + * If we get a movable page, since we are going to be pinning + * these entries, try to move them out if possible. */ - if (is_migrate_cma_page(head)) { + if (!is_pinnable_page(head)) { if (PageHuge(head)) { - if (!isolate_huge_page(head, &cma_page_list)) + if (!isolate_huge_page(head, &movable_page_list)) isolation_error_count++; } else { if (!PageLRU(head) && drain_allow) { @@ -1646,7 +1646,7 @@ check_again: isolation_error_count++; continue; } - list_add_tail(&head->lru, &cma_page_list); + list_add_tail(&head->lru, &movable_page_list); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_lru(head), @@ -1659,10 +1659,10 @@ check_again: * If list is empty, and no isolation errors, means that all pages are * in the correct zone. */ - if (list_empty(&cma_page_list) && !isolation_error_count) + if (list_empty(&movable_page_list) && !isolation_error_count) return ret; - if (!list_empty(&cma_page_list)) { + if (!list_empty(&movable_page_list)) { /* * drop the above get_user_pages reference. */ @@ -1672,12 +1672,12 @@ check_again: for (i = 0; i < nr_pages; i++) put_page(pages[i]); - ret = migrate_pages(&cma_page_list, alloc_migration_target, + ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_CONTIG_RANGE); + MR_LONGTERM_PIN); if (ret) { - if (!list_empty(&cma_page_list)) - putback_movable_pages(&cma_page_list); + if (!list_empty(&movable_page_list)) + putback_movable_pages(&movable_page_list); return ret > 0 ? -ENOMEM : ret; } @@ -1696,16 +1696,16 @@ check_again: goto check_again; } #else -static long check_and_migrate_cma_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int gup_flags) +static long check_and_migrate_movable_pages(struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } -#endif /* CONFIG_CMA */ +#endif /* CONFIG_MIGRATION */ /* * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which @@ -1729,8 +1729,9 @@ static long __gup_longterm_locked(struct mm_struct *mm, if (gup_flags & FOLL_LONGTERM) { if (rc > 0) - rc = check_and_migrate_cma_pages(mm, start, rc, pages, - vmas, gup_flags); + rc = check_and_migrate_movable_pages(mm, start, rc, + pages, vmas, + gup_flags); memalloc_pin_restore(flags); } return rc; From fa965fd54827a6b6967602051736da9c163b79b7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:12 -0700 Subject: [PATCH 1186/1379] memory-hotplug.rst: add a note about ZONE_MOVABLE and page pinning Document the special handling of page pinning when ZONE_MOVABLE present. Link: https://lkml.kernel.org/r/20210215161349.246722-11-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Suggested-by: David Hildenbrand Acked-by: Michal Hocko Cc: Dan Williams Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/memory-hotplug.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 5307f90738aa..05d51d2d8beb 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -357,6 +357,15 @@ creates ZONE_MOVABLE as following. Unfortunately, there is no information to show which memory block belongs to ZONE_MOVABLE. This is TBD. +.. note:: + Techniques that rely on long-term pinnings of memory (especially, RDMA and + vfio) are fundamentally problematic with ZONE_MOVABLE and, therefore, memory + hot remove. Pinned pages cannot reside on ZONE_MOVABLE, to guarantee that + memory can still get hot removed - be aware that pinning can fail even if + there is plenty of free memory in ZONE_MOVABLE. In addition, using + ZONE_MOVABLE might make page pinning more expensive, because pages have to be + migrated off that zone first. + .. _memory_hotplug_how_to_offline_memory: How to offline memory From 24dc20c75f937b8f5c432e38275e70a1611766e9 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:15 -0700 Subject: [PATCH 1187/1379] mm/gup: change index type to long as it counts pages In __get_user_pages_locked() i counts number of pages which should be long, as long is used in all other places to contain number of pages, and 32-bit becomes increasingly small for handling page count proportional values. Link: https://lkml.kernel.org/r/20210215161349.246722-12-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index 4bc57420f535..91629b5b0636 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1528,7 +1528,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, { struct vm_area_struct *vma; unsigned long vm_flags; - int i; + long i; /* calculate required read or write permissions. * If FOLL_FORCE is set, we only require the "MAY" flags. From f68749ec342b5f2c18b3af3435714d9f653736c3 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:19 -0700 Subject: [PATCH 1188/1379] mm/gup: longterm pin migration cleanup When pages are longterm pinned, we must migrated them out of movable zone. The function that migrates them has a hidden loop with goto. The loop is to retry on isolation failures, and after successful migration. Make this code better by moving this loop to the caller. Link: https://lkml.kernel.org/r/20210215161349.246722-13-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: Jason Gunthorpe Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: John Hubbard Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 93 ++++++++++++++++++++++---------------------------------- 1 file changed, 37 insertions(+), 56 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 91629b5b0636..aa09535cf4d4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1602,27 +1602,28 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ #ifdef CONFIG_MIGRATION -static long check_and_migrate_movable_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, +/* + * Check whether all pages are pinnable, if so return number of pages. If some + * pages are not pinnable, migrate them, and unpin all pages. Return zero if + * pages were migrated, or if some pages were not successfully isolated. + * Return negative error if migration fails. + */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, unsigned int gup_flags) { - unsigned long i, isolation_error_count; - bool drain_allow; + unsigned long i; + unsigned long isolation_error_count = 0; + bool drain_allow = true; LIST_HEAD(movable_page_list); - long ret = nr_pages; - struct page *prev_head, *head; + long ret = 0; + struct page *prev_head = NULL; + struct page *head; struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; -check_again: - prev_head = NULL; - isolation_error_count = 0; - drain_allow = true; for (i = 0; i < nr_pages; i++) { head = compound_head(pages[i]); if (head == prev_head) @@ -1660,47 +1661,27 @@ check_again: * in the correct zone. */ if (list_empty(&movable_page_list) && !isolation_error_count) - return ret; + return nr_pages; + if (gup_flags & FOLL_PIN) { + unpin_user_pages(pages, nr_pages); + } else { + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + } if (!list_empty(&movable_page_list)) { - /* - * drop the above get_user_pages reference. - */ - if (gup_flags & FOLL_PIN) - unpin_user_pages(pages, nr_pages); - else - for (i = 0; i < nr_pages; i++) - put_page(pages[i]); - ret = migrate_pages(&movable_page_list, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_LONGTERM_PIN); - if (ret) { - if (!list_empty(&movable_page_list)) - putback_movable_pages(&movable_page_list); - return ret > 0 ? -ENOMEM : ret; - } - - /* We unpinned pages before migration, pin them again */ - ret = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, gup_flags); - if (ret <= 0) - return ret; - nr_pages = ret; + if (ret && !list_empty(&movable_page_list)) + putback_movable_pages(&movable_page_list); } - /* - * check again because pages were unpinned, and we also might have - * had isolation errors and need more pages to migrate. - */ - goto check_again; + return ret > 0 ? -ENOMEM : ret; } #else -static long check_and_migrate_movable_pages(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, +static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas, unsigned int gup_flags) { return nr_pages; @@ -1718,22 +1699,22 @@ static long __gup_longterm_locked(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - unsigned long flags = 0; + unsigned int flags; long rc; - if (gup_flags & FOLL_LONGTERM) - flags = memalloc_pin_save(); + if (!(gup_flags & FOLL_LONGTERM)) + return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + flags = memalloc_pin_save(); + do { + rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, + NULL, gup_flags); + if (rc <= 0) + break; + rc = check_and_migrate_movable_pages(rc, pages, gup_flags); + } while (!rc); + memalloc_pin_restore(flags); - rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, - gup_flags); - - if (gup_flags & FOLL_LONGTERM) { - if (rc > 0) - rc = check_and_migrate_movable_pages(mm, start, rc, - pages, vmas, - gup_flags); - memalloc_pin_restore(flags); - } return rc; } From 79dbf135e2481eaa77b172d88c343bf85e021545 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:23 -0700 Subject: [PATCH 1189/1379] selftests/vm: gup_test: fix test flag In gup_test both gup_flags and test_flags use the same flags field. This is broken. Farther, in the actual gup_test.c all the passed gup_flags are erased and unconditionally replaced with FOLL_WRITE. Which means that test_flags are ignored, and code like this always performs pin dump test: 155 if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) 156 nr = pin_user_pages(addr, nr, gup->flags, 157 pages + i, NULL); 158 else 159 nr = get_user_pages(addr, nr, gup->flags, 160 pages + i, NULL); 161 break; Add a new test_flags field, to allow raw gup_flags to work. Add a new subcommand for DUMP_USER_PAGES_TEST to specify that pin test should be performed. Remove unconditional overwriting of gup_flags via FOLL_WRITE. But, preserve the previous behaviour where FOLL_WRITE was the default flag, and add a new option "-W" to unset FOLL_WRITE. Rename flags with gup_flags. With the fix, dump works like this: root@virtme:/# gup_test -c ---- page #0, starting from user virt addr: 0x7f8acb9e4000 page:00000000d3d2ee27 refcount:2 mapcount:1 mapping:0000000000000000 index:0x0 pfn:0x100bcf anon flags: 0x300000000080016(referenced|uptodate|lru|swapbacked) raw: 0300000000080016 ffffd0e204021608 ffffd0e208df2e88 ffff8ea04243ec61 raw: 0000000000000000 0000000000000000 0000000200000000 0000000000000000 page dumped because: gup_test: dump_pages() test DUMP_USER_PAGES_TEST: done root@virtme:/# gup_test -c -p ---- page #0, starting from user virt addr: 0x7fd19701b000 page:00000000baed3c7d refcount:1025 mapcount:1 mapping:0000000000000000 index:0x0 pfn:0x108008 anon flags: 0x300000000080014(uptodate|lru|swapbacked) raw: 0300000000080014 ffffd0e204200188 ffffd0e205e09088 ffff8ea04243ee71 raw: 0000000000000000 0000000000000000 0000040100000000 0000000000000000 page dumped because: gup_test: dump_pages() test DUMP_USER_PAGES_TEST: done Refcount shows the difference between pin vs no-pin case. Also change type of nr from int to long, as it counts number of pages. Link: https://lkml.kernel.org/r/20210215161349.246722-14-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup_test.c | 23 ++++++++++------------- mm/gup_test.h | 3 ++- tools/testing/selftests/vm/gup_test.c | 15 +++++++++++---- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/mm/gup_test.c b/mm/gup_test.c index e3cf78e5873e..a6ed1c877679 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -94,7 +94,7 @@ static int __gup_test_ioctl(unsigned int cmd, { ktime_t start_time, end_time; unsigned long i, nr_pages, addr, next; - int nr; + long nr; struct page **pages; int ret = 0; bool needs_mmap_lock = @@ -126,37 +126,34 @@ static int __gup_test_ioctl(unsigned int cmd, nr = (next - addr) / PAGE_SIZE; } - /* Filter out most gup flags: only allow a tiny subset here: */ - gup->flags &= FOLL_WRITE; - switch (cmd) { case GUP_FAST_BENCHMARK: - nr = get_user_pages_fast(addr, nr, gup->flags, + nr = get_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case GUP_BASIC_TEST: - nr = get_user_pages(addr, nr, gup->flags, pages + i, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_FAST_BENCHMARK: - nr = pin_user_pages_fast(addr, nr, gup->flags, + nr = pin_user_pages_fast(addr, nr, gup->gup_flags, pages + i); break; case PIN_BASIC_TEST: - nr = pin_user_pages(addr, nr, gup->flags, pages + i, + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; case PIN_LONGTERM_BENCHMARK: nr = pin_user_pages(addr, nr, - gup->flags | FOLL_LONGTERM, + gup->gup_flags | FOLL_LONGTERM, pages + i, NULL); break; case DUMP_USER_PAGES_TEST: - if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) - nr = pin_user_pages(addr, nr, gup->flags, + if (gup->test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) + nr = pin_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); else - nr = get_user_pages(addr, nr, gup->flags, + nr = get_user_pages(addr, nr, gup->gup_flags, pages + i, NULL); break; default: @@ -187,7 +184,7 @@ static int __gup_test_ioctl(unsigned int cmd, start_time = ktime_get(); - put_back_pages(cmd, pages, nr_pages, gup->flags); + put_back_pages(cmd, pages, nr_pages, gup->test_flags); end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); diff --git a/mm/gup_test.h b/mm/gup_test.h index 90a6713d50eb..887ac1d5f5bc 100644 --- a/mm/gup_test.h +++ b/mm/gup_test.h @@ -21,7 +21,8 @@ struct gup_test { __u64 addr; __u64 size; __u32 nr_pages_per_call; - __u32 flags; + __u32 gup_flags; + __u32 test_flags; /* * Each non-zero entry is the number of the page (1-based: first page is * page 1, so that zero entries mean "do nothing") from the .addr base. diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 6c6336dd3b7f..943cc2608dc2 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -37,13 +37,13 @@ int main(int argc, char **argv) { struct gup_test gup = { 0 }; unsigned long size = 128 * MB; - int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0; + int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1; unsigned long cmd = GUP_FAST_BENCHMARK; int flags = MAP_PRIVATE; char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwSH")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHp")) != -1) { switch (opt) { case 'a': cmd = PIN_FAST_BENCHMARK; @@ -65,9 +65,13 @@ int main(int argc, char **argv) */ gup.which_pages[0] = 1; break; + case 'p': + /* works only with DUMP_USER_PAGES_TEST */ + gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN; + break; case 'F': /* strtol, so you can pass flags in hex form */ - gup.flags = strtol(optarg, 0, 0); + gup.gup_flags = strtol(optarg, 0, 0); break; case 'm': size = atoi(optarg) * MB; @@ -93,6 +97,9 @@ int main(int argc, char **argv) case 'w': write = 1; break; + case 'W': + write = 0; + break; case 'f': file = optarg; break; @@ -140,7 +147,7 @@ int main(int argc, char **argv) gup.nr_pages_per_call = nr_pages; if (write) - gup.flags |= FOLL_WRITE; + gup.gup_flags |= FOLL_WRITE; fd = open("/sys/kernel/debug/gup_test", O_RDWR); if (fd == -1) { From e44605a8b1aa13d892addc59ec3d416cb186c77b Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 4 May 2021 18:39:27 -0700 Subject: [PATCH 1190/1379] selftests/vm: gup_test: test faulting in kernel, and verify pinnable pages When pages are pinned they can be faulted in userland and migrated, and they can be faulted right in kernel without migration. In either case, the pinned pages must end-up being pinnable (not movable). Add a new test to gup_test, to help verify that the gup/pup (get_user_pages() / pin_user_pages()) behavior with respect to pinnable and movable pages is reasonable and correct. Specifically, provide a way to: 1) Verify that only "pinnable" pages are pinned. This is checked automatically for you. 2) Verify that gup/pup performance is reasonable. This requires comparing benchmarks between doing gup/pup on pages that have been pre-faulted in from user space, vs. doing gup/pup on pages that are not faulted in until gup/pup time (via FOLL_TOUCH). This decision is controlled with the new -z command line option. Link: https://lkml.kernel.org/r/20210215161349.246722-15-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: David Rientjes Cc: Ingo Molnar Cc: Ira Weiny Cc: James Morris Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Mike Kravetz Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt (VMware) Cc: Tyler Hicks Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup_test.c | 6 ++++++ tools/testing/selftests/vm/gup_test.c | 23 +++++++++++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/mm/gup_test.c b/mm/gup_test.c index a6ed1c877679..d974dec19e1c 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -52,6 +52,12 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, dump_page(page, "gup_test failure"); break; + } else if (cmd == PIN_LONGTERM_BENCHMARK && + WARN(!is_pinnable_page(page), + "pages[%lu] is NOT pinnable but pinned\n", + i)) { + dump_page(page, "gup_test failure"); + break; } } break; diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 943cc2608dc2..1e662d59c502 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -13,6 +13,7 @@ /* Just the flags we need, copied from mm.h: */ #define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ static char *cmd_to_str(unsigned long cmd) { @@ -39,11 +40,11 @@ int main(int argc, char **argv) unsigned long size = 128 * MB; int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 1; unsigned long cmd = GUP_FAST_BENCHMARK; - int flags = MAP_PRIVATE; + int flags = MAP_PRIVATE, touch = 0; char *file = "/dev/zero"; char *p; - while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHp")) != -1) { + while ((opt = getopt(argc, argv, "m:r:n:F:f:abctTLUuwWSHpz")) != -1) { switch (opt) { case 'a': cmd = PIN_FAST_BENCHMARK; @@ -110,6 +111,10 @@ int main(int argc, char **argv) case 'H': flags |= (MAP_HUGETLB | MAP_ANONYMOUS); break; + case 'z': + /* fault pages in gup, do not fault in userland */ + touch = 1; + break; default: return -1; } @@ -167,8 +172,18 @@ int main(int argc, char **argv) else if (thp == 0) madvise(p, size, MADV_NOHUGEPAGE); - for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) - p[0] = 0; + /* + * FOLL_TOUCH, in gup_test, is used as an either/or case: either + * fault pages in from the kernel via FOLL_TOUCH, or fault them + * in here, from user space. This allows comparison of performance + * between those two cases. + */ + if (touch) { + gup.gup_flags |= FOLL_TOUCH; + } else { + for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE) + p[0] = 0; + } /* Only report timing information on the *_BENCHMARK commands: */ if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || From 8ca559132a2d9b56732d35e2b947af96acb9b80b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 4 May 2021 18:39:30 -0700 Subject: [PATCH 1191/1379] mm/memory_hotplug: remove broken locking of zone PCP structures during hot remove zone_pcp_reset allegedly protects against a race with drain_pages using local_irq_save but this is bogus. local_irq_save only operates on the local CPU. If memory hotplug is running on CPU A and drain_pages is running on CPU B, disabling IRQs on CPU A does not affect CPU B and offers no protection. This patch deletes IRQ disable/enable on the grounds that IRQs protect nothing and assumes the existing hotplug paths guarantees the PCP cannot be used after zone_pcp_enable(). That should be the case already because all the pages have been freed and there is no page to put on the PCP lists. Link: https://lkml.kernel.org/r/20210412090346.GQ3697@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: "Michael S. Tsirkin" Cc: Vlastimil Babka Cc: Alexander Duyck Cc: Minchan Kim Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 81db38926266..b012805a11ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9020,12 +9020,9 @@ void zone_pcp_enable(struct zone *zone) void zone_pcp_reset(struct zone *zone) { - unsigned long flags; int cpu; struct per_cpu_pageset *pset; - /* avoid races with drain_pages() */ - local_irq_save(flags); if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); @@ -9034,7 +9031,6 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } - local_irq_restore(flags); } #ifdef CONFIG_MEMORY_HOTREMOVE From 8736cc2d002f14e90d2b33bc5bef1740f6275ba4 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:33 -0700 Subject: [PATCH 1192/1379] drivers/base/memory: introduce memory_block_{online,offline} Patch series "Allocate memmap from hotadded memory (per device)", v10. The primary goal of this patchset is to reduce memory overhead of the hot-added memory (at least for SPARSEMEM_VMEMMAP memory model). The current way we use to populate memmap (struct page array) has two main drawbacks: a) it consumes an additional memory until the hotadded memory itself is onlined and b) memmap might end up on a different numa node which is especially true for movable_node configuration. c) due to fragmentation we might end up populating memmap with base pages One way to mitigate all these issues is to simply allocate memmap array (which is the largest memory footprint of the physical memory hotplug) from the hot-added memory itself. SPARSEMEM_VMEMMAP memory model allows us to map any pfn range so the memory doesn't need to be online to be usable for the array. See patch 4 for more details. This feature is only usable when CONFIG_SPARSEMEM_VMEMMAP is set. [Overall design]: Implementation wise we reuse vmem_altmap infrastructure to override the default allocator used by vmemap_populate. memory_block structure gains a new field called nr_vmemmap_pages, which accounts for the number of vmemmap pages used by that memory_block. E.g: On x86_64, that is 512 vmemmap pages on small memory bloks and 4096 on large memory blocks (1GB) We also introduce new two functions: memory_block_{online,offline}. These functions take care of initializing/unitializing vmemmap pages prior to calling {online,offline}_pages, so the latter functions can remain totally untouched. More details can be found in the respective changelogs. This patch (of 8): This is a preparatory patch that introduces two new functions: memory_block_online() and memory_block_offline(). For now, these functions will only call online_pages() and offline_pages() respectively, but they will be later in charge of preparing the vmemmap pages, carrying out the initialization and proper accounting of such pages. Since memory_block struct contains all the information, pass this struct down the chain till the end functions. Link: https://lkml.kernel.org/r/20210421102701.25051-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20210421102701.25051-2-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Vlastimil Babka Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f35298425575..f209925a5d4e 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -169,30 +169,41 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } +static int memory_block_online(struct memory_block *mem) +{ + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + + return online_pages(start_pfn, nr_pages, mem->online_type, mem->nid); +} + +static int memory_block_offline(struct memory_block *mem) +{ + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); + unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + + return offline_pages(start_pfn, nr_pages); +} + /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. */ static int -memory_block_action(unsigned long start_section_nr, unsigned long action, - int online_type, int nid) +memory_block_action(struct memory_block *mem, unsigned long action) { - unsigned long start_pfn; - unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; int ret; - start_pfn = section_nr_to_pfn(start_section_nr); - switch (action) { case MEM_ONLINE: - ret = online_pages(start_pfn, nr_pages, online_type, nid); + ret = memory_block_online(mem); break; case MEM_OFFLINE: - ret = offline_pages(start_pfn, nr_pages); + ret = memory_block_offline(mem); break; default: WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " - "%ld\n", __func__, start_section_nr, action, action); + "%ld\n", __func__, mem->start_section_nr, action, action); ret = -EINVAL; } @@ -210,9 +221,7 @@ static int memory_block_change_state(struct memory_block *mem, if (to_state == MEM_OFFLINE) mem->state = MEM_GOING_OFFLINE; - ret = memory_block_action(mem->start_section_nr, to_state, - mem->online_type, mem->nid); - + ret = memory_block_action(mem, to_state); mem->state = ret ? from_state_req : to_state; return ret; From dd8e2f230d82ecd60504fba48bb10bf3760b674e Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:36 -0700 Subject: [PATCH 1193/1379] mm,memory_hotplug: relax fully spanned sections check We want {online,offline}_pages to operate on whole memblocks, but memmap_on_memory will poke pageblock_nr_pages aligned holes in the beginning, which is a special case we want to allow. Relax the check to account for that case. Link: https://lkml.kernel.org/r/20210421102701.25051-3-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 729fba144c71..b8467faf2f69 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -838,9 +838,16 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int ret; struct memory_notify arg; - /* We can only online full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(pfn, pageblock_nr_pages) || + !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); @@ -1573,9 +1580,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) int ret, node; char *reason; - /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */ + /* + * {on,off}lining is constrained to full memory sections (or more + * precisly to memory blocks from the user space POV). + * memmap_on_memory is an exception because it reserves initial part + * of the physical memory space for vmemmaps. That space is pageblock + * aligned. + */ if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION))) + !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; mem_hotplug_begin(); From f9901144e48f6a7ba186249add705d10e74738ec Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 4 May 2021 18:39:39 -0700 Subject: [PATCH 1194/1379] mm,memory_hotplug: factor out adjusting present pages into adjust_present_page_count() Let's have a single place (inspired by adjust_managed_page_count()) where we adjust present pages. In contrast to adjust_managed_page_count(), only memory onlining or offlining is allowed to modify the number of present pages. Link: https://lkml.kernel.org/r/20210421102701.25051-4-osalvador@suse.de Signed-off-by: David Hildenbrand Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b8467faf2f69..04f01fabc150 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -829,6 +829,16 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } +static void adjust_present_page_count(struct zone *zone, long nr_pages) +{ + unsigned long flags; + + zone->present_pages += nr_pages; + pgdat_resize_lock(zone->zone_pgdat, &flags); + zone->zone_pgdat->node_present_pages += nr_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); +} + int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid) { @@ -884,11 +894,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, } online_pages_range(pfn, nr_pages); - zone->present_pages += nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages += nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, nr_pages); node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1706,11 +1712,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); - zone->present_pages -= nr_pages; - - pgdat_resize_lock(zone->zone_pgdat, &flags); - zone->zone_pgdat->node_present_pages -= nr_pages; - pgdat_resize_unlock(zone->zone_pgdat, &flags); + adjust_present_page_count(zone, -nr_pages); init_per_zone_wmark_min(); From a08a2ae3461383c2d50d0997dcc6cd1dd1fefb08 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:42 -0700 Subject: [PATCH 1195/1379] mm,memory_hotplug: allocate memmap from the added memory range Physical memory hotadd has to allocate a memmap (struct page array) for the newly added memory section. Currently, alloc_pages_node() is used for those allocations. This has some disadvantages: a) an existing memory is consumed for that purpose (eg: ~2MB per 128MB memory section on x86_64) This can even lead to extreme cases where system goes OOM because the physically hotplugged memory depletes the available memory before it is onlined. b) if the whole node is movable then we have off-node struct pages which has performance drawbacks. c) It might be there are no PMD_ALIGNED chunks so memmap array gets populated with base pages. This can be improved when CONFIG_SPARSEMEM_VMEMMAP is enabled. Vmemap page tables can map arbitrary memory. That means that we can reserve a part of the physically hotadded memory to back vmemmap page tables. This implementation uses the beginning of the hotplugged memory for that purpose. There are some non-obviously things to consider though. Vmemmap pages are allocated/freed during the memory hotplug events (add_memory_resource(), try_remove_memory()) when the memory is added/removed. This means that the reserved physical range is not online although it is used. The most obvious side effect is that pfn_to_online_page() returns NULL for those pfns. The current design expects that this should be OK as the hotplugged memory is considered a garbage until it is onlined. For example hibernation wouldn't save the content of those vmmemmaps into the image so it wouldn't be restored on resume but this should be OK as there no real content to recover anyway while metadata is reachable from other data structures (e.g. vmemmap page tables). The reserved space is therefore (de)initialized during the {on,off}line events (mhp_{de}init_memmap_on_memory). That is done by extracting page allocator independent initialization from the regular onlining path. The primary reason to handle the reserved space outside of {on,off}line_pages is to make each initialization specific to the purpose rather than special case them in a single function. As per above, the functions that are introduced are: - mhp_init_memmap_on_memory: Initializes vmemmap pages by calling move_pfn_range_to_zone(), calls kasan_add_zero_shadow(), and onlines as many sections as vmemmap pages fully span. - mhp_deinit_memmap_on_memory: Offlines as many sections as vmemmap pages fully span, removes the range from zhe zone by remove_pfn_range_from_zone(), and calls kasan_remove_zero_shadow() for the range. The new function memory_block_online() calls mhp_init_memmap_on_memory() before doing the actual online_pages(). Should online_pages() fail, we clean up by calling mhp_deinit_memmap_on_memory(). Adjusting of present_pages is done at the end once we know that online_pages() succedeed. On offline, memory_block_offline() needs to unaccount vmemmap pages from present_pages() before calling offline_pages(). This is necessary because offline_pages() tears down some structures based on the fact whether the node or the zone become empty. If offline_pages() fails, we account back vmemmap pages. If it succeeds, we call mhp_deinit_memmap_on_memory(). Hot-remove: We need to be careful when removing memory, as adding and removing memory needs to be done with the same granularity. To check that this assumption is not violated, we check the memory range we want to remove and if a) any memory block has vmemmap pages and b) the range spans more than a single memory block, we scream out loud and refuse to proceed. If all is good and the range was using memmap on memory (aka vmemmap pages), we construct an altmap structure so free_hugepage_table does the right thing and calls vmem_altmap_free instead of free_pagetable. Link: https://lkml.kernel.org/r/20210421102701.25051-5-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 72 +++++++++++++-- include/linux/memory.h | 8 +- include/linux/memory_hotplug.h | 15 ++- include/linux/memremap.h | 2 +- include/linux/mmzone.h | 7 +- mm/Kconfig | 5 + mm/memory_hotplug.c | 161 +++++++++++++++++++++++++++++++-- mm/sparse.c | 2 - 8 files changed, 250 insertions(+), 22 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f209925a5d4e..b31b3af5c490 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -173,16 +173,73 @@ static int memory_block_online(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + struct zone *zone; + int ret; - return online_pages(start_pfn, nr_pages, mem->online_type, mem->nid); + zone = zone_for_pfn_range(mem->online_type, mem->nid, start_pfn, nr_pages); + + /* + * Although vmemmap pages have a different lifecycle than the pages + * they describe (they remain until the memory is unplugged), doing + * their initialization and accounting at memory onlining/offlining + * stage helps to keep accounting easier to follow - e.g vmemmaps + * belong to the same zone as the memory they backed. + */ + if (nr_vmemmap_pages) { + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); + if (ret) + return ret; + } + + ret = online_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages, zone); + if (ret) { + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + return ret; + } + + /* + * Account once onlining succeeded. If the zone was unpopulated, it is + * now already properly populated. + */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, nr_vmemmap_pages); + + return ret; } static int memory_block_offline(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; + unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + struct zone *zone; + int ret; - return offline_pages(start_pfn, nr_pages); + zone = page_zone(pfn_to_page(start_pfn)); + + /* + * Unaccount before offlining, such that unpopulated zone and kthreads + * can properly be torn down in offline_pages(). + */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, -nr_vmemmap_pages); + + ret = offline_pages(start_pfn + nr_vmemmap_pages, + nr_pages - nr_vmemmap_pages); + if (ret) { + /* offline_pages() failed. Account back. */ + if (nr_vmemmap_pages) + adjust_present_page_count(zone, nr_vmemmap_pages); + return ret; + } + + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + + return ret; } /* @@ -576,7 +633,8 @@ int register_memory(struct memory_block *memory) return ret; } -static int init_memory_block(unsigned long block_id, unsigned long state) +static int init_memory_block(unsigned long block_id, unsigned long state, + unsigned long nr_vmemmap_pages) { struct memory_block *mem; int ret = 0; @@ -593,6 +651,7 @@ static int init_memory_block(unsigned long block_id, unsigned long state) mem->start_section_nr = block_id * sections_per_block; mem->state = state; mem->nid = NUMA_NO_NODE; + mem->nr_vmemmap_pages = nr_vmemmap_pages; ret = register_memory(mem); @@ -612,7 +671,7 @@ static int add_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return init_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE); + MEM_ONLINE, 0); } static void unregister_memory(struct memory_block *memory) @@ -634,7 +693,8 @@ static void unregister_memory(struct memory_block *memory) * * Called under device_hotplug_lock. */ -int create_memory_block_devices(unsigned long start, unsigned long size) +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); @@ -647,7 +707,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size) return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = init_memory_block(block_id, MEM_OFFLINE); + ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages); if (ret) break; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 4da95e684e20..97e92e8b556a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -29,6 +29,11 @@ struct memory_block { int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ struct device dev; + /* + * Number of vmemmap pages. These pages + * lay at the beginning of the memory block. + */ + unsigned long nr_vmemmap_pages; }; int arch_get_memory_phys_device(unsigned long start_pfn); @@ -80,7 +85,8 @@ static inline int memory_notify(unsigned long val, void *v) #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -int create_memory_block_devices(unsigned long start, unsigned long size); +int create_memory_block_devices(unsigned long start, unsigned long size, + unsigned long vmemmap_pages); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7288aa5ef73b..28f32fd00fe9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -55,6 +55,14 @@ typedef int __bitwise mhp_t; */ #define MHP_MERGE_RESOURCE ((__force mhp_t)BIT(0)) +/* + * We want memmap (struct page array) to be self contained. + * To do so, we will use the beginning of the hot-added range to build + * the page tables for the memmap array that describes the entire range. + * Only selected architectures support it with SPARSE_VMEMMAP. + */ +#define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) + /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) @@ -99,9 +107,13 @@ static inline void zone_seqlock_init(struct zone *zone) extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +extern void adjust_present_page_count(struct zone *zone, long nr_pages); /* VM interface that may be used by firmware interface */ +extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone); +extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid); + struct zone *zone); extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn); extern void __offline_isolated_pages(unsigned long start_pfn, @@ -359,6 +371,7 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_ extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); +extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f5b464daeeca..45a79da89c5f 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -17,7 +17,7 @@ struct device; * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { - const unsigned long base_pfn; + unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e8922a67d1a4..917bd6c604d5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -436,6 +436,11 @@ enum zone_type { * situations where ZERO_PAGE(0) which is allocated differently * on different platforms may end up in a movable zone. ZERO_PAGE(0) * cannot be migrated. + * 7. Memory-hotplug: when using memmap_on_memory and onlining the + * memory to the MOVABLE zone, the vmemmap pages are also placed in + * such zone. Such pages cannot be really moved around as they are + * self-stored in the range, but they are treated as movable when + * the range they describe is about to be offlined. * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) @@ -1392,10 +1397,8 @@ static inline int online_section_nr(unsigned long nr) #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); -#ifdef CONFIG_MEMORY_HOTREMOVE void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif -#endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { diff --git a/mm/Kconfig b/mm/Kconfig index fe4897c3c81b..02d44e3420f5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -188,6 +188,11 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION +config MHP_MEMMAP_ON_MEMORY + def_bool y + depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP + depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 04f01fabc150..0b3157836814 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,6 +42,8 @@ #include "internal.h" #include "shuffle.h" +static bool memmap_on_memory; + /* * online_page_callback contains pointer to current page onlining function. * Initially it is generic_online_page(). If it is required it could be @@ -648,9 +650,16 @@ static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages) * decide to not expose all pages to the buddy (e.g., expose them * later). We account all pages as being online and belonging to this * zone ("present"). + * When using memmap_on_memory, the range might not be aligned to + * MAX_ORDER_NR_PAGES - 1, but pageblock aligned. __ffs() will detect + * this and the first chunk to online will be pageblock_nr_pages. */ - for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES) - (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1); + for (pfn = start_pfn; pfn < end_pfn;) { + int order = min(MAX_ORDER - 1UL, __ffs(pfn)); + + (*online_page_callback)(pfn_to_page(pfn), order); + pfn += (1UL << order); + } /* mark all involved sections as online */ online_mem_sections(start_pfn, end_pfn); @@ -829,7 +838,11 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -static void adjust_present_page_count(struct zone *zone, long nr_pages) +/* + * This function should only be called by memory_block_{online,offline}, + * and {online,offline}_pages. + */ +void adjust_present_page_count(struct zone *zone, long nr_pages) { unsigned long flags; @@ -839,12 +852,54 @@ static void adjust_present_page_count(struct zone *zone, long nr_pages) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, - int online_type, int nid) +int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, + struct zone *zone) +{ + unsigned long end_pfn = pfn + nr_pages; + int ret; + + ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); + if (ret) + return ret; + + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections online here as otherwise they will be + * left offline. + */ + if (nr_pages >= PAGES_PER_SECTION) + online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + return ret; +} + +void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long end_pfn = pfn + nr_pages; + + /* + * It might be that the vmemmap_pages fully span sections. If that is + * the case, mark those sections offline here as otherwise they will be + * left online. + */ + if (nr_pages >= PAGES_PER_SECTION) + offline_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION)); + + /* + * The pages associated with this vmemmap have been offlined, so + * we can reset its state here. + */ + remove_pfn_range_from_zone(page_zone(pfn_to_page(pfn)), pfn, nr_pages); + kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); +} + +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone) { unsigned long flags; - struct zone *zone; int need_zonelists_rebuild = 0; + const int nid = zone_to_nid(zone); int ret; struct memory_notify arg; @@ -863,7 +918,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, mem_hotplug_begin(); /* associate pfn range with the zone */ - zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); arg.start_pfn = pfn; @@ -1077,6 +1131,45 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } +bool mhp_supports_memmap_on_memory(unsigned long size) +{ + unsigned long nr_vmemmap_pages = size / PAGE_SIZE; + unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + unsigned long remaining_size = size - vmemmap_size; + + /* + * Besides having arch support and the feature enabled at runtime, we + * need a few more assumptions to hold true: + * + * a) We span a single memory block: memory onlining/offlinin;g happens + * in memory block granularity. We don't want the vmemmap of online + * memory blocks to reside on offline memory blocks. In the future, + * we might want to support variable-sized memory blocks to make the + * feature more versatile. + * + * b) The vmemmap pages span complete PMDs: We don't want vmemmap code + * to populate memory from the altmap for unrelated parts (i.e., + * other memory blocks) + * + * c) The vmemmap pages (and thereby the pages that will be exposed to + * the buddy) have to cover full pageblocks: memory onlining/offlining + * code requires applicable ranges to be page-aligned, for example, to + * set the migratetypes properly. + * + * TODO: Although we have a check here to make sure that vmemmap pages + * fully populate a PMD, it is not the right place to check for + * this. A much better solution involves improving vmemmap code + * to fallback to base pages when trying to populate vmemmap using + * altmap as an alternative source of memory, and we do not exactly + * populate a single PMD. + */ + return memmap_on_memory && + IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) && + size == memory_block_size_bytes() && + IS_ALIGNED(vmemmap_size, PMD_SIZE) && + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); +} + /* * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations (triggered e.g. by sysfs). @@ -1086,6 +1179,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; + struct vmem_altmap mhp_altmap = {}; u64 start, size; bool new_node = false; int ret; @@ -1112,13 +1206,26 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; new_node = ret; + /* + * Self hosted memmap array + */ + if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { + if (!mhp_supports_memmap_on_memory(size)) { + ret = -EINVAL; + goto error; + } + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = &mhp_altmap; + } + /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, ¶ms); if (ret < 0) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size); + ret = create_memory_block_devices(start, size, mhp_altmap.alloc); if (ret) { arch_remove_memory(nid, start, size, NULL); goto error; @@ -1767,6 +1874,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) return 0; } +static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) +{ + /* + * If not set, continue with the next block. + */ + return mem->nr_vmemmap_pages; +} + static int check_cpu_on_node(pg_data_t *pgdat) { int cpu; @@ -1841,6 +1956,9 @@ EXPORT_SYMBOL(try_offline_node); static int __ref try_remove_memory(int nid, u64 start, u64 size) { int rc = 0; + struct vmem_altmap mhp_altmap = {}; + struct vmem_altmap *altmap = NULL; + unsigned long nr_vmemmap_pages; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1853,6 +1971,31 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) if (rc) return rc; + /* + * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in + * the same granularity it was added - a single memory block. + */ + if (memmap_on_memory) { + nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, + get_nr_vmemmap_pages_cb); + if (nr_vmemmap_pages) { + if (size != memory_block_size_bytes()) { + pr_warn("Refuse to remove %#llx - %#llx," + "wrong granularity\n", + start, start + size); + return -EINVAL; + } + + /* + * Let remove_pmd_table->free_hugepage_table do the + * right thing if we used vmem_altmap when hot-adding + * the range. + */ + mhp_altmap.alloc = nr_vmemmap_pages; + altmap = &mhp_altmap; + } + } + /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1864,7 +2007,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); - arch_remove_memory(nid, start, size, NULL); + arch_remove_memory(nid, start, size, altmap); if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_free(start, size); diff --git a/mm/sparse.c b/mm/sparse.c index 33406ea2ecc4..d3fbed26e64e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -624,7 +624,6 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } } -#ifdef CONFIG_MEMORY_HOTREMOVE /* Mark all memory sections within the pfn range as offline */ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) { @@ -645,7 +644,6 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) ms->section_mem_map &= ~SECTION_IS_ONLINE; } } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static struct page * __meminit populate_section_memmap(unsigned long pfn, From 4a3e5de9c4ec41bb0684b0d4e0c16abc39617d88 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:45 -0700 Subject: [PATCH 1196/1379] acpi,memhotplug: enable MHP_MEMMAP_ON_MEMORY when supported Let the caller check whether it can pass MHP_MEMMAP_ON_MEMORY by checking mhp_supports_memmap_on_memory(). MHP_MEMMAP_ON_MEMORY can only be set in case ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE is enabled, the architecture supports altmap, and the range to be added spans a single memory block. Link: https://lkml.kernel.org/r/20210421102701.25051-6-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/acpi_memhotplug.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index b02fd51e5589..8cc195c4c861 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -171,6 +171,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) acpi_handle handle = mem_device->device->handle; int result, num_enabled = 0; struct acpi_memory_info *info; + mhp_t mhp_flags = MHP_NONE; int node; node = acpi_get_node(handle); @@ -194,8 +195,10 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (node < 0) node = memory_add_physaddr_to_nid(info->start_addr); + if (mhp_supports_memmap_on_memory(info->length)) + mhp_flags |= MHP_MEMMAP_ON_MEMORY; result = __add_memory(node, info->start_addr, info->length, - MHP_NONE); + mhp_flags); /* * If the memory block has been used by the kernel, add_memory() From e3a9d9fcc3315993de2e9fcd7ea82fab84433815 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:48 -0700 Subject: [PATCH 1197/1379] mm,memory_hotplug: add kernel boot option to enable memmap_on_memory Self stored memmap leads to a sparse memory situation which is unsuitable for workloads that requires large contiguous memory chunks, so make this an opt-in which needs to be explicitly enabled. To control this, let memory_hotplug have its own memory space, as suggested by David, so we can add memory_hotplug.memmap_on_memory parameter. Link: https://lkml.kernel.org/r/20210421102701.25051-7-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 17 +++++++++++++++++ mm/Makefile | 5 ++++- mm/memory_hotplug.c | 10 +++++++++- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1c0a3cf6fcc9..d93fbc1c1917 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2801,6 +2801,23 @@ seconds. Use this parameter to check at some other rate. 0 disables periodic checking. + memory_hotplug.memmap_on_memory + [KNL,X86,ARM] Boolean flag to enable this feature. + Format: {on | off (default)} + When enabled, runtime hotplugged memory will + allocate its internal metadata (struct pages) + from the hotadded memory which will allow to + hotadd a lot of memory without requiring + additional memory to do so. + This feature is disabled by default because it + has some implication on large (e.g. GB) + allocations in some configurations (e.g. small + memory blocks). + The state of the flag can be read in + /sys/module/memory_hotplug/parameters/memmap_on_memory. + Note that even when enabled, there are a few cases where + the feature is not effective. + memtest= [KNL,X86,ARM,PPC] Enable memtest Format: default : 0 diff --git a/mm/Makefile b/mm/Makefile index 809033d77710..bf71e295e9f6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -58,9 +58,13 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ page-alloc-y := page_alloc.o page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o +# Give 'memory_hotplug' its own module-parameter namespace +memory-hotplug-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o + obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o +obj-y += $(memory-hotplug-y) ifdef CONFIG_MMU obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o @@ -83,7 +87,6 @@ obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ obj-$(CONFIG_FAILSLAB) += failslab.o -obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0b3157836814..f7e46f54a228 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -42,7 +42,15 @@ #include "internal.h" #include "shuffle.h" -static bool memmap_on_memory; + +/* + * memory_hotplug.memmap_on_memory parameter + */ +static bool memmap_on_memory __ro_after_init; +#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY +module_param(memmap_on_memory, bool, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); +#endif /* * online_page_callback contains pointer to current page onlining function. From f91ef2223dc425e2e8759a625cffd48dce3503de Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:51 -0700 Subject: [PATCH 1198/1379] x86/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE Enable x86_64 platform to use the MHP_MEMMAP_ON_MEMORY feature. Link: https://lkml.kernel.org/r/20210421102701.25051-8-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0bbc157e4bc7..0045e1b44190 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2432,6 +2432,9 @@ config ARCH_HAS_ADD_PAGES def_bool y depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + def_bool y + config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA From ca6e51d592d20180374366e71bb0972de002d509 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 4 May 2021 18:39:54 -0700 Subject: [PATCH 1199/1379] arm64/Kconfig: introduce ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE Enable arm64 platform to use the MHP_MEMMAP_ON_MEMORY feature. Link: https://lkml.kernel.org/r/20210421102701.25051-9-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: David Hildenbrand Cc: Anshuman Khandual Cc: Michal Hocko Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e12fad2e2a15..f0b17d758912 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -316,6 +316,9 @@ config ZONE_DMA32 bool "Support DMA32 zone" if EXPERT default y +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + def_bool y + config SMP def_bool y From 79cd420248c776005d534416bfc9b04696e6c729 Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Tue, 4 May 2021 18:39:57 -0700 Subject: [PATCH 1200/1379] mm/zswap.c: switch from strlcpy to strscpy strlcpy is marked as deprecated in Documentation/process/deprecated.rst, and there is no functional difference when the caller expects truncation (when not checking the return value). strscpy is relatively better as it also avoids scanning the whole source string. Link: https://lkml.kernel.org/r/1614227981-20367-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Cc: Seth Jennings Cc: Dan Streetman Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 578d9f256920..20763267a219 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -614,7 +614,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) } pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); - strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); if (!pool->acomp_ctx) { From ecfc2bda7aafc5c87b69a3d7a1fc1016dd21d5a7 Mon Sep 17 00:00:00 2001 From: zhouchuangao Date: Tue, 4 May 2021 18:40:00 -0700 Subject: [PATCH 1201/1379] mm/zsmalloc: use BUG_ON instead of if condition followed by BUG. It can be optimized at compile time. Link: https://lkml.kernel.org/r/1616727798-9110-1-git-send-email-zhouchuangao@vivo.com Signed-off-by: zhouchuangao Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 30c358b72025..58697f7a43f8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1987,8 +1987,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, head = obj_to_head(page, addr); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); + BUG_ON(!testpin_tag(handle)); old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); @@ -2035,8 +2034,7 @@ unpin_objects: head = obj_to_head(page, addr); if (head & OBJ_ALLOCATED_TAG) { handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); + BUG_ON(!testpin_tag(handle)); unpin_tag(handle); } } From 28961998f858114e51d2ae862065b858afcfa2b2 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 4 May 2021 18:40:03 -0700 Subject: [PATCH 1202/1379] iov_iter: lift memzero_page() to highmem.h Patch series "btrfs: Convert kmap/memset/kunmap to memzero_user()". Lifting memzero_user(), convert it to kmap_local_page() and then use it in btrfs. This patch (of 3): memzero_page() can replace the kmap/memset/kunmap pattern in other places in the code. While zero_user() has the same interface it is not the same call and its use should be limited and some of those calls may be better converted from zero_user() to memzero_page().[1] But that is not addressed in this series. Lift memzero_page() to highmem. [1] https://lore.kernel.org/lkml/CAHk-=wijdojzo56FzYqE5TOYw2Vws7ik3LEMGj9SPQaJJ+Z73Q@mail.gmail.com/ Link: https://lkml.kernel.org/r/20210309212137.2610186-1-ira.weiny@intel.com Link: https://lkml.kernel.org/r/20210309212137.2610186-2-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Alexander Viro Cc: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: Chaitanya Kulkarni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 7 +++++++ lib/iov_iter.c | 8 +------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 44170f312ae7..832b49b50c7b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -332,4 +332,11 @@ static inline void memcpy_to_page(struct page *page, size_t offset, kunmap_local(to); } +static inline void memzero_page(struct page *page, size_t offset, size_t len) +{ + char *addr = kmap_atomic(page); + memset(addr + offset, 0, len); + kunmap_atomic(addr); +} + #endif /* _LINUX_HIGHMEM_H */ diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 61228a6c69f8..c701b7a187f2 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -507,13 +508,6 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_init); -static void memzero_page(struct page *page, size_t offset, size_t len) -{ - char *addr = kmap_atomic(page); - memset(addr + offset, 0, len); - kunmap_atomic(addr); -} - static inline bool allocated(struct pipe_buffer *buf) { return buf->ops == &default_pipe_buf_ops; From d048b9c2a737eb791a5e9506930f72b02efb8b24 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 4 May 2021 18:40:07 -0700 Subject: [PATCH 1203/1379] btrfs: use memzero_page() instead of open coded kmap pattern There are many places where kmap/memset/kunmap patterns occur. Use the newly lifted memzero_page() to eliminate direct uses of kmap and leverage the new core functions use of kmap_local_page(). The development of this patch was aided by the following coccinelle script: // // SPDX-License-Identifier: GPL-2.0-only // Find kmap/memset/kunmap pattern and replace with memset*page calls // // NOTE: Offsets and other expressions may be more complex than what the script // will automatically generate. Therefore a catchall rule is provided to find // the pattern which then must be evaluated by hand. // // Confidence: Low // Copyright: (C) 2021 Intel Corporation // URL: http://coccinelle.lip6.fr/ // Comments: // Options: // // Then the memset pattern // @ memset_rule1 @ expression page, V, L, Off; identifier ptr; type VP; @@ ( -VP ptr = kmap(page); | -ptr = kmap(page); | -VP ptr = kmap_atomic(page); | -ptr = kmap_atomic(page); ) <+... ( -memset(ptr, 0, L); +memzero_page(page, 0, L); | -memset(ptr + Off, 0, L); +memzero_page(page, Off, L); | -memset(ptr, V, L); +memset_page(page, V, 0, L); | -memset(ptr + Off, V, L); +memset_page(page, V, Off, L); ) ...+> ( -kunmap(page); | -kunmap_atomic(ptr); ) // Remove any pointers left unused @ depends on memset_rule1 @ identifier memset_rule1.ptr; type VP, VP1; @@ -VP ptr; ... when != ptr; ? VP1 ptr; // // Catch all // @ memset_rule2 @ expression page; identifier ptr; expression GenTo, GenSize, GenValue; type VP; @@ ( -VP ptr = kmap(page); | -ptr = kmap(page); | -VP ptr = kmap_atomic(page); | -ptr = kmap_atomic(page); ) <+... ( // // Some call sites have complex expressions within the memset/memcpy // The follow are catch alls which need to be evaluated by hand. // -memset(GenTo, 0, GenSize); +memzero_pageExtra(page, GenTo, GenSize); | -memset(GenTo, GenValue, GenSize); +memset_pageExtra(page, GenValue, GenTo, GenSize); ) ...+> ( -kunmap(page); | -kunmap_atomic(ptr); ) // Remove any pointers left unused @ depends on memset_rule2 @ identifier memset_rule2.ptr; type VP, VP1; @@ -VP ptr; ... when != ptr; ? VP1 ptr; // Link: https://lkml.kernel.org/r/20210309212137.2610186-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: David Sterba Cc: Alexander Viro Cc: Chaitanya Kulkarni Cc: Chris Mason Cc: Josef Bacik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/compression.c | 5 +---- fs/btrfs/extent_io.c | 22 ++++------------------ fs/btrfs/inode.c | 33 ++++++++++----------------------- fs/btrfs/reflink.c | 6 +----- fs/btrfs/zlib.c | 5 +---- fs/btrfs/zstd.c | 5 +---- 6 files changed, 18 insertions(+), 58 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 17f93fd28f7e..2bea01d23a5b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -591,16 +591,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, free_extent_map(em); if (page->index == end_index) { - char *userpage; size_t zero_offset = offset_in_page(isize); if (zero_offset) { int zeros; zeros = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, zeros); + memzero_page(page, zero_offset, zeros); flush_dcache_page(page); - kunmap_atomic(userpage); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f2d1bb234377..074a78a202b8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } if (page->index == last_byte >> PAGE_SHIFT) { - char *userpage; size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, iosize); + memzero_page(page, zero_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); } } begin_page_read(fs_info, page); @@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 disk_bytenr; if (cur >= last_byte) { - char *userpage; struct extent_state *cached = NULL; iosize = PAGE_SIZE - pg_offset; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, @@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - char *userpage; struct extent_state *cached = NULL; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_SIZE - pg_offset); - kunmap_atomic(userpage); + memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); flush_dcache_page(page); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b21d491b3adc..4af336008b12 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -646,17 +646,12 @@ again: if (!ret) { unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; - char *kaddr; /* zero the tail end of the last page, we might be * sending it down to disk */ - if (offset) { - kaddr = kmap_atomic(page); - memset(kaddr + offset, 0, - PAGE_SIZE - offset); - kunmap_atomic(kaddr); - } + if (offset) + memzero_page(page, offset, PAGE_SIZE - offset); will_compress = 1; } } @@ -4833,7 +4828,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4925,15 +4919,13 @@ again: if (offset != blocksize) { if (!len) len = blocksize - offset; - kaddr = kmap(page); if (front) - memset(kaddr + (block_start - page_offset(page)), - 0, offset); + memzero_page(page, (block_start - page_offset(page)), + offset); else - memset(kaddr + (block_start - page_offset(page)) + offset, - 0, len); + memzero_page(page, (block_start - page_offset(page)) + offset, + len); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); set_page_dirty(page); @@ -6832,11 +6824,9 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size + pg_offset < PAGE_SIZE) { - char *map = kmap(page); - memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset); - kunmap(page); - } + if (max_size + pg_offset < PAGE_SIZE) + memzero_page(page, pg_offset + max_size, + PAGE_SIZE - max_size - pg_offset); kfree(tmp); return ret; } @@ -8506,7 +8496,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; unsigned long zero_start; loff_t size; vm_fault_t ret; @@ -8620,10 +8609,8 @@ again: zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) { - kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); + memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); set_page_dirty(page); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f4ec06b53aa0..3928ecc40d7b 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) { - char *map; - - map = kmap(page); - memset(map + datal, 0, block_size - datal); + memzero_page(page, datal, block_size - datal); flush_dcache_page(page); - kunmap(page); } SetPageUptodate(page); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index d524acf7b3e5..c3fa7d3fa770 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long bytes_left; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes_left = destlen; @@ -455,9 +454,7 @@ next: * end of the inline extent (destlen) to the end of the page */ if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 8e9626d63976..3e26b466476a 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, size_t ret2; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; stream = ZSTD_initDStream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); @@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, ret = 0; finish: if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } From 9727688dbf7ea9c3e1dc06885c6f3ba281feb1a8 Mon Sep 17 00:00:00 2001 From: songqiang Date: Tue, 4 May 2021 18:40:09 -0700 Subject: [PATCH 1204/1379] mm/highmem.c: fix coding style issue Delete/add some blank lines and some blank spaces Link: https://lkml.kernel.org/r/20210311095015.14277-1-songqiang@uniontech.com Signed-off-by: songqiang Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index 6ef8f5e05e7e..e389337e00b4 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) atomic_long_t _totalhigh_pages __read_mostly; EXPORT_SYMBOL(_totalhigh_pages); -unsigned int __nr_free_highpages (void) +unsigned int __nr_free_highpages(void) { struct zone *zone; unsigned int pages = 0; @@ -120,7 +120,7 @@ unsigned int __nr_free_highpages (void) static int pkmap_count[LAST_PKMAP]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); -pte_t * pkmap_page_table; +pte_t *pkmap_page_table; /* * Most architectures have no use for kmap_high_get(), so let's abstract @@ -147,6 +147,7 @@ struct page *__kmap_to_page(void *vaddr) if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { int i = PKMAP_NR(addr); + return pte_page(pkmap_page_table[i]); } @@ -278,9 +279,8 @@ void *kmap_high(struct page *page) pkmap_count[PKMAP_NR(vaddr)]++; BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2); unlock_kmap(); - return (void*) vaddr; + return (void *) vaddr; } - EXPORT_SYMBOL(kmap_high); #ifdef ARCH_NEEDS_KMAP_HIGH_GET @@ -305,7 +305,7 @@ void *kmap_high_get(struct page *page) pkmap_count[PKMAP_NR(vaddr)]++; } unlock_kmap_any(flags); - return (void*) vaddr; + return (void *) vaddr; } #endif @@ -737,7 +737,6 @@ done: spin_unlock_irqrestore(&pas->lock, flags); return ret; } - EXPORT_SYMBOL(page_address); /** From 68d68ff6ebbf69d02511dd48f16b3795671c9b0b Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Tue, 4 May 2021 18:40:12 -0700 Subject: [PATCH 1205/1379] mm/mempool: minor coding style tweaks Various coding style tweaks to various files under mm/ [daizhiyuan@phytium.com.cn: mm/swapfile: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614223624-16055-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/sparse: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614227288-19363-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/vmscan: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614227649-19853-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/compaction: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228218-20770-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/oom_kill: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228360-21168-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/shmem: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228504-21491-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/page_alloc: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228613-21754-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/filemap: minor coding style tweaks] Link: https://lkml.kernel.org/r/1614228936-22337-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/mlock: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613956588-2453-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/frontswap: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613962668-15045-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/vmalloc: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613963379-15988-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/memory_hotplug: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613971784-24878-1-git-send-email-daizhiyuan@phytium.com.cn [daizhiyuan@phytium.com.cn: mm/mempolicy: minor coding style tweaks] Link: https://lkml.kernel.org/r/1613972228-25501-1-git-send-email-daizhiyuan@phytium.com.cn Link: https://lkml.kernel.org/r/1614222374-13805-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- mm/filemap.c | 8 ++++---- mm/frontswap.c | 12 ++++++++---- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 4 ++-- mm/mempool.c | 2 +- mm/mlock.c | 4 ++-- mm/oom_kill.c | 2 +- mm/page_alloc.c | 2 +- mm/shmem.c | 2 +- mm/sparse.c | 2 +- mm/swapfile.c | 4 ++-- mm/vmalloc.c | 2 +- mm/vmscan.c | 2 +- 14 files changed, 27 insertions(+), 23 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 598dffbd5c8e..3a6c6b821f80 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2885,7 +2885,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) */ static int kcompactd(void *p) { - pg_data_t *pgdat = (pg_data_t*)p; + pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; unsigned int proactive_defer = 0; diff --git a/mm/filemap.c b/mm/filemap.c index ecc5f8a4c488..7fadf211643c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3267,7 +3267,7 @@ const struct vm_operations_struct generic_file_vm_ops = { /* This is used for a general mmap of a disk file */ -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; @@ -3292,11 +3292,11 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { return VM_FAULT_SIGBUS; } -int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } -int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { return -ENOSYS; } @@ -3724,7 +3724,7 @@ EXPORT_SYMBOL(generic_perform_write); ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; + struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written = 0; ssize_t err; diff --git a/mm/frontswap.c b/mm/frontswap.c index 2183a56c7874..130e301c5ac0 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -60,16 +60,20 @@ static u64 frontswap_succ_stores; static u64 frontswap_failed_stores; static u64 frontswap_invalidates; -static inline void inc_frontswap_loads(void) { +static inline void inc_frontswap_loads(void) +{ data_race(frontswap_loads++); } -static inline void inc_frontswap_succ_stores(void) { +static inline void inc_frontswap_succ_stores(void) +{ data_race(frontswap_succ_stores++); } -static inline void inc_frontswap_failed_stores(void) { +static inline void inc_frontswap_failed_stores(void) +{ data_race(frontswap_failed_stores++); } -static inline void inc_frontswap_invalidates(void) { +static inline void inc_frontswap_invalidates(void) +{ data_race(frontswap_invalidates++); } #else diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f7e46f54a228..70620d0dd923 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -834,7 +834,7 @@ static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn return movable_node_enabled ? movable_zone : kernel_zone; } -struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, +struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages) { if (online_type == MMOP_ONLINE_KERNEL) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c0343c742bed..3ebe2cfc64af 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -330,7 +330,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } @@ -1161,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, tmp = *from; while (!nodes_empty(tmp)) { - int s,d; + int s, d; int source = NUMA_NO_NODE; int dest = 0; diff --git a/mm/mempool.c b/mm/mempool.c index fe19d290a301..a258cf4de575 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -251,7 +251,7 @@ EXPORT_SYMBOL(mempool_init); mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data) { - return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data, + return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } EXPORT_SYMBOL(mempool_create); diff --git a/mm/mlock.c b/mm/mlock.c index f8f8cc32d03d..df590fda5688 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -559,7 +559,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, vm_flags_t flags) { unsigned long nstart, end, tmp; - struct vm_area_struct * vma, * prev; + struct vm_area_struct *vma, *prev; int error; VM_BUG_ON(offset_in_page(start)); @@ -737,7 +737,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { - struct vm_area_struct * vma, * prev = NULL; + struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fa1cf18bac97..3df2ac6b8686 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -993,7 +993,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) if (oom_group) { mem_cgroup_print_oom_group(oom_group); mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, - (void*)message); + (void *)message); mem_cgroup_put(oom_group); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b012805a11ad..bcdc0c6f21f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8808,7 +8808,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = __alloc_contig_migrate_range(&cc, start, end); if (ret && ret != -EBUSY) goto done; - ret =0; + ret = 0; /* * Pages from [start, end) are within a MAX_ORDER_NR_PAGES diff --git a/mm/shmem.c b/mm/shmem.c index 162d8f8993bb..a08cedefbfaa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3508,7 +3508,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data) } } if (*this_char) { - char *value = strchr(this_char,'='); + char *value = strchr(this_char, '='); size_t len = 0; int err; diff --git a/mm/sparse.c b/mm/sparse.c index d3fbed26e64e..b2ada9dc00cb 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -257,7 +257,7 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en if (unlikely(!mem_section)) { unsigned long size, align; - size = sizeof(struct mem_section*) * NR_SECTION_ROOTS; + size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); mem_section = memblock_alloc(size, align); if (!mem_section) diff --git a/mm/swapfile.c b/mm/swapfile.c index 084a5b9a18e5..149e77454e3c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2780,7 +2780,7 @@ static int swap_show(struct seq_file *swap, void *v) unsigned int bytes, inuse; if (si == SEQ_START_TOKEN) { - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); + seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } @@ -3284,7 +3284,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sizeof(long), GFP_KERNEL); - if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d33894d7b27a..9c539f0730a5 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3083,7 +3083,7 @@ EXPORT_SYMBOL(vzalloc_node); * 64b systems should always have either DMA or DMA32 zones. For others * GFP_DMA32 should do the right thing and use the normal zone. */ -#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL +#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #endif /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 44c49acf10c4..5199b9696bab 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4059,7 +4059,7 @@ static int kswapd(void *p) { unsigned int alloc_order, reclaim_order; unsigned int highest_zoneidx = MAX_NR_ZONES - 1; - pg_data_t *pgdat = (pg_data_t*)p; + pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); From 0c4ff27a0e541bcee167612fc9065623d75314a3 Mon Sep 17 00:00:00 2001 From: Zhang Yunkai Date: Tue, 4 May 2021 18:40:15 -0700 Subject: [PATCH 1206/1379] mm/process_vm_access.c: remove duplicate include 'linux/compat.h' included in 'process_vm_access.c' is duplicated. Link: https://lkml.kernel.org/r/20210306132122.220431-1-zhang.yunkai@zte.com.cn Signed-off-by: Zhang Yunkai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/process_vm_access.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index f5fee9cf90f8..4bcc11958089 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include From 94868a1e127bbe0e03a4467f27196cd668cbc344 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 May 2021 18:40:18 -0700 Subject: [PATCH 1207/1379] kfence: zero guard page after out-of-bounds access After an out-of-bounds accesses, zero the guard page before re-protecting in kfence_guarded_free(). On one hand this helps make the failure mode of subsequent out-of-bounds accesses more deterministic, but could also prevent certain information leaks. Link: https://lkml.kernel.org/r/20210312121653.348518-1-elver@google.com Signed-off-by: Marco Elver Acked-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Jann Horn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index d53c91f881a4..768dbd58170d 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -372,6 +372,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z /* Restore page protection if there was an OOB access. */ if (meta->unprotected_page) { + memzero_explicit((void *)ALIGN_DOWN(meta->unprotected_page, PAGE_SIZE), PAGE_SIZE); kfence_protect(meta->unprotected_page); meta->unprotected_page = 0; } From 407f1d8c1b5f3ec66a6a3eb835d3b81c76440f4e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 May 2021 18:40:21 -0700 Subject: [PATCH 1208/1379] kfence: await for allocation using wait_event Patch series "kfence: optimize timer scheduling", v2. We have observed that mostly-idle systems with KFENCE enabled wake up otherwise idle CPUs, preventing such to enter a lower power state. Debugging revealed that KFENCE spends too much active time in toggle_allocation_gate(). While the first version of KFENCE was using all the right bits to be scheduling optimal, and thus power efficient, by simply using wait_event() + wake_up(), that code was unfortunately removed. As KFENCE was exposed to various different configs and tests, the scheduling optimal code slowly disappeared. First because of hung task warnings, and finally because of deadlocks when an allocation is made by timer code with debug objects enabled. Clearly, the "fixes" were not too friendly for devices that want to be power efficient. Therefore, let's try a little harder to fix the hung task and deadlock problems that we have with wait_event() + wake_up(), while remaining as scheduling friendly and power efficient as possible. Crucially, we need to defer the wake_up() to an irq_work, avoiding any potential for deadlock. The result with this series is that on the devices where we observed a power regression, power usage returns back to baseline levels. This patch (of 3): On mostly-idle systems, we have observed that toggle_allocation_gate() is a cause of frequent wake-ups, preventing an otherwise idle CPU to go into a lower power state. A late change in KFENCE's development, due to a potential deadlock [1], required changing the scheduling-friendly wait_event_timeout() and wake_up() to an open-coded wait-loop using schedule_timeout(). [1] https://lkml.kernel.org/r/000000000000c0645805b7f982e4@google.com To avoid unnecessary wake-ups, switch to using wait_event_timeout(). Unfortunately, we still cannot use a version with direct wake_up() in __kfence_alloc() due to the same potential for deadlock as in [1]. Instead, add a level of indirection via an irq_work that is scheduled if we determine that the kfence_timer requires a wake_up(). Link: https://lkml.kernel.org/r/20210421105132.3965998-1-elver@google.com Link: https://lkml.kernel.org/r/20210421105132.3965998-2-elver@google.com Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure") Signed-off-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jann Horn Cc: Mark Rutland Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kfence | 1 + mm/kfence/core.c | 43 ++++++++++++++++++++++++++++--------------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence index 78f50ccb3b45..e641add33947 100644 --- a/lib/Kconfig.kfence +++ b/lib/Kconfig.kfence @@ -7,6 +7,7 @@ menuconfig KFENCE bool "KFENCE: low-overhead sampling-based memory safety error detector" depends on HAVE_ARCH_KFENCE && (SLAB || SLUB) select STACKTRACE + select IRQ_WORK help KFENCE is a low-overhead sampling-based detector of heap out-of-bounds access, use-after-free, and invalid-free errors. KFENCE is designed diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 768dbd58170d..235d726f88bc 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -587,6 +588,17 @@ late_initcall(kfence_debugfs_init); /* === Allocation Gate Timer ================================================ */ +#ifdef CONFIG_KFENCE_STATIC_KEYS +/* Wait queue to wake up allocation-gate timer task. */ +static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); + +static void wake_up_kfence_timer(struct irq_work *work) +{ + wake_up(&allocation_wait); +} +static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer); +#endif + /* * Set up delayed work, which will enable and disable the static key. We need to * use a work queue (rather than a simple timer), since enabling and disabling a @@ -604,25 +616,13 @@ static void toggle_allocation_gate(struct work_struct *work) if (!READ_ONCE(kfence_enabled)) return; - /* Enable static key, and await allocation to happen. */ atomic_set(&kfence_allocation_gate, 0); #ifdef CONFIG_KFENCE_STATIC_KEYS + /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - /* - * Await an allocation. Timeout after 1 second, in case the kernel stops - * doing allocations, to avoid stalling this worker task for too long. - */ - { - unsigned long end_wait = jiffies + HZ; - do { - set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&kfence_allocation_gate) != 0) - break; - schedule_timeout(1); - } while (time_before(jiffies, end_wait)); - __set_current_state(TASK_RUNNING); - } + wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), HZ); + /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif @@ -729,6 +729,19 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) */ if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1) return NULL; +#ifdef CONFIG_KFENCE_STATIC_KEYS + /* + * waitqueue_active() is fully ordered after the update of + * kfence_allocation_gate per atomic_inc_return(). + */ + if (waitqueue_active(&allocation_wait)) { + /* + * Calling wake_up() here may deadlock when allocations happen + * from within timer code. Use an irq_work to defer it. + */ + irq_work_queue(&wake_up_kfence_timer_work); + } +#endif if (!READ_ONCE(kfence_enabled)) return NULL; From 37c9284f6932b915043717703d6496dfd59c85f5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 May 2021 18:40:24 -0700 Subject: [PATCH 1209/1379] kfence: maximize allocation wait timeout duration The allocation wait timeout was initially added because of warnings due to CONFIG_DETECT_HUNG_TASK=y [1]. While the 1 sec timeout is sufficient to resolve the warnings (given the hung task timeout must be 1 sec or larger) it may cause unnecessary wake-ups if the system is idle: https://lkml.kernel.org/r/CADYN=9J0DQhizAGB0-jz4HOBBh+05kMBXb4c0cXMS7Qi5NAJiw@mail.gmail.com Fix it by computing the timeout duration in terms of the current sysctl_hung_task_timeout_secs value. Link: https://lkml.kernel.org/r/20210421105132.3965998-3-elver@google.com Signed-off-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Hillf Danton Cc: Jann Horn Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 235d726f88bc..9742649f3f88 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -621,7 +622,16 @@ static void toggle_allocation_gate(struct work_struct *work) /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), HZ); + if (sysctl_hung_task_timeout_secs) { + /* + * During low activity with no allocations we might wait a + * while; let's avoid the hung task warning. + */ + wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), + sysctl_hung_task_timeout_secs * HZ / 2); + } else { + wait_event(allocation_wait, atomic_read(&kfence_allocation_gate)); + } /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); From 36f0b35d0894576fe63268ede80d9f5aa140be09 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 4 May 2021 18:40:27 -0700 Subject: [PATCH 1210/1379] kfence: use power-efficient work queue to run delayed work Use the power-efficient work queue, to avoid the pathological case where we keep pinning ourselves on the same possibly idle CPU on systems that want to be power-efficient (https://lwn.net/Articles/731052/). Link: https://lkml.kernel.org/r/20210421105132.3965998-4-elver@google.com Signed-off-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Hillf Danton Cc: Jann Horn Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kfence/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 9742649f3f88..e18fbbd5d9b4 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -636,7 +636,8 @@ static void toggle_allocation_gate(struct work_struct *work) /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); #endif - schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval)); + queue_delayed_work(system_power_efficient_wq, &kfence_timer, + msecs_to_jiffies(kfence_sample_interval)); } static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate); @@ -665,7 +666,7 @@ void __init kfence_init(void) } WRITE_ONCE(kfence_enabled, true); - schedule_delayed_work(&kfence_timer, 0); + queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0); pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE, CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool, (void *)(__kfence_pool + KFENCE_POOL_SIZE)); From cf754ae331be7cc192b951756a1dd031e9ed978a Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 5 May 2021 00:47:14 +0200 Subject: [PATCH 1211/1379] ethtool: fix missing NLM_F_MULTI flag when dumping When dumping the ethtool information from all the interfaces, the netlink reply should contain the NLM_F_MULTI flag. This flag allows userspace tools to identify that multiple messages are expected. Link: https://bugzilla.redhat.com/1953847 Fixes: 365f9ae4ee36 ("ethtool: fix genlmsg_put() failure handling in ethnl_default_dumpit()") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: David S. Miller --- net/ethtool/netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 290012d0d11d..88d8a0243f35 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -387,7 +387,8 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, int ret; ehdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - ðtool_genl_family, 0, ctx->ops->reply_cmd); + ðtool_genl_family, NLM_F_MULTI, + ctx->ops->reply_cmd); if (!ehdr) return -EMSGSIZE; From f941d686e602163faca0c90568cca6ead3ca41b3 Mon Sep 17 00:00:00 2001 From: Sean Gloumeau Date: Wed, 5 May 2021 00:15:39 -0400 Subject: [PATCH 1212/1379] Fix spelling error from "eleminate" to "eliminate" Spelling error "eleminate" amended to "eliminate". Signed-off-by: Sean Gloumeau Reviewed-by: Kieran Bingham Signed-off-by: David S. Miller --- drivers/net/ethernet/brocade/bna/bnad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index 7e4e831d720f..ba47777d9cff 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -1764,7 +1764,7 @@ bnad_dim_timeout(struct timer_list *t) } } - /* Check for BNAD_CF_DIM_ENABLED, does not eleminate a race */ + /* Check for BNAD_CF_DIM_ENABLED, does not eliminate a race */ if (test_bit(BNAD_RF_DIM_TIMER_RUNNING, &bnad->run_flags)) mod_timer(&bnad->dim_timer, jiffies + msecs_to_jiffies(BNAD_DIM_TIMER_FREQ)); From 52bfcdd87e83d9e69d22da5f26b1512ffc81deed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Wed, 5 May 2021 14:54:50 +0200 Subject: [PATCH 1213/1379] net:CXGB4: fix leak if sk_buff is not used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An sk_buff is allocated to send a flow control message, but it's not sent in all cases: in case the state is not appropiate to send it or if it can't be enqueued. In the first of these 2 cases, the sk_buff was discarded but not freed, producing a memory leak. Signed-off-by: Íñigo Huguet Signed-off-by: David S. Miller --- drivers/net/ethernet/chelsio/cxgb4/sge.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 256fae15e032..1e5f2edb70cf 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -2563,12 +2563,12 @@ int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc) spin_lock_bh(&eosw_txq->lock); if (tc != FW_SCHED_CLS_NONE) { if (eosw_txq->state != CXGB4_EO_STATE_CLOSED) - goto out_unlock; + goto out_free_skb; next_state = CXGB4_EO_STATE_FLOWC_OPEN_SEND; } else { if (eosw_txq->state != CXGB4_EO_STATE_ACTIVE) - goto out_unlock; + goto out_free_skb; next_state = CXGB4_EO_STATE_FLOWC_CLOSE_SEND; } @@ -2604,17 +2604,19 @@ int cxgb4_ethofld_send_flowc(struct net_device *dev, u32 eotid, u32 tc) eosw_txq_flush_pending_skbs(eosw_txq); ret = eosw_txq_enqueue(eosw_txq, skb); - if (ret) { - dev_consume_skb_any(skb); - goto out_unlock; - } + if (ret) + goto out_free_skb; eosw_txq->state = next_state; eosw_txq->flowc_idx = eosw_txq->pidx; eosw_txq_advance(eosw_txq, 1); ethofld_xmit(dev, eosw_txq); -out_unlock: + spin_unlock_bh(&eosw_txq->lock); + return 0; + +out_free_skb: + dev_consume_skb_any(skb); spin_unlock_bh(&eosw_txq->lock); return ret; } From 2c16db6c92b0ee4aa61e88366df82169e83c3f7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Wed, 5 May 2021 09:58:31 -0700 Subject: [PATCH 1214/1379] net: fix nla_strcmp to handle more then one trailing null character Android userspace has been using TCA_KIND with a char[IFNAMESIZ] many-null-terminated buffer containing the string 'bpf'. This works on 4.19 and ceases to work on 5.10. I'm not entirely sure what fixes tag to use, but I think the issue was likely introduced in the below mentioned 5.4 commit. Reported-by: Nucca Chen Cc: Cong Wang Cc: David Ahern Cc: David S. Miller Cc: Jakub Kicinski Cc: Jamal Hadi Salim Cc: Jiri Pirko Cc: Jiri Pirko Fixes: 62794fc4fbf5 ("net_sched: add max len check for TCA_KIND") Change-Id: I66dc281f165a2858fc29a44869a270a2d698a82b Signed-off-by: David S. Miller --- lib/nlattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/nlattr.c b/lib/nlattr.c index 5b6116e81f9f..1d051ef66afe 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -828,7 +828,7 @@ int nla_strcmp(const struct nlattr *nla, const char *str) int attrlen = nla_len(nla); int d; - if (attrlen > 0 && buf[attrlen - 1] == '\0') + while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; From 3cf4524ce40b204418537e6a3a55ed44911b3f53 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 27 Apr 2021 14:38:26 +0800 Subject: [PATCH 1215/1379] x86/smpboot: Remove duplicate includes Signed-off-by: Wan Jiabing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210427063835.9039-1-wanjiabing@vivo.com --- arch/x86/kernel/smpboot.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7ffb0cf3f997..0ad5214f598a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1865,9 +1865,6 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) return true; } -#include -#include - #define X86_MATCH(model) \ X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) From 790d1ce71de9199bf9fd37c4743aec4a09489a51 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 22 Apr 2021 21:58:40 +0300 Subject: [PATCH 1216/1379] x86: Delete UD0, UD1 traces Both instructions aren't used by kernel. Signed-off-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/YIHHYNKbiSf5N7+o@localhost.localdomain --- arch/x86/include/asm/bug.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 297fa12e7e27..84b87538a15d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -7,18 +7,9 @@ /* * Despite that some emulators terminate on UD2, we use it for WARN(). - * - * Since various instruction decoders/specs disagree on the encoding of - * UD0/UD1. */ - -#define ASM_UD0 ".byte 0x0f, 0xff" /* + ModRM (for Intel) */ -#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */ #define ASM_UD2 ".byte 0x0f, 0x0b" - -#define INSN_UD0 0xff0f #define INSN_UD2 0x0b0f - #define LEN_UD2 2 #ifdef CONFIG_GENERIC_BUG From 4029b9706d53e5e8db2e1cee6ecd75e60b62cd09 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 25 Apr 2021 14:12:29 -0700 Subject: [PATCH 1217/1379] x86/resctrl: Fix init const confusion const variable must be initconst, not initdata. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210425211229.3157674-1-ak@linux.intel.com --- arch/x86/kernel/cpu/resctrl/monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index dbeaa8409313..f07c10b87a87 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -84,7 +84,7 @@ unsigned int resctrl_cqm_threshold; static const struct mbm_correction_factor_table { u32 rmidthreshold; u64 cf; -} mbm_cf_table[] __initdata = { +} mbm_cf_table[] __initconst = { {7, CF(1.000000)}, {15, CF(1.000000)}, {15, CF(0.969650)}, From b6b4fbd90b155a0025223df2c137af8a701d53b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 15:56:31 -0700 Subject: [PATCH 1218/1379] x86/cpu: Initialize MSR_TSC_AUX if RDTSCP *or* RDPID is supported Initialize MSR_TSC_AUX with CPU node information if RDTSCP or RDPID is supported. This fixes a bug where vdso_read_cpunode() will read garbage via RDPID if RDPID is supported but RDTSCP is not. While no known CPU supports RDPID but not RDTSCP, both Intel's SDM and AMD's APM allow for RDPID to exist without RDTSCP, e.g. it's technically a legal CPU model for a virtual machine. Note, technically MSR_TSC_AUX could be initialized if and only if RDPID is supported since RDTSCP is currently not used to retrieve the CPU node. But, the cost of the superfluous WRMSR is negigible, whereas leaving MSR_TSC_AUX uninitialized is just asking for future breakage if someone decides to utilize RDTSCP. Fixes: a582c540ac1b ("x86/vdso: Use RDPID in preference to LSL when available") Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210504225632.1532621-2-seanjc@google.com --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6bdb69a9a7dc..490bed07fe35 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1851,7 +1851,7 @@ static inline void setup_getcpu(int cpu) unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; - if (boot_cpu_has(X86_FEATURE_RDTSCP)) + if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) write_rdtscp_aux(cpudata); /* Store CPU and node number in limit. */ From fc48a6d1faadbf08b7a840d58a5a6eb85bd1a79a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 15:56:32 -0700 Subject: [PATCH 1219/1379] x86/cpu: Remove write_tsc() and write_rdtscp_aux() wrappers Drop write_tsc() and write_rdtscp_aux(); the former has no users, and the latter has only a single user and is slightly misleading since the only in-kernel consumer of MSR_TSC_AUX is RDPID, not RDTSCP. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210504225632.1532621-3-seanjc@google.com --- arch/x86/include/asm/msr.h | 4 ---- arch/x86/kernel/cpu/common.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index e16cccdd0420..a3f87f1015d3 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -324,10 +324,6 @@ static inline int wrmsrl_safe(u32 msr, u64 val) return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); } -#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high)) - -#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0) - struct msr *msrs_alloc(void); void msrs_free(struct msr *msrs); int msr_set_bit(u32 msr, u8 bit); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490bed07fe35..a1b756c49a93 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1852,7 +1852,7 @@ static inline void setup_getcpu(int cpu) struct desc_struct d = { }; if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) - write_rdtscp_aux(cpudata); + wrmsr(MSR_TSC_AUX, cpudata, 0); /* Store CPU and node number in limit. */ d.limit0 = cpudata; From 8621436671f3a4bba5db57482e1ee604708bf1eb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 5 May 2021 12:40:48 -0700 Subject: [PATCH 1220/1379] smc: disallow TCP_ULP in smc_setsockopt() syzbot is able to setup kTLS on an SMC socket which coincidentally uses sk_user_data too. Later, kTLS treats it as psock so triggers a refcnt warning. The root cause is that smc_setsockopt() simply calls TCP setsockopt() which includes TCP_ULP. I do not think it makes sense to setup kTLS on top of SMC sockets, so we should just disallow this setup. It is hard to find a commit to blame, but we can apply this patch since the beginning of TCP_ULP. Reported-and-tested-by: syzbot+b54a1ce86ba4a623b7f0@syzkaller.appspotmail.com Fixes: 734942cc4ea6 ("tcp: ULP infrastructure") Cc: John Fastabend Signed-off-by: Karsten Graul Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/smc/af_smc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index be3e80b3e27f..5eff7cccceff 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2161,6 +2161,9 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, struct smc_sock *smc; int val, rc; + if (level == SOL_TCP && optname == TCP_ULP) + return -EOPNOTSUPP; + smc = smc_sk(sk); /* generic setsockopts reaching us here always apply to the @@ -2185,7 +2188,6 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, if (rc || smc->use_fallback) goto out; switch (optname) { - case TCP_ULP: case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_KEY: From 5e024c325406470d1165a09c6feaf8ec897936be Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 5 May 2021 22:25:24 +0200 Subject: [PATCH 1221/1379] netfilter: nfnetlink_osf: Fix a missing skb_header_pointer() NULL check Do not assume that the tcph->doff field is correct when parsing for TCP options, skb_header_pointer() might fail to fetch these bits. Fixes: 11eeef41d5f6 ("netfilter: passive OS fingerprint xtables match") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_osf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index e8f8875c6884..0fa2e2030427 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -186,6 +186,8 @@ static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + sizeof(struct tcphdr), ctx->optsize, opts); + if (!ctx->optp) + return NULL; } return tcp; From a217a6593cec8b315d4c2f344bae33660b39b703 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 4 May 2021 21:50:14 +0200 Subject: [PATCH 1222/1379] KVM/VMX: Invoke NMI non-IST entry instead of IST entry In VMX, the host NMI handler needs to be invoked after NMI VM-Exit. Before commit 1a5488ef0dcf6 ("KVM: VMX: Invoke NMI handler via indirect call instead of INTn"), this was done by INTn ("int $2"). But INTn microcode is relatively expensive, so the commit reworked NMI VM-Exit handling to invoke the kernel handler by function call. But this missed a detail. The NMI entry point for direct invocation is fetched from the IDT table and called on the kernel stack. But on 64-bit the NMI entry installed in the IDT expects to be invoked on the IST stack. It relies on the "NMI executing" variable on the IST stack to work correctly, which is at a fixed position in the IST stack. When the entry point is unexpectedly called on the kernel stack, the RSP-addressed "NMI executing" variable is obviously also on the kernel stack and is "uninitialized" and can cause the NMI entry code to run in the wrong way. Provide a non-ist entry point for VMX which shares the C-function with the regular NMI entry and invoke the new asm entry point instead. On 32-bit this just maps to the regular NMI entry point as 32-bit has no ISTs and is not affected. [ tglx: Made it independent for backporting, massaged changelog ] Fixes: 1a5488ef0dcf6 ("KVM: VMX: Invoke NMI handler via indirect call instead of INTn") Signed-off-by: Lai Jiangshan Signed-off-by: Thomas Gleixner Tested-by: Lai Jiangshan Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87r1imi8i1.ffs@nanos.tec.linutronix.de --- arch/x86/include/asm/idtentry.h | 15 +++++++++++++++ arch/x86/kernel/nmi.c | 10 ++++++++++ arch/x86/kvm/vmx/vmx.c | 16 +++++++++------- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index e35e342673c7..73d45b0dfff2 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -588,6 +588,21 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check); #endif /* NMI */ + +#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +/* + * Special NOIST entry point for VMX which invokes this on the kernel + * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI + * 'executing' marker. + * + * On 32bit this just uses the regular NMI entry point because 32-bit does + * not have ISTs. + */ +DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist); +#else +#define asm_exc_nmi_noist asm_exc_nmi +#endif + DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi); #ifdef CONFIG_XEN_PV DECLARE_IDTENTRY_RAW(X86_TRAP_NMI, xenpv_exc_nmi); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index bf250a339655..2ef961cf4cfc 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -524,6 +524,16 @@ nmi_restart: mds_user_clear_cpu_buffers(); } +#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL) +DEFINE_IDTENTRY_RAW(exc_nmi_noist) +{ + exc_nmi(regs); +} +#endif +#if IS_MODULE(CONFIG_KVM_INTEL) +EXPORT_SYMBOL_GPL(asm_exc_nmi_noist); +#endif + void stop_nmi(void) { ignore_nmis++; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cbe0cdade38a..b21d751407b5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -6415,18 +6416,17 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) void vmx_do_interrupt_nmi_irqoff(unsigned long entry); -static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, u32 intr_info) +static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, + unsigned long entry) { - unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; - gate_desc *desc = (gate_desc *)host_idt_base + vector; - kvm_before_interrupt(vcpu); - vmx_do_interrupt_nmi_irqoff(gate_offset(desc)); + vmx_do_interrupt_nmi_irqoff(entry); kvm_after_interrupt(vcpu); } static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { + const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ @@ -6437,18 +6437,20 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ else if (is_nmi(intr_info)) - handle_interrupt_nmi_irqoff(&vmx->vcpu, intr_info); + handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); } static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) { u32 intr_info = vmx_get_intr_info(vcpu); + unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; + gate_desc *desc = (gate_desc *)host_idt_base + vector; if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; - handle_interrupt_nmi_irqoff(vcpu, intr_info); + handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); } static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) From 866a6dadbb027b2955a7ae00bab9705d382def12 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 4 May 2021 17:27:28 -0700 Subject: [PATCH 1223/1379] context_tracking: Move guest exit context tracking to separate helpers Provide separate context tracking helpers for guest exit, the standalone helpers will be called separately by KVM x86 in later patches to fix tick-based accounting. Suggested-by: Thomas Gleixner Signed-off-by: Wanpeng Li Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210505002735.1684165-2-seanjc@google.com --- include/linux/context_tracking.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index bceb06498521..b8c7313495a7 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -131,10 +131,15 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +static __always_inline void context_tracking_guest_exit(void) { if (context_tracking_enabled()) __context_tracking_exit(CONTEXT_GUEST); +} + +static __always_inline void guest_exit_irqoff(void) +{ + context_tracking_guest_exit(); instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) @@ -159,6 +164,8 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_end(); } +static __always_inline void context_tracking_guest_exit(void) { } + static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); From 88d8220bbf06dd8045b2ac4be1046290eaa7773a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 4 May 2021 17:27:29 -0700 Subject: [PATCH 1224/1379] context_tracking: Move guest exit vtime accounting to separate helpers Provide separate vtime accounting functions for guest exit instead of open coding the logic within the context tracking code. This will allow KVM x86 to handle vtime accounting slightly differently when using tick-based accounting. Suggested-by: Thomas Gleixner Signed-off-by: Wanpeng Li Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Reviewed-by: Christian Borntraeger Link: https://lore.kernel.org/r/20210505002735.1684165-3-seanjc@google.com --- include/linux/context_tracking.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index b8c7313495a7..4f4556232dcf 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -137,15 +137,20 @@ static __always_inline void context_tracking_guest_exit(void) __context_tracking_exit(CONTEXT_GUEST); } +static __always_inline void vtime_account_guest_exit(void) +{ + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_exit(current); + else + current->flags &= ~PF_VCPU; +} + static __always_inline void guest_exit_irqoff(void) { context_tracking_guest_exit(); instrumentation_begin(); - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_exit(current); - else - current->flags &= ~PF_VCPU; + vtime_account_guest_exit(); instrumentation_end(); } @@ -166,12 +171,17 @@ static __always_inline void guest_enter_irqoff(void) static __always_inline void context_tracking_guest_exit(void) { } +static __always_inline void vtime_account_guest_exit(void) +{ + vtime_account_kernel(current); + current->flags &= ~PF_VCPU; +} + static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ - vtime_account_kernel(current); - current->flags &= ~PF_VCPU; + vtime_account_guest_exit(); instrumentation_end(); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ From 160457140187c5fb127b844e5a85f87f00a01b14 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 4 May 2021 17:27:30 -0700 Subject: [PATCH 1225/1379] KVM: x86: Defer vtime accounting 'til after IRQ handling Defer the call to account guest time until after servicing any IRQ(s) that happened in the guest or immediately after VM-Exit. Tick-based accounting of vCPU time relies on PF_VCPU being set when the tick IRQ handler runs, and IRQs are blocked throughout the main sequence of vcpu_enter_guest(), including the call into vendor code to actually enter and exit the guest. This fixes a bug where reported guest time remains '0', even when running an infinite loop in the guest: https://bugzilla.kernel.org/show_bug.cgi?id=209831 Fixes: 87fa7f3e98a131 ("x86/kvm: Move context tracking where it belongs") Suggested-by: Thomas Gleixner Co-developed-by: Sean Christopherson Signed-off-by: Wanpeng Li Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210505002735.1684165-4-seanjc@google.com --- arch/x86/kvm/svm/svm.c | 6 +++--- arch/x86/kvm/vmx/vmx.c | 6 +++--- arch/x86/kvm/x86.c | 9 +++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9790c73f2a32..c400def6220b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3753,15 +3753,15 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) * have them in state 'on' as recorded before entering guest mode. * Same as enter_from_user_mode(). * - * guest_exit_irqoff() restores host context and reinstates RCU if - * enabled and required. + * context_tracking_guest_exit() restores host context and reinstates + * RCU if enabled and required. * * This needs to be done before the below as native_read_msr() * contains a tracepoint and x86_spec_ctrl_restore_host() calls * into world and some more. */ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + context_tracking_guest_exit(); instrumentation_begin(); trace_hardirqs_off_finish(); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b21d751407b5..e108fb47855b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6703,15 +6703,15 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, * have them in state 'on' as recorded before entering guest mode. * Same as enter_from_user_mode(). * - * guest_exit_irqoff() restores host context and reinstates RCU if - * enabled and required. + * context_tracking_guest_exit() restores host context and reinstates + * RCU if enabled and required. * * This needs to be done before the below as native_read_msr() * contains a tracepoint and x86_spec_ctrl_restore_host() calls * into world and some more. */ lockdep_hardirqs_off(CALLER_ADDR0); - guest_exit_irqoff(); + context_tracking_guest_exit(); instrumentation_begin(); trace_hardirqs_off_finish(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cebdaa1e3cf5..6eda2834fc05 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9315,6 +9315,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_disable(); kvm_after_interrupt(vcpu); + /* + * Wait until after servicing IRQs to account guest time so that any + * ticks that occurred while running the guest are properly accounted + * to the guest. Waiting until IRQs are enabled degrades the accuracy + * of accounting via context tracking, but the loss of accuracy is + * acceptable for all known use cases. + */ + vtime_account_guest_exit(); + if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; if (delta != S64_MIN) { From b41c723b203e19480c26f2ec8f04eedc03d34b34 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 17:27:31 -0700 Subject: [PATCH 1226/1379] sched/vtime: Move vtime accounting external declarations above inlines Move the blob of external declarations (and their stubs) above the set of inline definitions (and their stubs) for vtime accounting. This will allow a future patch to bring in more inline definitions without also having to shuffle large chunks of code. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Reviewed-by: Christian Borntraeger Link: https://lore.kernel.org/r/20210505002735.1684165-5-seanjc@google.com --- include/linux/vtime.h | 74 +++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 041d6524d144..6a4317560539 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -10,6 +10,43 @@ struct task_struct; +/* + * Common vtime APIs + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +extern void vtime_account_kernel(struct task_struct *tsk); +extern void vtime_account_idle(struct task_struct *tsk); +#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ +static inline void vtime_account_kernel(struct task_struct *tsk) { } +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +extern void arch_vtime_task_switch(struct task_struct *tsk); +extern void vtime_user_enter(struct task_struct *tsk); +extern void vtime_user_exit(struct task_struct *tsk); +extern void vtime_guest_enter(struct task_struct *tsk); +extern void vtime_guest_exit(struct task_struct *tsk); +extern void vtime_init_idle(struct task_struct *tsk, int cpu); +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void vtime_user_enter(struct task_struct *tsk) { } +static inline void vtime_user_exit(struct task_struct *tsk) { } +static inline void vtime_guest_enter(struct task_struct *tsk) { } +static inline void vtime_guest_exit(struct task_struct *tsk) { } +static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } +#endif + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); +extern void vtime_account_softirq(struct task_struct *tsk); +extern void vtime_account_hardirq(struct task_struct *tsk); +extern void vtime_flush(struct task_struct *tsk); +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } +static inline void vtime_account_softirq(struct task_struct *tsk) { } +static inline void vtime_account_hardirq(struct task_struct *tsk) { } +static inline void vtime_flush(struct task_struct *tsk) { } +#endif + /* * vtime_accounting_enabled_this_cpu() definitions/declarations */ @@ -57,43 +94,6 @@ static inline void vtime_task_switch(struct task_struct *prev) { } #endif -/* - * Common vtime APIs - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void vtime_account_kernel(struct task_struct *tsk); -extern void vtime_account_idle(struct task_struct *tsk); -#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ -static inline void vtime_account_kernel(struct task_struct *tsk) { } -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void arch_vtime_task_switch(struct task_struct *tsk); -extern void vtime_user_enter(struct task_struct *tsk); -extern void vtime_user_exit(struct task_struct *tsk); -extern void vtime_guest_enter(struct task_struct *tsk); -extern void vtime_guest_exit(struct task_struct *tsk); -extern void vtime_init_idle(struct task_struct *tsk, int cpu); -#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -static inline void vtime_user_enter(struct task_struct *tsk) { } -static inline void vtime_user_exit(struct task_struct *tsk) { } -static inline void vtime_guest_enter(struct task_struct *tsk) { } -static inline void vtime_guest_exit(struct task_struct *tsk) { } -static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } -#endif - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); -extern void vtime_account_softirq(struct task_struct *tsk); -extern void vtime_account_hardirq(struct task_struct *tsk); -extern void vtime_flush(struct task_struct *tsk); -#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } -static inline void vtime_account_softirq(struct task_struct *tsk) { } -static inline void vtime_account_hardirq(struct task_struct *tsk) { } -static inline void vtime_flush(struct task_struct *tsk) { } -#endif - #ifdef CONFIG_IRQ_TIME_ACCOUNTING extern void irqtime_account_irq(struct task_struct *tsk, unsigned int offset); From 6f922b89e5518143920b10e3643e556d9df58d94 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 17:27:32 -0700 Subject: [PATCH 1227/1379] sched/vtime: Move guest enter/exit vtime accounting to vtime.h Provide separate helpers for guest enter vtime accounting (in addition to the existing guest exit helpers), and move all vtime accounting helpers to vtime.h where the existing #ifdef infrastructure can be leveraged to better delineate the different types of accounting. This will also allow future cleanups via deduplication of context tracking code. Opportunstically delete the vtime_account_kernel() stub now that all callers are wrapped with CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210505002735.1684165-6-seanjc@google.com --- include/linux/context_tracking.h | 17 +----------- include/linux/vtime.h | 46 +++++++++++++++++++++++++++----- 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 4f4556232dcf..56c648bdbde8 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -137,14 +137,6 @@ static __always_inline void context_tracking_guest_exit(void) __context_tracking_exit(CONTEXT_GUEST); } -static __always_inline void vtime_account_guest_exit(void) -{ - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_exit(current); - else - current->flags &= ~PF_VCPU; -} - static __always_inline void guest_exit_irqoff(void) { context_tracking_guest_exit(); @@ -163,20 +155,13 @@ static __always_inline void guest_enter_irqoff(void) * to flush. */ instrumentation_begin(); - vtime_account_kernel(current); - current->flags |= PF_VCPU; + vtime_account_guest_enter(); rcu_virt_note_context_switch(smp_processor_id()); instrumentation_end(); } static __always_inline void context_tracking_guest_exit(void) { } -static __always_inline void vtime_account_guest_exit(void) -{ - vtime_account_kernel(current); - current->flags &= ~PF_VCPU; -} - static __always_inline void guest_exit_irqoff(void) { instrumentation_begin(); diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 6a4317560539..3684487d01e1 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -3,21 +3,18 @@ #define _LINUX_KERNEL_VTIME_H #include +#include + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include #endif - -struct task_struct; - /* * Common vtime APIs */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING extern void vtime_account_kernel(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); -#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ -static inline void vtime_account_kernel(struct task_struct *tsk) { } #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN @@ -55,6 +52,18 @@ static inline void vtime_flush(struct task_struct *tsk) { } static inline bool vtime_accounting_enabled_this_cpu(void) { return true; } extern void vtime_task_switch(struct task_struct *prev); +static __always_inline void vtime_account_guest_enter(void) +{ + vtime_account_kernel(current); + current->flags |= PF_VCPU; +} + +static __always_inline void vtime_account_guest_exit(void) +{ + vtime_account_kernel(current); + current->flags &= ~PF_VCPU; +} + #elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN) /* @@ -86,12 +95,37 @@ static inline void vtime_task_switch(struct task_struct *prev) vtime_task_switch_generic(prev); } +static __always_inline void vtime_account_guest_enter(void) +{ + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_enter(current); + else + current->flags |= PF_VCPU; +} + +static __always_inline void vtime_account_guest_exit(void) +{ + if (vtime_accounting_enabled_this_cpu()) + vtime_guest_exit(current); + else + current->flags &= ~PF_VCPU; +} + #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ -static inline bool vtime_accounting_enabled_cpu(int cpu) {return false; } static inline bool vtime_accounting_enabled_this_cpu(void) { return false; } static inline void vtime_task_switch(struct task_struct *prev) { } +static __always_inline void vtime_account_guest_enter(void) +{ + current->flags |= PF_VCPU; +} + +static __always_inline void vtime_account_guest_exit(void) +{ + current->flags &= ~PF_VCPU; +} + #endif From 14296e0c447885d6c7b326e059fb528eb00526ed Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 17:27:33 -0700 Subject: [PATCH 1228/1379] context_tracking: Consolidate guest enter/exit wrappers Consolidate the guest enter/exit wrappers, providing and tweaking stubs as needed. This will allow moving the wrappers under KVM without having to bleed #ifdefs into the soon-to-be KVM code. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210505002735.1684165-7-seanjc@google.com --- include/linux/context_tracking.h | 65 ++++++++++++-------------------- 1 file changed, 24 insertions(+), 41 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 56c648bdbde8..aa58c2ac67ca 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -71,6 +71,19 @@ static inline void exception_exit(enum ctx_state prev_ctx) } } +static __always_inline bool context_tracking_guest_enter(void) +{ + if (context_tracking_enabled()) + __context_tracking_enter(CONTEXT_GUEST); + + return context_tracking_enabled_this_cpu(); +} + +static __always_inline void context_tracking_guest_exit(void) +{ + if (context_tracking_enabled()) + __context_tracking_exit(CONTEXT_GUEST); +} /** * ct_state() - return the current context tracking state if known @@ -92,6 +105,9 @@ static inline void user_exit_irqoff(void) { } static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } +static inline bool context_tracking_guest_enter(void) { return false; } +static inline void context_tracking_guest_exit(void) { } + #endif /* !CONFIG_CONTEXT_TRACKING */ #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) @@ -102,74 +118,41 @@ extern void context_tracking_init(void); static inline void context_tracking_init(void) { } #endif /* CONFIG_CONTEXT_TRACKING_FORCE */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN /* must be called with irqs disabled */ static __always_inline void guest_enter_irqoff(void) { + /* + * This is running in ioctl context so its safe to assume that it's the + * stime pending cputime to flush. + */ instrumentation_begin(); - if (vtime_accounting_enabled_this_cpu()) - vtime_guest_enter(current); - else - current->flags |= PF_VCPU; + vtime_account_guest_enter(); instrumentation_end(); - if (context_tracking_enabled()) - __context_tracking_enter(CONTEXT_GUEST); - - /* KVM does not hold any references to rcu protected data when it + /* + * KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode * is very similar to exiting to userspace from rcu point of view. In * addition CPU may stay in a guest mode for quite a long time (up to * one time slice). Lets treat guest mode as quiescent state, just like * we do with user-mode execution. */ - if (!context_tracking_enabled_this_cpu()) { + if (!context_tracking_guest_enter()) { instrumentation_begin(); rcu_virt_note_context_switch(smp_processor_id()); instrumentation_end(); } } -static __always_inline void context_tracking_guest_exit(void) -{ - if (context_tracking_enabled()) - __context_tracking_exit(CONTEXT_GUEST); -} - static __always_inline void guest_exit_irqoff(void) { context_tracking_guest_exit(); - instrumentation_begin(); - vtime_account_guest_exit(); - instrumentation_end(); -} - -#else -static __always_inline void guest_enter_irqoff(void) -{ - /* - * This is running in ioctl context so its safe - * to assume that it's the stime pending cputime - * to flush. - */ - instrumentation_begin(); - vtime_account_guest_enter(); - rcu_virt_note_context_switch(smp_processor_id()); - instrumentation_end(); -} - -static __always_inline void context_tracking_guest_exit(void) { } - -static __always_inline void guest_exit_irqoff(void) -{ instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ vtime_account_guest_exit(); instrumentation_end(); } -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ static inline void guest_exit(void) { From 1ca0016c149be35fe19a6b75fce95c25807b7159 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 17:27:34 -0700 Subject: [PATCH 1229/1379] context_tracking: KVM: Move guest enter/exit wrappers to KVM's domain Move the guest enter/exit wrappers to kvm_host.h so that KVM can manage its context tracking vs. vtime accounting without bleeding too many KVM details into the context tracking code. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210505002735.1684165-8-seanjc@google.com --- include/linux/context_tracking.h | 45 -------------------------------- include/linux/kvm_host.h | 45 ++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index aa58c2ac67ca..4d7fced3a39f 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -118,49 +118,4 @@ extern void context_tracking_init(void); static inline void context_tracking_init(void) { } #endif /* CONFIG_CONTEXT_TRACKING_FORCE */ -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) -{ - /* - * This is running in ioctl context so its safe to assume that it's the - * stime pending cputime to flush. - */ - instrumentation_begin(); - vtime_account_guest_enter(); - instrumentation_end(); - - /* - * KVM does not hold any references to rcu protected data when it - * switches CPU into a guest mode. In fact switching to a guest mode - * is very similar to exiting to userspace from rcu point of view. In - * addition CPU may stay in a guest mode for quite a long time (up to - * one time slice). Lets treat guest mode as quiescent state, just like - * we do with user-mode execution. - */ - if (!context_tracking_guest_enter()) { - instrumentation_begin(); - rcu_virt_note_context_switch(smp_processor_id()); - instrumentation_end(); - } -} - -static __always_inline void guest_exit_irqoff(void) -{ - context_tracking_guest_exit(); - - instrumentation_begin(); - /* Flush the guest cputime we spent on the guest */ - vtime_account_guest_exit(); - instrumentation_end(); -} - -static inline void guest_exit(void) -{ - unsigned long flags; - - local_irq_save(flags); - guest_exit_irqoff(); - local_irq_restore(flags); -} - #endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8895b95b6a22..2f34487e21f2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -338,6 +338,51 @@ struct kvm_vcpu { struct kvm_dirty_ring dirty_ring; }; +/* must be called with irqs disabled */ +static __always_inline void guest_enter_irqoff(void) +{ + /* + * This is running in ioctl context so its safe to assume that it's the + * stime pending cputime to flush. + */ + instrumentation_begin(); + vtime_account_guest_enter(); + instrumentation_end(); + + /* + * KVM does not hold any references to rcu protected data when it + * switches CPU into a guest mode. In fact switching to a guest mode + * is very similar to exiting to userspace from rcu point of view. In + * addition CPU may stay in a guest mode for quite a long time (up to + * one time slice). Lets treat guest mode as quiescent state, just like + * we do with user-mode execution. + */ + if (!context_tracking_guest_enter()) { + instrumentation_begin(); + rcu_virt_note_context_switch(smp_processor_id()); + instrumentation_end(); + } +} + +static __always_inline void guest_exit_irqoff(void) +{ + context_tracking_guest_exit(); + + instrumentation_begin(); + /* Flush the guest cputime we spent on the guest */ + vtime_account_guest_exit(); + instrumentation_end(); +} + +static inline void guest_exit(void) +{ + unsigned long flags; + + local_irq_save(flags); + guest_exit_irqoff(); + local_irq_restore(flags); +} + static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { /* From bc908e091b3264672889162733020048901021fb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 May 2021 17:27:35 -0700 Subject: [PATCH 1230/1379] KVM: x86: Consolidate guest enter/exit logic to common helpers Move the enter/exit logic in {svm,vmx}_vcpu_enter_exit() to common helpers. Opportunistically update the somewhat stale comment about the updates needing to occur immediately after VM-Exit. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210505002735.1684165-9-seanjc@google.com --- arch/x86/kvm/svm/svm.c | 39 ++---------------------------------- arch/x86/kvm/vmx/vmx.c | 39 ++---------------------------------- arch/x86/kvm/x86.h | 45 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 74 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c400def6220b..b649f92287a2 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3710,25 +3710,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); unsigned long vmcb_pa = svm->current_vmcb->pa; - /* - * VMENTER enables interrupts (host state), but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about - * it. This is the same logic as for exit_to_user_mode(). - * - * This ensures that e.g. latency analysis on the host observes - * guest mode as interrupt enabled. - * - * guest_enter_irqoff() informs context tracking about the - * transition to guest mode and if enabled adjusts RCU state - * accordingly. - */ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - - guest_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); + kvm_guest_enter_irqoff(); if (sev_es_guest(vcpu->kvm)) { __svm_sev_es_vcpu_run(vmcb_pa); @@ -3748,24 +3730,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) vmload(__sme_page_pa(sd->save_area)); } - /* - * VMEXIT disables interrupts (host state), but tracing and lockdep - * have them in state 'on' as recorded before entering guest mode. - * Same as enter_from_user_mode(). - * - * context_tracking_guest_exit() restores host context and reinstates - * RCU if enabled and required. - * - * This needs to be done before the below as native_read_msr() - * contains a tracepoint and x86_spec_ctrl_restore_host() calls - * into world and some more. - */ - lockdep_hardirqs_off(CALLER_ADDR0); - context_tracking_guest_exit(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); + kvm_guest_exit_irqoff(); } static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e108fb47855b..d000cddbd734 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6664,25 +6664,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - /* - * VMENTER enables interrupts (host state), but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about - * it. This is the same logic as for exit_to_user_mode(). - * - * This ensures that e.g. latency analysis on the host observes - * guest mode as interrupt enabled. - * - * guest_enter_irqoff() informs context tracking about the - * transition to guest mode and if enabled adjusts RCU state - * accordingly. - */ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - - guest_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); + kvm_guest_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6698,24 +6680,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = native_read_cr2(); - /* - * VMEXIT disables interrupts (host state), but tracing and lockdep - * have them in state 'on' as recorded before entering guest mode. - * Same as enter_from_user_mode(). - * - * context_tracking_guest_exit() restores host context and reinstates - * RCU if enabled and required. - * - * This needs to be done before the below as native_read_msr() - * contains a tracepoint and x86_spec_ctrl_restore_host() calls - * into world and some more. - */ - lockdep_hardirqs_off(CALLER_ADDR0); - context_tracking_guest_exit(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); + kvm_guest_exit_irqoff(); } static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8ddd38146525..521f74e5bbf2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,51 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +static __always_inline void kvm_guest_enter_irqoff(void) +{ + /* + * VMENTER enables interrupts (host state), but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about + * it. This is the same logic as for exit_to_user_mode(). + * + * This ensures that e.g. latency analysis on the host observes + * guest mode as interrupt enabled. + * + * guest_enter_irqoff() informs context tracking about the + * transition to guest mode and if enabled adjusts RCU state + * accordingly. + */ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); +} + +static __always_inline void kvm_guest_exit_irqoff(void) +{ + /* + * VMEXIT disables interrupts (host state), but tracing and lockdep + * have them in state 'on' as recorded before entering guest mode. + * Same as enter_from_user_mode(). + * + * context_tracking_guest_exit() restores host context and reinstates + * RCU if enabled and required. + * + * This needs to be done immediately after VM-Exit, before any code + * that might contain tracepoints or call out to the greater world, + * e.g. before x86_spec_ctrl_restore_host(). + */ + lockdep_hardirqs_off(CALLER_ADDR0); + context_tracking_guest_exit(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ ({ \ bool failed = (consistency_check); \ From d1f82808877bb10d3deee7cf3374a4eb3fb582db Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 5 May 2021 09:47:06 -0300 Subject: [PATCH 1231/1379] io_uring: truncate lengths larger than MAX_RW_COUNT on provide buffers Read and write operations are capped to MAX_RW_COUNT. Some read ops rely on that limit, and that is not guaranteed by the IORING_OP_PROVIDE_BUFFERS. Truncate those lengths when doing io_add_buffers, so buffer addresses still use the uncapped length. Also, take the chance and change struct io_buffer len member to __u32, so it matches struct io_provide_buffer len member. This fixes CVE-2021-3491, also reported as ZDI-CAN-13546. Fixes: ddf0322db79c ("io_uring: add IORING_OP_PROVIDE_BUFFERS") Reported-by: Billy Jheng Bing-Jhong (@st424204) Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7a2e83bc005d..f46acbbeed57 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -251,7 +251,7 @@ struct io_rsrc_data { struct io_buffer { struct list_head list; __u64 addr; - __s32 len; + __u32 len; __u16 bid; }; @@ -3986,7 +3986,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) break; buf->addr = addr; - buf->len = pbuf->len; + buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; addr += pbuf->len; bid++; From a5e7da1494e191c561ecce8829a6c19449585e3d Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 5 May 2021 07:37:28 +0200 Subject: [PATCH 1232/1379] MAINTAINERS: add io_uring tool to IO_URING The files in ./tools/io_uring/ are maintained by the IO_URING maintainers. Reflect that fact in MAINTAINERS. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20210505053728.3868-1-lukas.bulwahn@gmail.com Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 5c90148f0369..b680192d18b9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9497,6 +9497,7 @@ F: fs/io-wq.h F: fs/io_uring.c F: include/linux/io_uring.h F: include/uapi/linux/io_uring.h +F: tools/io_uring/ IPMI SUBSYSTEM M: Corey Minyard From 198ad973839ca4686f3575155ba9ff178289905f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 5 May 2021 22:30:49 +0200 Subject: [PATCH 1233/1379] netfilter: remove BUG_ON() after skb_header_pointer() Several conntrack helpers and the TCP tracker assume that skb_header_pointer() never fails based on upfront header validation. Even if this should not ever happen, BUG_ON() is a too drastic measure, remove them. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_ftp.c | 5 ++++- net/netfilter/nf_conntrack_h323_main.c | 3 ++- net/netfilter/nf_conntrack_irc.c | 5 ++++- net/netfilter/nf_conntrack_pptp.c | 4 +++- net/netfilter/nf_conntrack_proto_tcp.c | 6 ++++-- net/netfilter/nf_conntrack_sane.c | 5 ++++- 6 files changed, 21 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index b22801f97bce..a414274338cf 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -413,7 +413,10 @@ static int help(struct sk_buff *skb, spin_lock_bh(&nf_ftp_lock); fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer); - BUG_ON(fb_ptr == NULL); + if (!fb_ptr) { + spin_unlock_bh(&nf_ftp_lock); + return NF_ACCEPT; + } ends_in_nl = (fb_ptr[datalen - 1] == '\n'); seq = ntohl(th->seq) + datalen; diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 8ba037b76ad3..aafaff00baf1 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -146,7 +146,8 @@ static int get_tpkt_data(struct sk_buff *skb, unsigned int protoff, /* Get first TPKT pointer */ tpkt = skb_header_pointer(skb, tcpdataoff, tcpdatalen, h323_buffer); - BUG_ON(tpkt == NULL); + if (!tpkt) + goto clear_out; /* Validate TPKT identifier */ if (tcpdatalen < 4 || tpkt[0] != 0x03 || tpkt[1] != 0) { diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index e40988a2f22f..08ee4e760a3d 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -143,7 +143,10 @@ static int help(struct sk_buff *skb, unsigned int protoff, spin_lock_bh(&irc_buffer_lock); ib_ptr = skb_header_pointer(skb, dataoff, skb->len - dataoff, irc_buffer); - BUG_ON(ib_ptr == NULL); + if (!ib_ptr) { + spin_unlock_bh(&irc_buffer_lock); + return NF_ACCEPT; + } data = ib_ptr; data_limit = ib_ptr + skb->len - dataoff; diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 5105d4250012..7d5708b92138 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -544,7 +544,9 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, nexthdr_off = protoff; tcph = skb_header_pointer(skb, nexthdr_off, sizeof(_tcph), &_tcph); - BUG_ON(!tcph); + if (!tcph) + return NF_ACCEPT; + nexthdr_off += tcph->doff * 4; datalen = tcplen - tcph->doff * 4; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 318b8f723349..34e22416a721 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -338,7 +338,8 @@ static void tcp_options(const struct sk_buff *skb, ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); - BUG_ON(ptr == NULL); + if (!ptr) + return; state->td_scale = state->flags = 0; @@ -394,7 +395,8 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr), length, buff); - BUG_ON(ptr == NULL); + if (!ptr) + return; /* Fast path for timestamp-only option */ if (length == TCPOLEN_TSTAMP_ALIGNED diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 1aebd6569d4e..fcb33b1d5456 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -95,7 +95,10 @@ static int help(struct sk_buff *skb, spin_lock_bh(&nf_sane_lock); sb_ptr = skb_header_pointer(skb, dataoff, datalen, sane_buffer); - BUG_ON(sb_ptr == NULL); + if (!sb_ptr) { + spin_unlock_bh(&nf_sane_lock); + return NF_ACCEPT; + } if (dir == IP_CT_DIR_ORIGINAL) { if (datalen != sizeof(struct sane_request)) From 85dfd816fabfc16e71786eda0a33a7046688b5b0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 5 May 2021 23:06:43 +0200 Subject: [PATCH 1234/1379] netfilter: nftables: Fix a memleak from userdata error path in new objects Release object name if userdata allocation fails. Fixes: b131c96496b3 ("netfilter: nf_tables: add userdata support for nft_object") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0b7fe0a902ff..926da6ed8d51 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6615,9 +6615,9 @@ err_obj_ht: INIT_LIST_HEAD(&obj->list); return err; err_trans: - kfree(obj->key.name); -err_userdata: kfree(obj->udata); +err_userdata: + kfree(obj->key.name); err_strdup: if (obj->ops->destroy) obj->ops->destroy(&ctx, obj); From 50b7b6f29de3e18e9d6c09641256a0296361cfee Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 5 May 2021 13:03:10 +0200 Subject: [PATCH 1235/1379] x86/process: setup io_threads more like normal user space threads As io_threads are fully set up USER threads it's clearer to separate the code path from the KTHREAD logic. The only remaining difference to user space threads is that io_threads never return to user space again. Instead they loop within the given worker function. The fact that they never return to user space means they don't have an user space thread stack. In order to indicate that to tools like gdb we reset the stack and instruction pointers to 0. This allows gdb attach to user space processes using io-uring, which like means that they have io_threads, without printing worrying message like this: warning: Selected architecture i386:x86-64 is not compatible with reported target architecture i386 warning: Architecture rejected target-supplied description The output will be something like this: (gdb) info threads Id Target Id Frame * 1 LWP 4863 "io_uring-cp-for" syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38 2 LWP 4864 "iou-mgr-4863" 0x0000000000000000 in ?? () 3 LWP 4865 "iou-wrk-4863" 0x0000000000000000 in ?? () (gdb) thread 3 [Switching to thread 3 (LWP 4865)] #0 0x0000000000000000 in ?? () (gdb) bt #0 0x0000000000000000 in ?? () Backtrace stopped: Cannot access memory at address 0x0 Fixes: 4727dc20e042 ("arch: setup PF_IO_WORKER threads like PF_KTHREAD") Link: https://lore.kernel.org/io-uring/044d0bad-6888-a211-e1d3-159a4aeed52d@polymtl.ca/T/#m1bbf5727e3d4e839603f6ec7ed79c7eebfba6267 Signed-off-by: Stefan Metzmacher cc: Linus Torvalds cc: Jens Axboe cc: Andy Lutomirski cc: linux-kernel@vger.kernel.org cc: io-uring@vger.kernel.org cc: x86@kernel.org Link: https://lore.kernel.org/r/20210505110310.237537-1-metze@samba.org Reviewed-by: Thomas Gleixner Signed-off-by: Jens Axboe --- arch/x86/kernel/process.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 43cbfc84153a..5e1f38179f49 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, #endif /* Kernel thread ? */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(p->flags & PF_KTHREAD)) { memset(childregs, 0, sizeof(struct pt_regs)); kthread_frame_init(frame, sp, arg); return 0; @@ -172,6 +172,23 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, task_user_gs(p) = get_user_gs(current_pt_regs()); #endif + if (unlikely(p->flags & PF_IO_WORKER)) { + /* + * An IO thread is a user space thread, but it doesn't + * return to ret_after_fork(). + * + * In order to indicate that to tools like gdb, + * we reset the stack and instruction pointers. + * + * It does the same kernel frame setup to return to a kernel + * function that a kernel thread does. + */ + childregs->sp = 0; + childregs->ip = 0; + kthread_frame_init(frame, sp, arg); + return 0; + } + /* Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) ret = set_new_tls(p, tls); From 8bf073ca9235fe38d7b74a0b4e779cfa7cc70fc9 Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Wed, 5 May 2021 03:27:49 +0200 Subject: [PATCH 1236/1379] drm/amdgpu: Init GFX10_ADDR_CONFIG for VCN v3 in DPG mode. Otherwise tiling modes that require the values form this field (In particular _*_X) would be corrupted upon video decode. Copied from the VCN v2 code. Fixes: 99541f392b4d ("drm/amdgpu: add mc resume DPG mode for VCN3.0") Reviewed-and-Tested by: Leo Liu Signed-off-by: Bas Nieuwenhuizen Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c index 3f15bf34123a..cf165ab5dd26 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c @@ -589,6 +589,10 @@ static void vcn_v3_0_mc_resume_dpg_mode(struct amdgpu_device *adev, int inst_idx WREG32_SOC15_DPG_MODE(inst_idx, SOC15_DPG_MODE_OFFSET( VCN, inst_idx, mmUVD_VCPU_NONCACHE_SIZE0), AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_fw_shared)), 0, indirect); + + /* VCN global tiling registers */ + WREG32_SOC15_DPG_MODE(0, SOC15_DPG_MODE_OFFSET( + UVD, 0, mmUVD_GFX10_ADDR_CONFIG), adev->gfx.config.gb_addr_config, 0, indirect); } static void vcn_v3_0_disable_static_power_gating(struct amdgpu_device *adev, int inst) From 234055fd9728e6726787bc63b24b6450034876cf Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Tue, 4 May 2021 11:43:34 +0200 Subject: [PATCH 1237/1379] drm/amdgpu: Use device specific BO size & stride check. The builtin size check isn't really the right thing for AMD modifiers due to a couple of reasons: 1) In the format structs we don't do set any of the tilesize / blocks etc. to avoid having format arrays per modifier/GPU 2) The pitch on the main plane is pixel_pitch * bytes_per_pixel even for tiled ... 3) The pitch for the DCC planes is really the pixel pitch of the main surface that would be covered by it ... Note that we only handle GFX9+ case but we do this after converting the implicit modifier to an explicit modifier, so on GFX9+ all framebuffers should be checked here. There is a TODO about DCC alignment, but it isn't worse than before and I'd need to dig a bunch into the specifics. Getting this out in a reasonable timeframe to make sure it gets the appropriate testing seemed more important. Finally as I've found that debugging addfb2 failures is a pita I was generous adding explicit error messages to every failure case. Fixes: f258907fdd83 ("drm/amdgpu: Verify bo size can fit framebuffer size on init.") Tested-by: Simon Ser Signed-off-by: Bas Nieuwenhuizen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 181 +++++++++++++++++++- 1 file changed, 175 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index c4cfefb33e03..dab98a1fdcaa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -837,6 +837,174 @@ static int convert_tiling_flags_to_modifier(struct amdgpu_framebuffer *afb) return 0; } +static void get_block_dimensions(unsigned int block_log2, unsigned int cpp, + unsigned int *width, unsigned int *height) +{ + unsigned int cpp_log2 = ilog2(cpp); + unsigned int pixel_log2 = block_log2 - cpp_log2; + unsigned int width_log2 = (pixel_log2 + 1) / 2; + unsigned int height_log2 = pixel_log2 - width_log2; + + *width = 1 << width_log2; + *height = 1 << height_log2; +} + +static unsigned int get_dcc_block_size(uint64_t modifier, bool rb_aligned, + bool pipe_aligned) +{ + unsigned int ver = AMD_FMT_MOD_GET(TILE_VERSION, modifier); + + switch (ver) { + case AMD_FMT_MOD_TILE_VER_GFX9: { + /* + * TODO: for pipe aligned we may need to check the alignment of the + * total size of the surface, which may need to be bigger than the + * natural alignment due to some HW workarounds + */ + return max(10 + (rb_aligned ? (int)AMD_FMT_MOD_GET(RB, modifier) : 0), 12); + } + case AMD_FMT_MOD_TILE_VER_GFX10: + case AMD_FMT_MOD_TILE_VER_GFX10_RBPLUS: { + int pipes_log2 = AMD_FMT_MOD_GET(PIPE_XOR_BITS, modifier); + + if (ver == AMD_FMT_MOD_TILE_VER_GFX10_RBPLUS && pipes_log2 > 1 && + AMD_FMT_MOD_GET(PACKERS, modifier) == pipes_log2) + ++pipes_log2; + + return max(8 + (pipe_aligned ? pipes_log2 : 0), 12); + } + default: + return 0; + } +} + +static int amdgpu_display_verify_plane(struct amdgpu_framebuffer *rfb, int plane, + const struct drm_format_info *format, + unsigned int block_width, unsigned int block_height, + unsigned int block_size_log2) +{ + unsigned int width = rfb->base.width / + ((plane && plane < format->num_planes) ? format->hsub : 1); + unsigned int height = rfb->base.height / + ((plane && plane < format->num_planes) ? format->vsub : 1); + unsigned int cpp = plane < format->num_planes ? format->cpp[plane] : 1; + unsigned int block_pitch = block_width * cpp; + unsigned int min_pitch = ALIGN(width * cpp, block_pitch); + unsigned int block_size = 1 << block_size_log2; + uint64_t size; + + if (rfb->base.pitches[plane] % block_pitch) { + drm_dbg_kms(rfb->base.dev, + "pitch %d for plane %d is not a multiple of block pitch %d\n", + rfb->base.pitches[plane], plane, block_pitch); + return -EINVAL; + } + if (rfb->base.pitches[plane] < min_pitch) { + drm_dbg_kms(rfb->base.dev, + "pitch %d for plane %d is less than minimum pitch %d\n", + rfb->base.pitches[plane], plane, min_pitch); + return -EINVAL; + } + + /* Force at least natural alignment. */ + if (rfb->base.offsets[plane] % block_size) { + drm_dbg_kms(rfb->base.dev, + "offset 0x%x for plane %d is not a multiple of block pitch 0x%x\n", + rfb->base.offsets[plane], plane, block_size); + return -EINVAL; + } + + size = rfb->base.offsets[plane] + + (uint64_t)rfb->base.pitches[plane] / block_pitch * + block_size * DIV_ROUND_UP(height, block_height); + + if (rfb->base.obj[0]->size < size) { + drm_dbg_kms(rfb->base.dev, + "BO size 0x%zx is less than 0x%llx required for plane %d\n", + rfb->base.obj[0]->size, size, plane); + return -EINVAL; + } + + return 0; +} + + +static int amdgpu_display_verify_sizes(struct amdgpu_framebuffer *rfb) +{ + const struct drm_format_info *format_info = drm_format_info(rfb->base.format->format); + uint64_t modifier = rfb->base.modifier; + int ret; + unsigned int i, block_width, block_height, block_size_log2; + + if (!rfb->base.dev->mode_config.allow_fb_modifiers) + return 0; + + for (i = 0; i < format_info->num_planes; ++i) { + if (modifier == DRM_FORMAT_MOD_LINEAR) { + block_width = 256 / format_info->cpp[i]; + block_height = 1; + block_size_log2 = 8; + } else { + int swizzle = AMD_FMT_MOD_GET(TILE, modifier); + + switch ((swizzle & ~3) + 1) { + case DC_SW_256B_S: + block_size_log2 = 8; + break; + case DC_SW_4KB_S: + case DC_SW_4KB_S_X: + block_size_log2 = 12; + break; + case DC_SW_64KB_S: + case DC_SW_64KB_S_T: + case DC_SW_64KB_S_X: + block_size_log2 = 16; + break; + default: + drm_dbg_kms(rfb->base.dev, + "Swizzle mode with unknown block size: %d\n", swizzle); + return -EINVAL; + } + + get_block_dimensions(block_size_log2, format_info->cpp[i], + &block_width, &block_height); + } + + ret = amdgpu_display_verify_plane(rfb, i, format_info, + block_width, block_height, block_size_log2); + if (ret) + return ret; + } + + if (AMD_FMT_MOD_GET(DCC, modifier)) { + if (AMD_FMT_MOD_GET(DCC_RETILE, modifier)) { + block_size_log2 = get_dcc_block_size(modifier, false, false); + get_block_dimensions(block_size_log2 + 8, format_info->cpp[0], + &block_width, &block_height); + ret = amdgpu_display_verify_plane(rfb, i, format_info, + block_width, block_height, + block_size_log2); + if (ret) + return ret; + + ++i; + block_size_log2 = get_dcc_block_size(modifier, true, true); + } else { + bool pipe_aligned = AMD_FMT_MOD_GET(DCC_PIPE_ALIGN, modifier); + + block_size_log2 = get_dcc_block_size(modifier, true, pipe_aligned); + } + get_block_dimensions(block_size_log2 + 8, format_info->cpp[0], + &block_width, &block_height); + ret = amdgpu_display_verify_plane(rfb, i, format_info, + block_width, block_height, block_size_log2); + if (ret) + return ret; + } + + return 0; +} + static int amdgpu_display_get_fb_info(const struct amdgpu_framebuffer *amdgpu_fb, uint64_t *tiling_flags, bool *tmz_surface) { @@ -902,10 +1070,8 @@ int amdgpu_display_gem_fb_verify_and_init( int ret; rfb->base.obj[0] = obj; - - /* Verify that bo size can fit the fb size. */ - ret = drm_gem_fb_init_with_funcs(dev, &rfb->base, file_priv, mode_cmd, - &amdgpu_fb_funcs); + drm_helper_mode_fill_fb_struct(dev, &rfb->base, mode_cmd); + ret = drm_framebuffer_init(dev, &rfb->base, &amdgpu_fb_funcs); if (ret) goto err; @@ -954,9 +1120,12 @@ int amdgpu_display_framebuffer_init(struct drm_device *dev, } } - for (i = 1; i < rfb->base.format->num_planes; ++i) { + ret = amdgpu_display_verify_sizes(rfb); + if (ret) + return ret; + + for (i = 0; i < rfb->base.format->num_planes; ++i) { drm_gem_object_get(rfb->base.obj[0]); - drm_gem_object_put(rfb->base.obj[i]); rfb->base.obj[i] = rfb->base.obj[0]; } From 4cc7faa406975b460aa674606291dea197c1210c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 3 May 2021 17:49:09 +0300 Subject: [PATCH 1238/1379] can: mcp251xfd: mcp251xfd_probe(): fix an error pointer dereference in probe When we converted this code to use dev_err_probe() we accidentally removed a return. It means that if devm_clk_get() it will lead to an Oops when we call clk_get_rate() on the next line. Fixes: cf8ee6de2543 ("can: mcp251xfd: mcp251xfd_probe(): use dev_err_probe() to simplify error handling") Link: https://lore.kernel.org/r/YJANZf13Qxd5Mhr1@mwanda Signed-off-by: Dan Carpenter Reviewed-by: Manivannan Sadhasivam Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 970dc570e7a5..f122976e90da 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -2885,8 +2885,8 @@ static int mcp251xfd_probe(struct spi_device *spi) clk = devm_clk_get(&spi->dev, NULL); if (IS_ERR(clk)) - dev_err_probe(&spi->dev, PTR_ERR(clk), - "Failed to get Oscillator (clock)!\n"); + return dev_err_probe(&spi->dev, PTR_ERR(clk), + "Failed to get Oscillator (clock)!\n"); freq = clk_get_rate(clk); /* Sanity check */ From 4376ea42db8bfcac2bc3a30bba93917244a8c2d4 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Sun, 2 May 2021 11:34:34 +0200 Subject: [PATCH 1239/1379] can: mcp251xfd: mcp251xfd_probe(): add missing can_rx_offload_del() in error path This patch adds the missing can_rx_offload_del(), that must be called if mcp251xfd_register() fails. Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN") Link: https://lore.kernel.org/r/20210504091838.1109047-1-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index f122976e90da..e0ae00e34c7b 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -2986,10 +2986,12 @@ static int mcp251xfd_probe(struct spi_device *spi) err = mcp251xfd_register(priv); if (err) - goto out_free_candev; + goto out_can_rx_offload_del; return 0; + out_can_rx_offload_del: + can_rx_offload_del(&priv->offload); out_free_candev: spi->max_speed_hz = priv->spi_max_speed_hz_orig; From 03c427147b2d3e503af258711af4fc792b89b0af Mon Sep 17 00:00:00 2001 From: Frieder Schrempf Date: Wed, 5 May 2021 09:14:15 +0200 Subject: [PATCH 1240/1379] can: mcp251x: fix resume from sleep before interface was brought up Since 8ce8c0abcba3 the driver queues work via priv->restart_work when resuming after suspend, even when the interface was not previously enabled. This causes a null dereference error as the workqueue is only allocated and initialized in mcp251x_open(). To fix this we move the workqueue init to mcp251x_can_probe() as there is no reason to do it later and repeat it whenever mcp251x_open() is called. Fixes: 8ce8c0abcba3 ("can: mcp251x: only reset hardware as required") Link: https://lore.kernel.org/r/17d5d714-b468-482f-f37a-482e3d6df84e@kontron.de Signed-off-by: Frieder Schrempf Reviewed-by: Andy Shevchenko [mkl: fix error handling in mcp251x_stop()] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251x.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/net/can/spi/mcp251x.c b/drivers/net/can/spi/mcp251x.c index 492f1bcb0516..173c6614086f 100644 --- a/drivers/net/can/spi/mcp251x.c +++ b/drivers/net/can/spi/mcp251x.c @@ -956,8 +956,6 @@ static int mcp251x_stop(struct net_device *net) priv->force_quit = 1; free_irq(spi->irq, priv); - destroy_workqueue(priv->wq); - priv->wq = NULL; mutex_lock(&priv->mcp_lock); @@ -1224,24 +1222,15 @@ static int mcp251x_open(struct net_device *net) goto out_close; } - priv->wq = alloc_workqueue("mcp251x_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, - 0); - if (!priv->wq) { - ret = -ENOMEM; - goto out_clean; - } - INIT_WORK(&priv->tx_work, mcp251x_tx_work_handler); - INIT_WORK(&priv->restart_work, mcp251x_restart_work_handler); - ret = mcp251x_hw_wake(spi); if (ret) - goto out_free_wq; + goto out_free_irq; ret = mcp251x_setup(net, spi); if (ret) - goto out_free_wq; + goto out_free_irq; ret = mcp251x_set_normal_mode(spi); if (ret) - goto out_free_wq; + goto out_free_irq; can_led_event(net, CAN_LED_EVENT_OPEN); @@ -1250,9 +1239,7 @@ static int mcp251x_open(struct net_device *net) return 0; -out_free_wq: - destroy_workqueue(priv->wq); -out_clean: +out_free_irq: free_irq(spi->irq, priv); mcp251x_hw_sleep(spi); out_close: @@ -1373,6 +1360,15 @@ static int mcp251x_can_probe(struct spi_device *spi) if (ret) goto out_clk; + priv->wq = alloc_workqueue("mcp251x_wq", WQ_FREEZABLE | WQ_MEM_RECLAIM, + 0); + if (!priv->wq) { + ret = -ENOMEM; + goto out_clk; + } + INIT_WORK(&priv->tx_work, mcp251x_tx_work_handler); + INIT_WORK(&priv->restart_work, mcp251x_restart_work_handler); + priv->spi = spi; mutex_init(&priv->mcp_lock); @@ -1417,6 +1413,8 @@ static int mcp251x_can_probe(struct spi_device *spi) return 0; error_probe: + destroy_workqueue(priv->wq); + priv->wq = NULL; mcp251x_power_enable(priv->power, 0); out_clk: @@ -1438,6 +1436,9 @@ static int mcp251x_can_remove(struct spi_device *spi) mcp251x_power_enable(priv->power, 0); + destroy_workqueue(priv->wq); + priv->wq = NULL; + clk_disable_unprepare(priv->clk); free_candev(net); From e04b2cfe61072c7966e1a5fb73dd1feb30c206ed Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 5 May 2021 13:32:27 +0200 Subject: [PATCH 1241/1379] can: m_can: m_can_tx_work_queue(): fix tx_skb race condition The m_can_start_xmit() function checks if the cdev->tx_skb is NULL and returns with NETDEV_TX_BUSY in case tx_sbk is not NULL. There is a race condition in the m_can_tx_work_queue(), where first the skb is send to the driver and then the case tx_sbk is set to NULL. A TX complete IRQ might come in between and wake the queue, which results in tx_skb not being cleared yet. Fixes: f524f829b75a ("can: m_can: Create a m_can platform framework") Tested-by: Torin Cooper-Bennun Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index 34073cd077e4..3cf6de21d19c 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1562,6 +1562,8 @@ static netdev_tx_t m_can_tx_handler(struct m_can_classdev *cdev) int i; int putidx; + cdev->tx_skb = NULL; + /* Generate ID field for TX buffer Element */ /* Common to all supported M_CAN versions */ if (cf->can_id & CAN_EFF_FLAG) { @@ -1678,7 +1680,6 @@ static void m_can_tx_work_queue(struct work_struct *ws) tx_work); m_can_tx_handler(cdev); - cdev->tx_skb = NULL; } static netdev_tx_t m_can_start_xmit(struct sk_buff *skb, From f48652bbe3ae62ba2835a396b7e01f063e51c4cd Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Tue, 4 May 2021 15:39:17 +0800 Subject: [PATCH 1242/1379] ALSA: hda: generic: change the DAC ctl name for LO+SPK or LO+HP Without this change, the DAC ctl's name could be changed only when the machine has both Speaker and Headphone, but we met some machines which only has Lineout and Headhpone, and the Lineout and Headphone share the Audio Mixer0 and DAC0, the ctl's name is set to "Front". On most of machines, the "Front" is used for Speaker only or Lineout only, but on this machine it is shared by Lineout and Headphone, This introduces an issue in the pipewire and pulseaudio, suppose users want the Headphone to be on and the Speaker/Lineout to be off, they could turn off the "Front", this works on most of the machines, but on this machine, the "Front" couldn't be turned off otherwise the headphone will be off too. Here we do some change to let the ctl's name change to "Headphone+LO" on this machine, and pipewire and pulseaudio already could handle "Headphone+LO" and "Speaker+LO". (https://gitlab.freedesktop.org/pipewire/pipewire/-/issues/747) BugLink: http://bugs.launchpad.net/bugs/804178 Signed-off-by: Hui Wang Link: https://lore.kernel.org/r/20210504073917.22406-1-hui.wang@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_generic.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index 3998e1771805..b638fc2ef6f7 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -1204,11 +1204,17 @@ static const char *get_line_out_pfx(struct hda_codec *codec, int ch, *index = ch; return "Headphone"; case AUTO_PIN_LINE_OUT: - /* This deals with the case where we have two DACs and - * one LO, one HP and one Speaker */ - if (!ch && cfg->speaker_outs && cfg->hp_outs) { - bool hp_lo_shared = !path_has_mixer(codec, spec->hp_paths[0], ctl_type); - bool spk_lo_shared = !path_has_mixer(codec, spec->speaker_paths[0], ctl_type); + /* This deals with the case where one HP or one Speaker or + * one HP + one Speaker need to share the DAC with LO + */ + if (!ch) { + bool hp_lo_shared = false, spk_lo_shared = false; + + if (cfg->speaker_outs) + spk_lo_shared = !path_has_mixer(codec, + spec->speaker_paths[0], ctl_type); + if (cfg->hp_outs) + hp_lo_shared = !path_has_mixer(codec, spec->hp_paths[0], ctl_type); if (hp_lo_shared && spk_lo_shared) return spec->vmaster_mute.hook ? "PCM" : "Master"; if (hp_lo_shared) From c76fba33467b96b8234a1bbef852cd257c0dca69 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Thu, 6 May 2021 13:54:22 +0800 Subject: [PATCH 1243/1379] arm64: kernel: Update the stale comment Commit af391b15f7b5 ("arm64: kernel: rename __cpu_suspend to keep it aligned with arm") has used @index instead of @arg, but the comment is stale, update it. Cc: Sudeep Holla Cc: Will Deacon Signed-off-by: Shaokun Zhang Reviewed-by: Sudeep Holla Link: https://lore.kernel.org/r/1620280462-21937-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpuidle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index b512b5503f6e..03991eeff643 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -29,7 +29,7 @@ int arm_cpuidle_init(unsigned int cpu) /** * arm_cpuidle_suspend() - function to enter a low-power idle state - * @arg: argument to pass to CPU suspend operations + * @index: argument to pass to CPU suspend operations * * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU * operations back-end error code otherwise. From 19987fdad506515a92b3c430076cbdb329a11aee Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 4 May 2021 22:53:43 +1200 Subject: [PATCH 1244/1379] sched,doc: sched_debug_verbose cmdline should be sched_verbose The cmdline should include sched_verbose but not sched_debug_verbose as sched_debug_verbose is only the variant name in code. Fixes: 9406415f46 ("sched/debug: Rename the sched_debug parameter to sched_verbose") Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210504105344.31344-1-song.bao.hua@hisilicon.com --- Documentation/scheduler/sched-domains.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 14ea2f21d20e..84dcdcd2911c 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -74,7 +74,7 @@ for a given topology level by creating a sched_domain_topology_level array and calling set_sched_topology() with this array as the parameter. The sched-domains debugging infrastructure can be enabled by enabling -CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you +CONFIG_SCHED_DEBUG and adding 'sched_verbose' to your cmdline. If you forgot to tweak your cmdline, you can also flip the /sys/kernel/debug/sched/verbose knob. This enables an error checking parse of the sched domains which should catch most possible errors (described above). It From d583d360a620e6229422b3455d0be082b8255f5e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 3 May 2021 13:49:17 -0400 Subject: [PATCH 1245/1379] psi: Fix psi state corruption when schedule() races with cgroup move 4117cebf1a9f ("psi: Optimize task switch inside shared cgroups") introduced a race condition that corrupts internal psi state. This manifests as kernel warnings, sometimes followed by bogusly high IO pressure: psi: task underflow! cpu=1 t=2 tasks=[0 0 0 0] clear=c set=0 (schedule() decreasing RUNNING and ONCPU, both of which are 0) psi: incosistent task state! task=2412744:systemd cpu=17 psi_flags=e clear=3 set=0 (cgroup_move_task() clearing MEMSTALL and IOWAIT, but task is MEMSTALL | RUNNING | ONCPU) What the offending commit does is batch the two psi callbacks in schedule() to reduce the number of cgroup tree updates. When prev is deactivated and removed from the runqueue, nothing is done in psi at first; when the task switch completes, TSK_RUNNING and TSK_IOWAIT are updated along with TSK_ONCPU. However, the deactivation and the task switch inside schedule() aren't atomic: pick_next_task() may drop the rq lock for load balancing. When this happens, cgroup_move_task() can run after the task has been physically dequeued, but the psi updates are still pending. Since it looks at the task's scheduler state, it doesn't move everything to the new cgroup that the task switch that follows is about to clear from it. cgroup_move_task() will leak the TSK_RUNNING count in the old cgroup, and psi_sched_switch() will underflow it in the new cgroup. A similar thing can happen for iowait. TSK_IOWAIT is usually set when a p->in_iowait task is dequeued, but again this update is deferred to the switch. cgroup_move_task() can see an unqueued p->in_iowait task and move a non-existent TSK_IOWAIT. This results in the inconsistent task state warning, as well as a counter underflow that will result in permanent IO ghost pressure being reported. Fix this bug by making cgroup_move_task() use task->psi_flags instead of looking at the potentially mismatching scheduler state. [ We used the scheduler state historically in order to not rely on task->psi_flags for anything but debugging. But that ship has sailed anyway, and this is simpler and more robust. We previously already batched TSK_ONCPU clearing with the TSK_RUNNING update inside the deactivation call from schedule(). But that ordering was safe and didn't result in TSK_ONCPU corruption: unlike most places in the scheduler, cgroup_move_task() only checked task_current() and handled TSK_ONCPU if the task was still queued. ] Fixes: 4117cebf1a9f ("psi: Optimize task switch inside shared cgroups") Signed-off-by: Johannes Weiner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210503174917.38579-1-hannes@cmpxchg.org --- kernel/sched/psi.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index db27b69fa92a..cc25a3cff41f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -972,7 +972,7 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - unsigned int task_flags = 0; + unsigned int task_flags; struct rq_flags rf; struct rq *rq; @@ -987,15 +987,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) rq = task_rq_lock(task, &rf); - if (task_on_rq_queued(task)) { - task_flags = TSK_RUNNING; - if (task_current(rq, task)) - task_flags |= TSK_ONCPU; - } else if (task->in_iowait) - task_flags = TSK_IOWAIT; - - if (task->in_memstall) - task_flags |= TSK_MEMSTALL; + /* + * We may race with schedule() dropping the rq lock between + * deactivating prev and switching to next. Because the psi + * updates from the deactivation are deferred to the switch + * callback to save cgroup tree updates, the task's scheduling + * state here is not coherent with its psi state: + * + * schedule() cgroup_move_task() + * rq_lock() + * deactivate_task() + * p->on_rq = 0 + * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates + * pick_next_task() + * rq_unlock() + * rq_lock() + * psi_task_change() // old cgroup + * task->cgroups = to + * psi_task_change() // new cgroup + * rq_unlock() + * rq_lock() + * psi_sched_switch() // does deferred updates in new cgroup + * + * Don't rely on the scheduling state. Use psi_flags instead. + */ + task_flags = task->psi_flags; if (task_flags) psi_task_change(task, task_flags, 0); From 6d2f8909a5fabb73fe2a63918117943986c39b6c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 30 Apr 2021 15:14:12 +0000 Subject: [PATCH 1246/1379] sched: Fix out-of-bound access in uclamp Util-clamp places tasks in different buckets based on their clamp values for performance reasons. However, the size of buckets is currently computed using a rounding division, which can lead to an off-by-one error in some configurations. For instance, with 20 buckets, the bucket size will be 1024/20=51. A task with a clamp of 1024 will be mapped to bucket id 1024/51=20. Sadly, correct indexes are in range [0,19], hence leading to an out of bound memory access. Clamp the bucket id to fix the issue. Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Suggested-by: Qais Yousef Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20210430151412.160913-1-qperret@google.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9143163fa678..5226cc26a095 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -938,7 +938,7 @@ DEFINE_STATIC_KEY_FALSE(sched_uclamp_used); static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) { - return clamp_value / UCLAMP_BUCKET_DELTA; + return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } static inline unsigned int uclamp_none(enum uclamp_id clamp_id) From 0258bdfaff5bd13c4d2383150b7097aecd6b6d82 Mon Sep 17 00:00:00 2001 From: Odin Ugedal Date: Sat, 1 May 2021 16:19:50 +0200 Subject: [PATCH 1247/1379] sched/fair: Fix unfairness caused by missing load decay This fixes an issue where old load on a cfs_rq is not properly decayed, resulting in strange behavior where fairness can decrease drastically. Real workloads with equally weighted control groups have ended up getting a respective 99% and 1%(!!) of cpu time. When an idle task is attached to a cfs_rq by attaching a pid to a cgroup, the old load of the task is attached to the new cfs_rq and sched_entity by attach_entity_cfs_rq. If the task is then moved to another cpu (and therefore cfs_rq) before being enqueued/woken up, the load will be moved to cfs_rq->removed from the sched_entity. Such a move will happen when enforcing a cpuset on the task (eg. via a cgroup) that force it to move. The load will however not be removed from the task_group itself, making it look like there is a constant load on that cfs_rq. This causes the vruntime of tasks on other sibling cfs_rq's to increase faster than they are supposed to; causing severe fairness issues. If no other task is started on the given cfs_rq, and due to the cpuset it would not happen, this load would never be properly unloaded. With this patch the load will be properly removed inside update_blocked_averages. This also applies to tasks moved to the fair scheduling class and moved to another cpu, and this path will also fix that. For fork, the entity is queued right away, so this problem does not affect that. This applies to cases where the new process is the first in the cfs_rq, issue introduced 3d30544f0212 ("sched/fair: Apply more PELT fixes"), and when there has previously been load on the cgroup but the cgroup was removed from the leaflist due to having null PELT load, indroduced in 039ae8bcf7a5 ("sched/fair: Fix O(nr_cgroups) in the load balancing path"). For a simple cgroup hierarchy (as seen below) with two equally weighted groups, that in theory should get 50/50 of cpu time each, it often leads to a load of 60/40 or 70/30. parent/ cg-1/ cpu.weight: 100 cpuset.cpus: 1 cg-2/ cpu.weight: 100 cpuset.cpus: 1 If the hierarchy is deeper (as seen below), while keeping cg-1 and cg-2 equally weighted, they should still get a 50/50 balance of cpu time. This however sometimes results in a balance of 10/90 or 1/99(!!) between the task groups. $ ps u -C stress USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND root 18568 1.1 0.0 3684 100 pts/12 R+ 13:36 0:00 stress --cpu 1 root 18580 99.3 0.0 3684 100 pts/12 R+ 13:36 0:09 stress --cpu 1 parent/ cg-1/ cpu.weight: 100 sub-group/ cpu.weight: 1 cpuset.cpus: 1 cg-2/ cpu.weight: 100 sub-group/ cpu.weight: 10000 cpuset.cpus: 1 This can be reproduced by attaching an idle process to a cgroup and moving it to a given cpuset before it wakes up. The issue is evident in many (if not most) container runtimes, and has been reproduced with both crun and runc (and therefore docker and all its "derivatives"), and with both cgroup v1 and v2. Fixes: 3d30544f0212 ("sched/fair: Apply more PELT fixes") Fixes: 039ae8bcf7a5 ("sched/fair: Fix O(nr_cgroups) in the load balancing path") Signed-off-by: Odin Ugedal Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210501141950.23622-2-odin@uged.al --- kernel/sched/fair.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1d75af1ecfb4..20aa234ffe04 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10878,16 +10878,22 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq; + list_add_leaf_cfs_rq(cfs_rq_of(se)); + /* Start to propagate at parent */ se = se->parent; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - if (cfs_rq_throttled(cfs_rq)) - break; + if (!cfs_rq_throttled(cfs_rq)){ + update_load_avg(cfs_rq, se, UPDATE_TG); + list_add_leaf_cfs_rq(cfs_rq); + continue; + } - update_load_avg(cfs_rq, se, UPDATE_TG); + if (list_add_leaf_cfs_rq(cfs_rq)) + break; } } #else From e10de314287c2c14b0e6f0e3e961975ce2f4a83d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 4 May 2021 01:52:36 -0500 Subject: [PATCH 1248/1379] x86/events/amd/iommu: Fix invalid Perf result due to IOMMU PMC power-gating On certain AMD platforms, when the IOMMU performance counter source (csource) field is zero, power-gating for the counter is enabled, which prevents write access and returns zero for read access. This can cause invalid perf result especially when event multiplexing is needed (i.e. more number of events than available counters) since the current logic keeps track of the previously read counter value, and subsequently re-program the counter to continue counting the event. With power-gating enabled, we cannot gurantee successful re-programming of the counter. Workaround this issue by : 1. Modifying the ordering of setting/reading counters and enabing/ disabling csources to only access the counter when the csource is set to non-zero. 2. Since AMD IOMMU PMU does not support interrupt mode, the logic can be simplified to always start counting with value zero, and accumulate the counter value when stopping without the need to keep track and reprogram the counter with the previously read counter value. This has been tested on systems with and without power-gating. Fixes: 994d6608efe4 ("iommu/amd: Remove performance counter pre-initialization test") Suggested-by: Alexander Monakov Signed-off-by: Suravee Suthikulpanit Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210504065236.4415-1-suravee.suthikulpanit@amd.com --- arch/x86/events/amd/iommu.c | 47 ++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 6a98a7651621..2da6139b0977 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -18,8 +18,6 @@ #include "../perf_event.h" #include "iommu.h" -#define COUNTER_SHIFT 16 - /* iommu pmu conf masks */ #define GET_CSOURCE(x) ((x)->conf & 0xFFULL) #define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL) @@ -285,22 +283,31 @@ static void perf_iommu_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; + /* + * To account for power-gating, which prevents write to + * the counter, we need to enable the counter + * before setting up counter register. + */ + perf_iommu_enable_event(event); + if (flags & PERF_EF_RELOAD) { - u64 prev_raw_count = local64_read(&hwc->prev_count); + u64 count = 0; struct amd_iommu *iommu = perf_event_2_iommu(event); + /* + * Since the IOMMU PMU only support counting mode, + * the counter always start with value zero. + */ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, - IOMMU_PC_COUNTER_REG, &prev_raw_count); + IOMMU_PC_COUNTER_REG, &count); } - perf_iommu_enable_event(event); perf_event_update_userpage(event); - } static void perf_iommu_read(struct perf_event *event) { - u64 count, prev, delta; + u64 count; struct hw_perf_event *hwc = &event->hw; struct amd_iommu *iommu = perf_event_2_iommu(event); @@ -311,14 +318,11 @@ static void perf_iommu_read(struct perf_event *event) /* IOMMU pc counter register is only 48 bits */ count &= GENMASK_ULL(47, 0); - prev = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev) - return; - - /* Handle 48-bit counter overflow */ - delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); - delta >>= COUNTER_SHIFT; - local64_add(delta, &event->count); + /* + * Since the counter always start with value zero, + * simply just accumulate the count for the event. + */ + local64_add(count, &event->count); } static void perf_iommu_stop(struct perf_event *event, int flags) @@ -328,15 +332,16 @@ static void perf_iommu_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) return; + /* + * To account for power-gating, in which reading the counter would + * return zero, we need to read the register before disabling. + */ + perf_iommu_read(event); + hwc->state |= PERF_HES_UPTODATE; + perf_iommu_disable_event(event); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; - - if (hwc->state & PERF_HES_UPTODATE) - return; - - perf_iommu_read(event); - hwc->state |= PERF_HES_UPTODATE; } static int perf_iommu_add(struct perf_event *event, int flags) From 1139aeb1c521eb4a050920ce6c64c36c4f2a3ab7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 5 May 2021 23:12:42 +0200 Subject: [PATCH 1249/1379] smp: Fix smp_call_function_single_async prototype As of commit 966a967116e6 ("smp: Avoid using two cache lines for struct call_single_data"), the smp code prefers 32-byte aligned call_single_data objects for performance reasons, but the block layer includes an instance of this structure in the main 'struct request' that is more senstive to size than to performance here, see 4ccafe032005 ("block: unalign call_single_data in struct request"). The result is a violation of the calling conventions that clang correctly points out: block/blk-mq.c:630:39: warning: passing 8-byte aligned argument to 32-byte aligned parameter 2 of 'smp_call_function_single_async' may result in an unaligned pointer access [-Walign-mismatch] smp_call_function_single_async(cpu, &rq->csd); It does seem that the usage of the call_single_data without cache line alignment should still be allowed by the smp code, so just change the function prototype so it accepts both, but leave the default alignment unchanged for the other users. This seems better to me than adding a local hack to shut up an otherwise correct warning in the caller. Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jens Axboe Link: https://lkml.kernel.org/r/20210505211300.3174456-1-arnd@kernel.org --- include/linux/smp.h | 2 +- kernel/smp.c | 26 +++++++++++++------------- kernel/up.c | 2 +- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index 84a0b4828f66..f0d3ef654207 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -53,7 +53,7 @@ int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask); -int smp_call_function_single_async(int cpu, call_single_data_t *csd); +int smp_call_function_single_async(int cpu, struct __call_single_data *csd); /* * Call a function on all processors diff --git a/kernel/smp.c b/kernel/smp.c index e21074900006..52bf159ec400 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -211,7 +211,7 @@ static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type) } while (0) /* Record current CSD work for current CPU, NULL to erase. */ -static void __csd_lock_record(call_single_data_t *csd) +static void __csd_lock_record(struct __call_single_data *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ @@ -226,13 +226,13 @@ static void __csd_lock_record(call_single_data_t *csd) /* Or before unlock, as the case may be. */ } -static __always_inline void csd_lock_record(call_single_data_t *csd) +static __always_inline void csd_lock_record(struct __call_single_data *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } -static int csd_lock_wait_getcpu(call_single_data_t *csd) +static int csd_lock_wait_getcpu(struct __call_single_data *csd) { unsigned int csd_type; @@ -282,7 +282,7 @@ static const char *csd_lock_get_type(unsigned int type) return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type]; } -static void csd_lock_print_extended(call_single_data_t *csd, int cpu) +static void csd_lock_print_extended(struct __call_single_data *csd, int cpu) { struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu); unsigned int srccpu = csd->node.src; @@ -321,7 +321,7 @@ static void csd_lock_print_extended(call_single_data_t *csd, int cpu) * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id) { int cpu = -1; int cpux; @@ -387,7 +387,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void __csd_lock_wait(call_single_data_t *csd) +static void __csd_lock_wait(struct __call_single_data *csd) { int bug_id = 0; u64 ts0, ts1; @@ -401,7 +401,7 @@ static void __csd_lock_wait(call_single_data_t *csd) smp_acquire__after_ctrl_dep(); } -static __always_inline void csd_lock_wait(call_single_data_t *csd) +static __always_inline void csd_lock_wait(struct __call_single_data *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); @@ -431,17 +431,17 @@ static void __smp_call_single_queue_debug(int cpu, struct llist_node *node) #else #define cfd_seq_store(var, src, dst, type) -static void csd_lock_record(call_single_data_t *csd) +static void csd_lock_record(struct __call_single_data *csd) { } -static __always_inline void csd_lock_wait(call_single_data_t *csd) +static __always_inline void csd_lock_wait(struct __call_single_data *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif -static __always_inline void csd_lock(call_single_data_t *csd) +static __always_inline void csd_lock(struct __call_single_data *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; @@ -454,7 +454,7 @@ static __always_inline void csd_lock(call_single_data_t *csd) smp_wmb(); } -static __always_inline void csd_unlock(call_single_data_t *csd) +static __always_inline void csd_unlock(struct __call_single_data *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); @@ -501,7 +501,7 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, call_single_data_t *csd) +static int generic_exec_single(int cpu, struct __call_single_data *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; @@ -784,7 +784,7 @@ EXPORT_SYMBOL(smp_call_function_single); * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. */ -int smp_call_function_single_async(int cpu, call_single_data_t *csd) +int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { int err = 0; diff --git a/kernel/up.c b/kernel/up.c index bf20b4a9af60..c7321307923a 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, call_single_data_t *csd) +int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { unsigned long flags; From 28ce0e70ecc30cc7d558a0304e6b816d70848f9a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 26 Apr 2021 14:50:17 -0400 Subject: [PATCH 1250/1379] locking/qrwlock: Cleanup queued_write_lock_slowpath() Make the code more readable by replacing the atomic_cmpxchg_acquire() by an equivalent atomic_try_cmpxchg_acquire() and change atomic_add() to atomic_or(). For architectures that use qrwlock, I do not find one that has an atomic_add() defined but not an atomic_or(). I guess it should be fine by changing atomic_add() to atomic_or(). Note that the previous use of atomic_add() isn't wrong as only one writer that is the wait_lock owner can set the waiting flag and the flag will be cleared later on when acquiring the write lock. Suggested-by: Linus Torvalds Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20210426185017.19815-1-longman@redhat.com --- kernel/locking/qrwlock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index b94f3831e963..ec36b73f4733 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -66,12 +66,12 @@ void queued_write_lock_slowpath(struct qrwlock *lock) arch_spin_lock(&lock->wait_lock); /* Try to acquire the lock directly if no reader is present */ - if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) + if (!(cnts = atomic_read(&lock->cnts)) && + atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)) goto unlock; /* Set the waiting flag to notify readers that a writer is pending */ - atomic_add(_QW_WAITING, &lock->cnts); + atomic_or(_QW_WAITING, &lock->cnts); /* When no more readers or writers, set the locked flag */ do { From cf7b39a0cbf6bf57aa07a008d46cf695add05b4c Mon Sep 17 00:00:00 2001 From: yangerkun Date: Thu, 1 Apr 2021 15:18:07 +0800 Subject: [PATCH 1251/1379] block: reexpand iov_iter after read/write We get a bug: BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139 Read of size 8 at addr ffff0000d3fb11f8 by task CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted 5.10.0-00843-g352c8610ccd2 #2 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x2d0 arch/arm64/kernel/stacktrace.c:132 show_stack+0x28/0x34 arch/arm64/kernel/stacktrace.c:196 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x110/0x164 lib/dump_stack.c:118 print_address_description+0x78/0x5c8 mm/kasan/report.c:385 __kasan_report mm/kasan/report.c:545 [inline] kasan_report+0x148/0x1e4 mm/kasan/report.c:562 check_memory_region_inline mm/kasan/generic.c:183 [inline] __asan_load8+0xb4/0xbc mm/kasan/generic.c:252 iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139 io_read fs/io_uring.c:3421 [inline] io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943 __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260 io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326 io_submit_sqe fs/io_uring.c:6395 [inline] io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624 __do_sys_io_uring_enter fs/io_uring.c:9013 [inline] __se_sys_io_uring_enter fs/io_uring.c:8960 [inline] __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline] invoke_syscall arch/arm64/kernel/syscall.c:48 [inline] el0_svc_common arch/arm64/kernel/syscall.c:158 [inline] do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227 el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367 el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383 el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670 Allocated by task 12570: stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121 kasan_save_stack mm/kasan/common.c:48 [inline] kasan_set_track mm/kasan/common.c:56 [inline] __kasan_kmalloc+0xdc/0x120 mm/kasan/common.c:461 kasan_kmalloc+0xc/0x14 mm/kasan/common.c:475 __kmalloc+0x23c/0x334 mm/slub.c:3970 kmalloc include/linux/slab.h:557 [inline] __io_alloc_async_data+0x68/0x9c fs/io_uring.c:3210 io_setup_async_rw fs/io_uring.c:3229 [inline] io_read fs/io_uring.c:3436 [inline] io_issue_sqe+0x2954/0x2d64 fs/io_uring.c:5943 __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260 io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326 io_submit_sqe fs/io_uring.c:6395 [inline] io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624 __do_sys_io_uring_enter fs/io_uring.c:9013 [inline] __se_sys_io_uring_enter fs/io_uring.c:8960 [inline] __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline] invoke_syscall arch/arm64/kernel/syscall.c:48 [inline] el0_svc_common arch/arm64/kernel/syscall.c:158 [inline] do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227 el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367 el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383 el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670 Freed by task 12570: stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121 kasan_save_stack mm/kasan/common.c:48 [inline] kasan_set_track+0x38/0x6c mm/kasan/common.c:56 kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:355 __kasan_slab_free+0x124/0x150 mm/kasan/common.c:422 kasan_slab_free+0x10/0x1c mm/kasan/common.c:431 slab_free_hook mm/slub.c:1544 [inline] slab_free_freelist_hook mm/slub.c:1577 [inline] slab_free mm/slub.c:3142 [inline] kfree+0x104/0x38c mm/slub.c:4124 io_dismantle_req fs/io_uring.c:1855 [inline] __io_free_req+0x70/0x254 fs/io_uring.c:1867 io_put_req_find_next fs/io_uring.c:2173 [inline] __io_queue_sqe+0x1fc/0x520 fs/io_uring.c:6279 __io_req_task_submit+0x154/0x21c fs/io_uring.c:2051 io_req_task_submit+0x2c/0x44 fs/io_uring.c:2063 task_work_run+0xdc/0x128 kernel/task_work.c:151 get_signal+0x6f8/0x980 kernel/signal.c:2562 do_signal+0x108/0x3a4 arch/arm64/kernel/signal.c:658 do_notify_resume+0xbc/0x25c arch/arm64/kernel/signal.c:722 work_pending+0xc/0x180 blkdev_read_iter can truncate iov_iter's count since the count + pos may exceed the size of the blkdev. This will confuse io_read that we have consume the iovec. And once we do the iov_iter_revert in io_read, we will trigger the slab-out-of-bounds. Fix it by reexpand the count with size has been truncated. blkdev_write_iter can trigger the problem too. Signed-off-by: yangerkun Acked-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210401071807.3328235-1-yangerkun@huawei.com Signed-off-by: Jens Axboe --- fs/block_dev.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index a5244e08b6c8..eb265d72fce8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1677,6 +1677,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; + size_t shorted = 0; ssize_t ret; if (bdev_read_only(I_BDEV(bd_inode))) @@ -1694,12 +1695,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; - iov_iter_truncate(from, size - iocb->ki_pos); + size -= iocb->ki_pos; + if (iov_iter_count(from) > size) { + shorted = iov_iter_count(from) - size; + iov_iter_truncate(from, size); + } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); if (ret > 0) ret = generic_write_sync(iocb, ret); + iov_iter_reexpand(from, iov_iter_count(from) + shorted); blk_finish_plug(&plug); return ret; } @@ -1711,13 +1717,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; + size_t shorted = 0; + ssize_t ret; if (pos >= size) return 0; size -= pos; - iov_iter_truncate(to, size); - return generic_file_read_iter(iocb, to); + if (iov_iter_count(to) > size) { + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); + } + + ret = generic_file_read_iter(iocb, to); + iov_iter_reexpand(to, iov_iter_count(to) + shorted); + return ret; } EXPORT_SYMBOL_GPL(blkdev_read_iter); From 8db6f937f4e76d9dd23795311fc14f0a5c0ac119 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 29 Apr 2021 17:05:00 +0200 Subject: [PATCH 1252/1379] riscv: Only extend kernel reservation if mapped read-only When the kernel mapping was moved outside of the linear mapping, the kernel memory reservation was increased, to take into account mapping granularity. However, this is done unconditionally, regardless of whether the kernel memory is mapped read-only or not. If this extension is not needed, up to 2 MiB may be lost, which has a big impact on e.g. Canaan K210 (64-bit nommu) platforms with only 8 MiB of RAM. Reclaim the lost memory by only extending the reserved region when needed, i.e. depending on a simplified version of the conditional logic around the call to protect_kernel_linear_mapping_text_rodata(). Fixes: 2bfc6cd81bd17e43 ("riscv: Move kernel mapping outside of linear mapping") Signed-off-by: Geert Uytterhoeven Tested-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index dfb5e4f7a670..1ec98d97b67f 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -135,11 +135,16 @@ void __init setup_bootmem(void) /* * Reserve from the start of the kernel to the end of the kernel - * and make sure we align the reservation on PMD_SIZE since we will + */ +#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX) + /* + * Make sure we align the reservation on PMD_SIZE since we will * map the kernel in the linear mapping as read-only: we do not want * any allocation to happen between _end and the next pmd aligned page. */ - memblock_reserve(vmlinux_start, (vmlinux_end - vmlinux_start + PMD_SIZE - 1) & PMD_MASK); + vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK; +#endif + memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); /* * memblock allocator is not aware of the fact that last 4K bytes of From 0e0d4992517fba81ecbceb5b71d2851f1208a02b Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 29 Apr 2021 00:58:36 -0700 Subject: [PATCH 1253/1379] riscv: enable SiFive errata CIP-453 and CIP-1200 Kconfig only if CONFIG_64BIT=y The corresponding hardware issues of CONFIG_ERRATA_SIFIVE_CIP_453 and CONFIG_ERRATA_SIFIVE_CIP_1200 only exist in the SiFive 64bit CPU cores. Therefore, these two errata are required only if CONFIG_64BIT=y Reported-by: kernel test robot Signed-off-by: Vincent Chen Fixes: bff3ff525460 ("riscv: sifive: Apply errata "cip-1200" patch") Fixes: 800149a77c2c ("riscv: sifive: Apply errata "cip-453" patch") Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.erratas | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/Kconfig.erratas b/arch/riscv/Kconfig.erratas index d5d03ae8d685..b44d6ecdb46e 100644 --- a/arch/riscv/Kconfig.erratas +++ b/arch/riscv/Kconfig.erratas @@ -21,7 +21,7 @@ config ERRATA_SIFIVE config ERRATA_SIFIVE_CIP_453 bool "Apply SiFive errata CIP-453" - depends on ERRATA_SIFIVE + depends on ERRATA_SIFIVE && 64BIT default y help This will apply the SiFive CIP-453 errata to add sign extension @@ -32,7 +32,7 @@ config ERRATA_SIFIVE_CIP_453 config ERRATA_SIFIVE_CIP_1200 bool "Apply SiFive errata CIP-1200" - depends on ERRATA_SIFIVE + depends on ERRATA_SIFIVE && 64BIT default y help This will apply the SiFive CIP-1200 errata to repalce all From 8d91b097335892bfbc9fd5783e80e25f0fb5bb2b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 29 Apr 2021 17:10:04 +0200 Subject: [PATCH 1254/1379] riscv: Consistify protect_kernel_linear_mapping_text_rodata() use The various uses of protect_kernel_linear_mapping_text_rodata() are not consistent: - Its definition depends on "64BIT && !XIP_KERNEL", - Its forward declaration depends on MMU, - Its single caller depends on "STRICT_KERNEL_RWX && 64BIT && MMU && !XIP_KERNEL". Fix this by settling on the dependencies of the caller, which can be simplified as STRICT_KERNEL_RWX depends on "MMU && !XIP_KERNEL". Provide a dummy definition, as the caller is protected by "IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)" instead of "#ifdef CONFIG_STRICT_KERNEL_RWX". Signed-off-by: Geert Uytterhoeven Tested-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/set_memory.h | 7 ++++++- arch/riscv/kernel/setup.c | 2 -- arch/riscv/mm/init.c | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index a9c56776fa0e..086f757e8ba3 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -17,7 +17,6 @@ int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); int set_memory_rw_nx(unsigned long addr, int numpages); void protect_kernel_text_data(void); -void protect_kernel_linear_mapping_text_rodata(void); #else static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } @@ -27,6 +26,12 @@ static inline void protect_kernel_text_data(void) {} static inline int set_memory_rw_nx(unsigned long addr, int numpages) { return 0; } #endif +#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX) +void protect_kernel_linear_mapping_text_rodata(void); +#else +static inline void protect_kernel_linear_mapping_text_rodata(void) {} +#endif + int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 7b31779101f6..03901d3a8b02 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -293,9 +293,7 @@ void __init setup_arch(char **cmdline_p) if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) { protect_kernel_text_data(); -#if defined(CONFIG_64BIT) && defined(CONFIG_MMU) && !defined(CONFIG_XIP_KERNEL) protect_kernel_linear_mapping_text_rodata(); -#endif } #ifdef CONFIG_SWIOTLB diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 1ec98d97b67f..4faf8bd157ea 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -645,7 +645,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) #endif } -#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) +#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX) void protect_kernel_linear_mapping_text_rodata(void) { unsigned long text_start = (unsigned long)lm_alias(_start); From beaf5ae15a13d835a01e30c282c8325ce0f1eb7e Mon Sep 17 00:00:00 2001 From: Rouven Czerwinski Date: Sat, 1 May 2021 20:53:58 +0200 Subject: [PATCH 1255/1379] riscv: remove unused handle_exception symbol Since commit 79b1feba5455 ("RISC-V: Setup exception vector early") exception vectors are setup early and the handle_exception symbol from the asm files is no longer referenced in traps.c. Remove it. Signed-off-by: Rouven Czerwinski Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 07fdded10c21..0721b9798595 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -25,8 +25,6 @@ int show_unhandled_signals = 1; -extern asmlinkage void handle_exception(void); - static DEFINE_SPINLOCK(die_lock); void die(struct pt_regs *regs, const char *str) From 4fbf5d6837bf81fd7a27d771358f4ee6c4f243f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Apr 2021 21:44:18 +0200 Subject: [PATCH 1256/1379] Revert 337f13046ff0 ("futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op") The FUTEX_WAIT operand has historically a relative timeout which means that the clock id is irrelevant as relative timeouts on CLOCK_REALTIME are not subject to wall clock changes and therefore are mapped by the kernel to CLOCK_MONOTONIC for simplicity. If a caller would set FUTEX_CLOCK_REALTIME for FUTEX_WAIT the timeout is still treated relative vs. CLOCK_MONOTONIC and then the wait arms that timeout based on CLOCK_REALTIME which is broken and obviously has never been used or even tested. Reject any attempt to use FUTEX_CLOCK_REALTIME with FUTEX_WAIT again. The desired functionality can be achieved with FUTEX_WAIT_BITSET and a FUTEX_BITSET_MATCH_ANY argument. Fixes: 337f13046ff0 ("futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op") Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210422194704.834797921@linutronix.de --- kernel/futex.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index c98b825da9cf..47402008a430 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3710,8 +3710,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (op & FUTEX_CLOCK_REALTIME) { flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ - cmd != FUTEX_WAIT_REQUEUE_PI) + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; } From cdf78db4070967869e4d027c11f4dd825d8f815a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Apr 2021 21:44:19 +0200 Subject: [PATCH 1257/1379] futex: Do not apply time namespace adjustment on FUTEX_LOCK_PI FUTEX_LOCK_PI does not require to have the FUTEX_CLOCK_REALTIME bit set because it has been using CLOCK_REALTIME based absolute timeouts forever. Due to that, the time namespace adjustment which is applied when FUTEX_CLOCK_REALTIME is not set, will wrongly take place for FUTEX_LOCK_PI and wreckage the timeout. Exclude it from that procedure. Fixes: c2f7d08cccf4 ("futex: Adjust absolute futex timeouts with per time namespace offset") Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210422194704.984540159@linutronix.de --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 47402008a430..b0f53045f206 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3780,7 +3780,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) + else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } @@ -3974,7 +3974,7 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, t = timespec64_to_ktime(ts); if (cmd == FUTEX_WAIT) t = ktime_add_safe(ktime_get(), t); - else if (!(op & FUTEX_CLOCK_REALTIME)) + else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } From b097d5ed33561507eeffc77120a8c16c2f0f2c4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Apr 2021 21:44:20 +0200 Subject: [PATCH 1258/1379] futex: Get rid of the val2 conditional dance There is no point in checking which FUTEX operand treats the utime pointer as 'val2' argument because that argument to do_futex() is only used by exactly these operands. So just handing it in unconditionally is not making any difference, but removes a lot of pointless gunk. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210422194705.125957049@linutronix.de --- kernel/futex.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index b0f53045f206..4ddfdce325ad 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3764,7 +3764,6 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, { struct timespec64 ts; ktime_t t, *tp = NULL; - u32 val2 = 0; int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || @@ -3784,15 +3783,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } - /* - * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. - * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. - */ - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #ifdef CONFIG_COMPAT @@ -3960,7 +3952,6 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, { struct timespec64 ts; ktime_t t, *tp = NULL; - int val2 = 0; int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || @@ -3978,11 +3969,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, t = timens_ktime_to_host(CLOCK_MONOTONIC, t); tp = &t; } - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); } #endif /* CONFIG_COMPAT_32BIT_TIME */ From 51cf94d16860a324e97d1b670d88f1f2b643bc32 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 22 Apr 2021 21:44:21 +0200 Subject: [PATCH 1259/1379] futex: Make syscall entry points less convoluted The futex and the compat syscall entry points do pretty much the same except for the timespec data types and the corresponding copy from user function. Split out the rest into inline functions and share the functionality. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210422194705.244476369@linutronix.de --- kernel/futex.c | 63 +++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 4ddfdce325ad..4938a00bc785 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3757,30 +3757,48 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return -ENOSYS; } +static __always_inline bool futex_cmd_has_timeout(u32 cmd) +{ + switch (cmd) { + case FUTEX_WAIT: + case FUTEX_LOCK_PI: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + return true; + } + return false; +} + +static __always_inline int +futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) +{ + if (!timespec64_valid(ts)) + return -EINVAL; + + *t = timespec64_to_ktime(*ts); + if (cmd == FUTEX_WAIT) + *t = ktime_add_safe(ktime_get(), *t); + else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) + *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); + return 0; +} SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, const struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, u32, val3) { - struct timespec64 ts; + int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; - int cmd = op & FUTEX_CMD_MASK; + struct timespec64 ts; - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (utime && futex_cmd_has_timeout(cmd)) { if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) return -EFAULT; if (get_timespec64(&ts, utime)) return -EFAULT; - if (!timespec64_valid(&ts)) - return -EINVAL; - - t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); + ret = futex_init_timeout(cmd, op, &ts, &t); + if (ret) + return ret; tp = &t; } @@ -3950,23 +3968,16 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { - struct timespec64 ts; + int ret, cmd = op & FUTEX_CMD_MASK; ktime_t t, *tp = NULL; - int cmd = op & FUTEX_CMD_MASK; + struct timespec64 ts; - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || - cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (utime && futex_cmd_has_timeout(cmd)) { if (get_old_timespec32(&ts, utime)) return -EFAULT; - if (!timespec64_valid(&ts)) - return -EINVAL; - - t = timespec64_to_ktime(ts); - if (cmd == FUTEX_WAIT) - t = ktime_add_safe(ktime_get(), t); - else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) - t = timens_ktime_to_host(CLOCK_MONOTONIC, t); + ret = futex_init_timeout(cmd, op, &ts, &t); + if (ret) + return ret; tp = &t; } From ac05a8a927e5a1027592d8f98510a511dadeed14 Mon Sep 17 00:00:00 2001 From: Hansem Ro Date: Thu, 6 May 2021 13:27:10 -0700 Subject: [PATCH 1260/1379] Input: ili210x - add missing negation for touch indication on ili210x This adds the negation needed for proper finger detection on Ilitek ili2107/ili210x. This fixes polling issues (on Amazon Kindle Fire) caused by returning false for the cooresponding finger on the touchscreen. Signed-off-by: Hansem Ro Fixes: e3559442afd2a ("ili210x - rework the touchscreen sample processing") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ili210x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/touchscreen/ili210x.c b/drivers/input/touchscreen/ili210x.c index d8fccf048bf4..30576a5f2f04 100644 --- a/drivers/input/touchscreen/ili210x.c +++ b/drivers/input/touchscreen/ili210x.c @@ -87,7 +87,7 @@ static bool ili210x_touchdata_to_coords(const u8 *touchdata, unsigned int *x, unsigned int *y, unsigned int *z) { - if (touchdata[0] & BIT(finger)) + if (!(touchdata[0] & BIT(finger))) return false; *x = get_unaligned_be16(touchdata + 1 + (finger * 4) + 0); From 05665cef4b745cb46b1d1b8e96deaa25464092d3 Mon Sep 17 00:00:00 2001 From: Matt Reynolds Date: Thu, 29 Apr 2021 15:29:37 -0700 Subject: [PATCH 1261/1379] Input: xpad - add support for Amazon Game Controller The Amazon Luna controller (product name "Amazon Game Controller") behaves like an Xbox 360 controller when connected over USB. Signed-off-by: Matt Reynolds Reviewed-by: Harry Cutts Link: https://lore.kernel.org/r/20210429103548.1.If5f9a44cb81e25b9350f7c6c0b3c88b4ecd81166@changeid Signed-off-by: Dmitry Torokhov --- drivers/input/joystick/xpad.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 9f0d07dcbf06..d69d7657ab12 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -268,6 +268,7 @@ static const struct xpad_device { { 0x1689, 0xfd00, "Razer Onza Tournament Edition", 0, XTYPE_XBOX360 }, { 0x1689, 0xfd01, "Razer Onza Classic Edition", 0, XTYPE_XBOX360 }, { 0x1689, 0xfe00, "Razer Sabertooth", 0, XTYPE_XBOX360 }, + { 0x1949, 0x041a, "Amazon Game Controller", 0, XTYPE_XBOX360 }, { 0x1bad, 0x0002, "Harmonix Rock Band Guitar", 0, XTYPE_XBOX360 }, { 0x1bad, 0x0003, "Harmonix Rock Band Drumkit", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 }, { 0x1bad, 0x0130, "Ion Drum Rocker", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360 }, @@ -440,6 +441,7 @@ static const struct usb_device_id xpad_table[] = { XPAD_XBOX360_VENDOR(0x15e4), /* Numark X-Box 360 controllers */ XPAD_XBOX360_VENDOR(0x162e), /* Joytech X-Box 360 controllers */ XPAD_XBOX360_VENDOR(0x1689), /* Razer Onza */ + XPAD_XBOX360_VENDOR(0x1949), /* Amazon controllers */ XPAD_XBOX360_VENDOR(0x1bad), /* Harminix Rock Band Guitar and Drums */ XPAD_XBOX360_VENDOR(0x20d6), /* PowerA Controllers */ XPAD_XBOXONE_VENDOR(0x20d6), /* PowerA Controllers */ From 6a780f51f87b430cc69ebf4e859e7e9be720b283 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 5 May 2021 17:36:36 -0500 Subject: [PATCH 1262/1379] net: ipa: fix inter-EE IRQ register definitions In gsi_irq_setup(), two registers are written with the intention of disabling inter-EE channel and event IRQs. But the wrong registers are used (and defined); the ones used are read-only registers that indicate whether the interrupt condition is present. Define the mask registers instead of the status registers, and use them to disable the inter-EE interrupt types. Fixes: 46f748ccaf01 ("net: ipa: explicitly disallow inter-EE interrupts") Signed-off-by: Alex Elder Link: https://lore.kernel.org/r/20210505223636.232527-1-elder@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ipa/gsi.c | 4 ++-- drivers/net/ipa/gsi_reg.h | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c index 9f06663cef26..e374079603cf 100644 --- a/drivers/net/ipa/gsi.c +++ b/drivers/net/ipa/gsi.c @@ -211,8 +211,8 @@ static void gsi_irq_setup(struct gsi *gsi) iowrite32(0, gsi->virt + GSI_CNTXT_SRC_IEOB_IRQ_MSK_OFFSET); /* The inter-EE registers are in the non-adjusted address range */ - iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_CH_IRQ_OFFSET); - iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_EV_CH_IRQ_OFFSET); + iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_CH_IRQ_MSK_OFFSET); + iowrite32(0, gsi->virt_raw + GSI_INTER_EE_SRC_EV_CH_IRQ_MSK_OFFSET); iowrite32(0, gsi->virt + GSI_CNTXT_GSI_IRQ_EN_OFFSET); } diff --git a/drivers/net/ipa/gsi_reg.h b/drivers/net/ipa/gsi_reg.h index b4ac0258d6e1..cb42c5ae86fa 100644 --- a/drivers/net/ipa/gsi_reg.h +++ b/drivers/net/ipa/gsi_reg.h @@ -53,15 +53,15 @@ #define GSI_EE_REG_ADJUST 0x0000d000 /* IPA v4.5+ */ /* The two inter-EE IRQ register offsets are relative to gsi->virt_raw */ -#define GSI_INTER_EE_SRC_CH_IRQ_OFFSET \ - GSI_INTER_EE_N_SRC_CH_IRQ_OFFSET(GSI_EE_AP) -#define GSI_INTER_EE_N_SRC_CH_IRQ_OFFSET(ee) \ - (0x0000c018 + 0x1000 * (ee)) +#define GSI_INTER_EE_SRC_CH_IRQ_MSK_OFFSET \ + GSI_INTER_EE_N_SRC_CH_IRQ_MSK_OFFSET(GSI_EE_AP) +#define GSI_INTER_EE_N_SRC_CH_IRQ_MSK_OFFSET(ee) \ + (0x0000c020 + 0x1000 * (ee)) -#define GSI_INTER_EE_SRC_EV_CH_IRQ_OFFSET \ - GSI_INTER_EE_N_SRC_EV_CH_IRQ_OFFSET(GSI_EE_AP) -#define GSI_INTER_EE_N_SRC_EV_CH_IRQ_OFFSET(ee) \ - (0x0000c01c + 0x1000 * (ee)) +#define GSI_INTER_EE_SRC_EV_CH_IRQ_MSK_OFFSET \ + GSI_INTER_EE_N_SRC_EV_CH_IRQ_MSK_OFFSET(GSI_EE_AP) +#define GSI_INTER_EE_N_SRC_EV_CH_IRQ_MSK_OFFSET(ee) \ + (0x0000c024 + 0x1000 * (ee)) /* All other register offsets are relative to gsi->virt */ From cbaf3f6af9c268caf558c8e7ec52bcb35c5455dd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 6 May 2021 10:23:08 +0300 Subject: [PATCH 1263/1379] mlxsw: spectrum_mr: Update egress RIF list before route's action Each multicast route that is forwarding packets (as opposed to trapping them) points to a list of egress router interfaces (RIFs) through which packets are replicated. A route's action can transition from trap to forward when a RIF is created for one of the route's egress virtual interfaces (eVIF). When this happens, the route's action is first updated and only later the list of egress RIFs is committed to the device. This results in the route pointing to an invalid list. In case the list pointer is out of range (due to uninitialized memory), the device will complain: mlxsw_spectrum2 0000:06:00.0: EMAD reg access failed (tid=5733bf490000905c,reg_id=300f(pefa),type=write,status=7(bad parameter)) Fix this by first committing the list of egress RIFs to the device and only later update the route's action. Note that a fix is not needed in the reverse function (i.e., mlxsw_sp_mr_route_evif_unresolve()), as there the route's action is first updated and only later the RIF is removed from the list. Cc: stable@vger.kernel.org Fixes: c011ec1bbfd6 ("mlxsw: spectrum: Add the multicast routing offloading logic") Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Link: https://lore.kernel.org/r/20210506072308.3834303-1-idosch@idosch.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlxsw/spectrum_mr.c | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c index 7846a21555ef..1f6bc0c7e91d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_mr.c @@ -535,6 +535,16 @@ mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table, u16 erif_index = 0; int err; + /* Add the eRIF */ + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) { + erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif); + err = mr->mr_ops->route_erif_add(mlxsw_sp, + rve->mr_route->route_priv, + erif_index); + if (err) + return err; + } + /* Update the route action, as the new eVIF can be a tunnel or a pimreg * device which will require updating the action. */ @@ -544,17 +554,7 @@ mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table, rve->mr_route->route_priv, route_action); if (err) - return err; - } - - /* Add the eRIF */ - if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) { - erif_index = mlxsw_sp_rif_index(rve->mr_vif->rif); - err = mr->mr_ops->route_erif_add(mlxsw_sp, - rve->mr_route->route_priv, - erif_index); - if (err) - goto err_route_erif_add; + goto err_route_action_update; } /* Update the minimum MTU */ @@ -572,14 +572,14 @@ mlxsw_sp_mr_route_evif_resolve(struct mlxsw_sp_mr_table *mr_table, return 0; err_route_min_mtu_update: - if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) - mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, - erif_index); -err_route_erif_add: if (route_action != rve->mr_route->route_action) mr->mr_ops->route_action_update(mlxsw_sp, rve->mr_route->route_priv, rve->mr_route->route_action); +err_route_action_update: + if (mlxsw_sp_mr_vif_valid(rve->mr_vif)) + mr->mr_ops->route_erif_del(mlxsw_sp, rve->mr_route->route_priv, + erif_index); return err; } From a6f8ee58a8e35f7e4380a5efce312e2a5bc27497 Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Thu, 6 May 2021 15:35:30 -0700 Subject: [PATCH 1264/1379] tcp: Specify cmsgbuf is user pointer for receive zerocopy. A prior change (1f466e1f15cf) introduces separate handling for ->msg_control depending on whether the pointer is a kernel or user pointer. However, while tcp receive zerocopy is using this field, it is not properly annotating that the buffer in this case is a user pointer. This can cause faults when the improper mechanism is used within put_cmsg(). This patch simply annotates tcp receive zerocopy's use as explicitly being a user pointer. Fixes: 7eeba1706eba ("tcp: Add receive timestamp support for receive zerocopy.") Signed-off-by: Arjun Roy Acked-by: Soheil Hassas Yeganeh Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20210506223530.2266456-1-arjunroy.kdev@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e14fd0c50c10..f1c1f9e3de72 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2039,6 +2039,7 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; + cmsg_dummy.msg_control_is_user = true; zc->msg_flags = 0; if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { From 543203d2e4cb04bbdeccec0da9b2629c8a8f0569 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 May 2021 18:02:04 -0700 Subject: [PATCH 1265/1379] alpha: eliminate old-style function definitions 'make ARCH=alpha W=1' reports a couple of old-style function definitions with missing parameter list, so fix those. arch/alpha/kernel/pc873xx.c: In function 'pc873xx_get_base': arch/alpha/kernel/pc873xx.c:16:21: warning: old-style function definition [-Wold-style-definition] 16 | unsigned int __init pc873xx_get_base() arch/alpha/kernel/pc873xx.c: In function 'pc873xx_get_model': arch/alpha/kernel/pc873xx.c:21:14: warning: old-style function definition [-Wold-style-definition] 21 | char *__init pc873xx_get_model() Link: https://lkml.kernel.org/r/20210421061312.30097-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/pc873xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/pc873xx.c b/arch/alpha/kernel/pc873xx.c index 63aee5d86e02..82b19c9e59d6 100644 --- a/arch/alpha/kernel/pc873xx.c +++ b/arch/alpha/kernel/pc873xx.c @@ -13,12 +13,12 @@ static char *pc873xx_names[] = { static unsigned int base, model; -unsigned int __init pc873xx_get_base() +unsigned int __init pc873xx_get_base(void) { return base; } -char *__init pc873xx_get_model() +char *__init pc873xx_get_model(void) { return pc873xx_names[model]; } From 0214967a376d0726baf35cc2845a59ac17ef4db1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 May 2021 18:02:07 -0700 Subject: [PATCH 1266/1379] alpha: csum_partial_copy.c: add function prototypes from Fix "no previous prototype" W=1 warnings from the kernel test robot: arch/alpha/lib/csum_partial_copy.c:349:1: error: no previous prototype for 'csum_and_copy_from_user' [-Werror=missing-prototypes] 349 | csum_and_copy_from_user(const void __user *src, void *dst, int len) | ^~~~~~~~~~~~~~~~~~~~~~~ arch/alpha/lib/csum_partial_copy.c:358:1: error: no previous prototype for 'csum_partial_copy_nocheck' [-Werror=missing-prototypes] 358 | csum_partial_copy_nocheck(const void *src, void *dst, int len) | ^~~~~~~~~~~~~~~~~~~~~~~~~ Link: https://lkml.kernel.org/r/20210425235749.19113-1-rdunlap@infradead.org Fixes: 808b49da54e6 ("alpha: turn csum_partial_copy_from_user() into csum_and_copy_from_user()") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Al Viro Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/lib/csum_partial_copy.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index dc68efbe9367..1931a04af85a 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -13,6 +13,7 @@ #include #include #include +#include #define ldq_u(x,y) \ From f4bf74d82915708208bc9d0c9bd3f769f56bfbec Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 6 May 2021 18:02:10 -0700 Subject: [PATCH 1267/1379] fs/proc/generic.c: fix incorrect pde_is_permanent check Currently the pde_is_permanent() check is being run on root multiple times rather than on the next proc directory entry. This looks like a copy-paste error. Fix this by replacing root with next. Addresses-Coverity: ("Copy-paste error") Link: https://lkml.kernel.org/r/20210318122633.14222-1-colin.king@canonical.com Fixes: d919b33dafb3 ("proc: faster open/read/close with "permanent" files") Signed-off-by: Colin Ian King Acked-by: Christian Brauner Reviewed-by: Alexey Dobriyan Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index bc86aa87cc41..5600da30e289 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -756,7 +756,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) while (1) { next = pde_subdir_first(de); if (next) { - if (unlikely(pde_is_permanent(root))) { + if (unlikely(pde_is_permanent(next))) { write_unlock(&proc_subdir_lock); WARN(1, "removing permanent /proc entry '%s/%s'", next->parent->name, next->name); From b793cd9ab34da3c571a038219d1d6315f91e5afd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 6 May 2021 18:02:13 -0700 Subject: [PATCH 1268/1379] proc: save LOC in __xlate_proc_name() Can't look at this verbosity anymore. Link: https://lkml.kernel.org/r/YFYXAp/fgq405qcy@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5600da30e289..5b78739e60e4 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -166,15 +166,8 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, const char *cp = name, *next; struct proc_dir_entry *de; - de = *ret; - if (!de) - de = &proc_root; - - while (1) { - next = strchr(cp, '/'); - if (!next) - break; - + de = *ret ?: &proc_root; + while ((next = strchr(cp, '/')) != NULL) { de = pde_subdir_find(de, cp, next - cp); if (!de) { WARN(1, "name '%s'\n", name); From d4455faccd6cbe11ddfdbe28723a2122453b4f4e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 6 May 2021 18:02:16 -0700 Subject: [PATCH 1269/1379] proc: mandate ->proc_lseek in "struct proc_ops" Now that proc_ops are separate from file_operations and other operations it easy to check all instances to have ->proc_lseek hook and remove check in main code. Note: nonseekable_open() files naturally don't require ->proc_lseek. Garbage collect pde_lseek() function. [adobriyan@gmail.com: smoke test lseek()] Link: https://lkml.kernel.org/r/YG4OIhChOrVTPgdN@localhost.localdomain Link: https://lkml.kernel.org/r/YFYX0Bzwxlc7aBa/@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/isdn/capi/kcapi_proc.c | 1 + drivers/net/wireless/intersil/hostap/hostap_proc.c | 1 + drivers/scsi/esas2r/esas2r_main.c | 1 + fs/proc/inode.c | 14 ++------------ include/linux/proc_fs.h | 1 + tools/testing/selftests/proc/read.c | 4 +++- 6 files changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/isdn/capi/kcapi_proc.c b/drivers/isdn/capi/kcapi_proc.c index b5ed4ea145cb..77e951206809 100644 --- a/drivers/isdn/capi/kcapi_proc.c +++ b/drivers/isdn/capi/kcapi_proc.c @@ -201,6 +201,7 @@ static ssize_t empty_read(struct file *file, char __user *buf, static const struct proc_ops empty_proc_ops = { .proc_read = empty_read, + .proc_lseek = default_llseek, }; // --------------------------------------------------------------------------- diff --git a/drivers/net/wireless/intersil/hostap/hostap_proc.c b/drivers/net/wireless/intersil/hostap/hostap_proc.c index 97c270845fd1..51c847d98755 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_proc.c +++ b/drivers/net/wireless/intersil/hostap/hostap_proc.c @@ -227,6 +227,7 @@ static ssize_t prism2_aux_dump_proc_no_read(struct file *file, char __user *buf, static const struct proc_ops prism2_aux_dump_proc_ops = { .proc_read = prism2_aux_dump_proc_no_read, + .proc_lseek = default_llseek, }; diff --git a/drivers/scsi/esas2r/esas2r_main.c b/drivers/scsi/esas2r/esas2r_main.c index 5d9eeac6717a..45ec9f16c085 100644 --- a/drivers/scsi/esas2r/esas2r_main.c +++ b/drivers/scsi/esas2r/esas2r_main.c @@ -616,6 +616,7 @@ static const struct file_operations esas2r_proc_fops = { }; static const struct proc_ops esas2r_proc_ops = { + .proc_lseek = default_llseek, .proc_ioctl = esas2r_proc_ioctl, #ifdef CONFIG_COMPAT .proc_compat_ioctl = compat_ptr_ioctl, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index bde6b6f69852..2ea8aaa7206e 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -273,25 +273,15 @@ void proc_entry_rundown(struct proc_dir_entry *de) spin_unlock(&de->pde_unload_lock); } -static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence) -{ - typeof_member(struct proc_ops, proc_lseek) lseek; - - lseek = pde->proc_ops->proc_lseek; - if (!lseek) - lseek = default_llseek; - return lseek(file, offset, whence); -} - static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; if (pde_is_permanent(pde)) { - return pde_lseek(pde, file, offset, whence); + return pde->proc_ops->proc_lseek(file, offset, whence); } else if (use_pde(pde)) { - rv = pde_lseek(pde, file, offset, whence); + rv = pde->proc_ops->proc_lseek(file, offset, whence); unuse_pde(pde); } return rv; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 000cc0533c33..069c7fd95396 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -32,6 +32,7 @@ struct proc_ops { ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *); ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); + /* mandatory unless nonseekable_open() or equivalent is used */ loff_t (*proc_lseek)(struct file *, loff_t, int); int (*proc_release)(struct inode *, struct file *); __poll_t (*proc_poll)(struct file *, struct poll_table_struct *); diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c index b3ef9e14d6cc..35ee78dff144 100644 --- a/tools/testing/selftests/proc/read.c +++ b/tools/testing/selftests/proc/read.c @@ -14,7 +14,7 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ // Test -// 1) read of every file in /proc +// 1) read and lseek on every file in /proc // 2) readlink of every symlink in /proc // 3) recursively (1) + (2) for every directory in /proc // 4) write to /proc/*/clear_refs and /proc/*/task/*/clear_refs @@ -45,6 +45,8 @@ static void f_reg(DIR *d, const char *filename) fd = openat(dirfd(d), filename, O_RDONLY|O_NONBLOCK); if (fd == -1) return; + /* struct proc_ops::proc_lseek is mandatory if file is seekable. */ + (void)lseek(fd, 0, SEEK_SET); rv = read(fd, buf, sizeof(buf)); assert((0 <= rv && rv <= sizeof(buf)) || rv == -1); close(fd); From 1dcdd7ef96ba11cf7c6a965114577b3509adb7cd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 6 May 2021 18:02:18 -0700 Subject: [PATCH 1270/1379] proc: delete redundant subset=pid check Two checks in lookup and readdir code should be enough to not have third check in open code. Can't open what can't be looked up? Link: https://lkml.kernel.org/r/YFYYwIBIkytqnkxP@localhost.localdomain Signed-off-by: Alexey Dobriyan Acked-by: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2ea8aaa7206e..599eb724ff2d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -483,7 +483,6 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, static int proc_reg_open(struct inode *inode, struct file *file) { - struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; @@ -497,9 +496,6 @@ static int proc_reg_open(struct inode *inode, struct file *file) return rv; } - if (fs_info->pidonly == PROC_PIDONLY_ON) - return -ENOENT; - /* * Ensure that * 1) PDE's ->release hook will be called no matter what From 268af17ada5855a9b703995125a9920ac117b56b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 6 May 2021 18:02:21 -0700 Subject: [PATCH 1271/1379] selftests: proc: test subset=pid Test that /proc instance mounted with mount -t proc -o subset=pid contains only ".", "..", "self", "thread-self" and pid directories. Note: Currently "subset=pid" doesn't return "." and ".." via readdir. This must be a bug. Link: https://lkml.kernel.org/r/YFYZZ7WGaZlsnChS@localhost.localdomain Signed-off-by: Alexey Dobriyan Acked-by: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/Makefile | 1 + .../testing/selftests/proc/proc-subset-pid.c | 121 ++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 tools/testing/selftests/proc/proc-subset-pid.c diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 8be8a03d2973..1054e40a499a 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -12,6 +12,7 @@ TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 TEST_GEN_PROGS += proc-self-syscall TEST_GEN_PROGS += proc-self-wchan +TEST_GEN_PROGS += proc-subset-pid TEST_GEN_PROGS += proc-uptime-001 TEST_GEN_PROGS += proc-uptime-002 TEST_GEN_PROGS += read diff --git a/tools/testing/selftests/proc/proc-subset-pid.c b/tools/testing/selftests/proc/proc-subset-pid.c new file mode 100644 index 000000000000..d1052bcab039 --- /dev/null +++ b/tools/testing/selftests/proc/proc-subset-pid.c @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2021 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +/* + * Test that "mount -t proc -o subset=pid" hides everything but pids, + * /proc/self and /proc/thread-self. + */ +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline bool streq(const char *a, const char *b) +{ + return strcmp(a, b) == 0; +} + +static void make_private_proc(void) +{ + if (unshare(CLONE_NEWNS) == -1) { + if (errno == ENOSYS || errno == EPERM) { + exit(4); + } + exit(1); + } + if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { + exit(1); + } + if (mount(NULL, "/proc", "proc", 0, "subset=pid") == -1) { + exit(1); + } +} + +static bool string_is_pid(const char *s) +{ + while (1) { + switch (*s++) { + case '0':case '1':case '2':case '3':case '4': + case '5':case '6':case '7':case '8':case '9': + continue; + + case '\0': + return true; + + default: + return false; + } + } +} + +int main(void) +{ + make_private_proc(); + + DIR *d = opendir("/proc"); + assert(d); + + struct dirent *de; + + bool dot = false; + bool dot_dot = false; + bool self = false; + bool thread_self = false; + + while ((de = readdir(d))) { + if (streq(de->d_name, ".")) { + assert(!dot); + dot = true; + assert(de->d_type == DT_DIR); + } else if (streq(de->d_name, "..")) { + assert(!dot_dot); + dot_dot = true; + assert(de->d_type == DT_DIR); + } else if (streq(de->d_name, "self")) { + assert(!self); + self = true; + assert(de->d_type == DT_LNK); + } else if (streq(de->d_name, "thread-self")) { + assert(!thread_self); + thread_self = true; + assert(de->d_type == DT_LNK); + } else { + if (!string_is_pid(de->d_name)) { + fprintf(stderr, "d_name '%s'\n", de->d_name); + assert(0); + } + assert(de->d_type == DT_DIR); + } + } + + char c; + int rv = readlink("/proc/cpuinfo", &c, 1); + assert(rv == -1 && errno == ENOENT); + + int fd = open("/proc/cpuinfo", O_RDONLY); + assert(fd == -1 && errno == ENOENT); + + return 0; +} From 5b31a7dfa35098a8c331b47fe4869282597df89f Mon Sep 17 00:00:00 2001 From: zhouchuangao Date: Thu, 6 May 2021 18:02:24 -0700 Subject: [PATCH 1272/1379] proc/sysctl: fix function name error in comments The function name should be modified to register_sysctl_paths instead of register_sysctl_table_path. Link: https://lkml.kernel.org/r/1615807194-79646-1-git-send-email-zhouchuangao@vivo.com Signed-off-by: zhouchuangao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 66c7dd11bd7c..dea0f5ee540c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1563,7 +1563,7 @@ err_register_leaves: } /** - * register_sysctl_table_path - register a sysctl table hierarchy + * register_sysctl_paths - register a sysctl table hierarchy * @path: The path to the directory the sysctl table is in. * @table: the top-level table structure * From 4ee60ec156d91c315d1f62dfc1bc5799dcc6b473 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 May 2021 18:02:27 -0700 Subject: [PATCH 1273/1379] include: remove pagemap.h from blkdev.h My UEK-derived config has 1030 files depending on pagemap.h before this change. Afterwards, just 326 files need to be rebuilt when I touch pagemap.h. I think blkdev.h is probably included too widely, but untangling that dependency is harder and this solves my problem. x86 allmodconfig builds, but there may be implicit include problems on other architectures. Link: https://lkml.kernel.org/r/20210309195747.283796-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Dan Williams [nvdimm] Acked-by: Jens Axboe [block] Reviewed-by: Christoph Hellwig Acked-by: Coly Li [bcache] Acked-by: Martin K. Petersen [scsi] Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-settings.c | 1 + drivers/block/brd.c | 1 + drivers/block/loop.c | 1 + drivers/md/bcache/super.c | 1 + drivers/nvdimm/btt.c | 1 + drivers/nvdimm/pmem.c | 1 + drivers/scsi/scsicam.c | 1 + include/linux/blkdev.h | 1 - include/linux/swap.h | 1 + 9 files changed, 8 insertions(+), 1 deletion(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 9c009090c4b5..902c40d67120 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 6e622c1327ee..7562cf30b14e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a370cde3ddd4..d58d68f3c7cd 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 2b6d6e9cd680..bea8c4429ae8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -16,6 +16,7 @@ #include "features.h" #include +#include #include #include #include diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 41aa1f01fc07..18a267d5073f 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 7daac795db39..ed10a8b66068 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c index f1553a453616..0ffdb8f2995f 100644 --- a/drivers/scsi/scsicam.c +++ b/drivers/scsi/scsicam.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b91ba6207365..1255823b2bc0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/swap.h b/include/linux/swap.h index f69e0f67651d..144727041e78 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include From 08c5188ef40ff82aed559123dc0ab2d2254b1b1c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 6 May 2021 18:02:30 -0700 Subject: [PATCH 1274/1379] kernel.h: drop inclusion in bitmap.h The bitmap.h header is used in a lot of code around the kernel. Besides that it includes kernel.h which sometimes makes a loop. The problem here is many unneeded loops that make header hell dependencies. For example, how may you move bitmap_zalloc() from C-file to the header? Currently it's impossible. And bitmap.h here is only the tip of an iceberg. kerne.h is a dump of everything that even has nothing in common at all. We may still have it, but in my new code I prefer to include only the headers that I want to use, without the bulk of unneeded kernel code. Break the loop by introducing align.h, including it in kernel.h and bitmap.h followed by replacing kernel.h with limits.h. Link: https://lkml.kernel.org/r/20210326170347.37441-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Yury Norov Cc: Rasmus Villemoes Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/align.h | 15 +++++++++++++++ include/linux/bitmap.h | 3 ++- include/linux/kernel.h | 9 +-------- 3 files changed, 18 insertions(+), 9 deletions(-) create mode 100644 include/linux/align.h diff --git a/include/linux/align.h b/include/linux/align.h new file mode 100644 index 000000000000..2b4acec7b95a --- /dev/null +++ b/include/linux/align.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ALIGN_H +#define _LINUX_ALIGN_H + +#include + +/* @a is a power of 2 value */ +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) +#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) +#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) +#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) + +#endif /* _LINUX_ALIGN_H */ diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 70a932470b2d..6cbcd9d9edd2 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -4,10 +4,11 @@ #ifndef __ASSEMBLY__ +#include #include #include +#include #include -#include /* * bitmaps provide bit arrays that consume one or more unsigned diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5b7ed6dc99ac..09035ac67d4b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -3,6 +3,7 @@ #define _LINUX_KERNEL_H #include +#include #include #include #include @@ -30,14 +31,6 @@ */ #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) -/* @a is a power of 2 value */ -#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) -#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) -#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) -#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) -#define PTR_ALIGN_DOWN(p, a) ((typeof(p))ALIGN_DOWN((unsigned long)(p), (a))) -#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) - /* generic data direction definitions */ #define READ 0 #define WRITE 1 From 112dfce8f29798192eb0be8066b54f4a68f4eb36 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Thu, 6 May 2021 18:02:33 -0700 Subject: [PATCH 1275/1379] linux/profile.h: remove unnecessary declaration Declaring struct pt_regs is unnecessary. On the one hand, there is no function using it; on the other hand, struct pt_regs has been declared in linux/kernel.h. Remove them. Link: https://lkml.kernel.org/r/20210401104834.1009157-1-wanjiabing@vivo.com Signed-off-by: Wan Jiabing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/profile.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/profile.h b/include/linux/profile.h index bad18ca43150..fd18ca96f557 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -15,7 +15,6 @@ #define KVM_PROFILING 4 struct proc_dir_entry; -struct pt_regs; struct notifier_block; #if defined(CONFIG_PROFILING) && defined(CONFIG_PROC_FS) @@ -84,8 +83,6 @@ int task_handoff_unregister(struct notifier_block * n); int profile_event_register(enum profile_type, struct notifier_block * n); int profile_event_unregister(enum profile_type, struct notifier_block * n); -struct pt_regs; - #else #define prof_on 0 From 8ba9d40b6b2bf62377fd6fce25e9997e42b0317a Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 6 May 2021 18:02:36 -0700 Subject: [PATCH 1276/1379] kernel/async.c: fix pr_debug statement An async_func_t returns void - any errors encountered it has to stash somewhere for consumers to discover later. Link: https://lkml.kernel.org/r/20210226124355.2503524-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/async.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/async.c b/kernel/async.c index 33258e6e20f8..45a867b8644a 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pS returned after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); From 32c93976ac2ee7ecb4b09cc032efe1445d37bd7e Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 6 May 2021 18:02:39 -0700 Subject: [PATCH 1277/1379] kernel/cred.c: make init_groups static init_groups is declared in both cred.h and init_task.h, but it is not actually referenced anywhere outside of cred.c where it is defined. So make it static and remove the declarations. Link: https://lkml.kernel.org/r/20210310220102.2484201-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 1 - include/linux/init_task.h | 1 - kernel/cred.c | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/cred.h b/include/linux/cred.h index ac0e5f97d7d8..14971322e1a0 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -53,7 +53,6 @@ do { \ groups_free(group_info); \ } while (0) -extern struct group_info init_groups; #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index b2412b4d4c20..40fc5813cf93 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,7 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; extern struct nsproxy init_nsproxy; -extern struct group_info init_groups; extern struct cred init_cred; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/kernel/cred.c b/kernel/cred.c index 421b1149c651..e1d274cd741b 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -33,7 +33,7 @@ do { \ static struct kmem_cache *cred_jar; /* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; +static struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; /* * The initial credentials for the initial task From d1d1a2cd4627724c37539892db8efa611d2cbd70 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:02:42 -0700 Subject: [PATCH 1278/1379] tools: disable -Wno-type-limits Patch series "lib/find_bit: fast path for small bitmaps", v6. Bitmap operations are much simpler and faster in case of small bitmaps which fit into a single word. In linux/bitmap.c we have a machinery that allows compiler to replace actual function call with a few instructions if bitmaps passed into the function are small and their size is known at compile time. find_*_bit() API lacks this functionality; but users will benefit from it a lot. One important example is cpumask subsystem when NR_CPUS <= BITS_PER_LONG. This patch (of 12): GENMASK(h, l) may be passed with unsigned types. In such case, type-limits warning is generated for example in case of GENMASK(h, 0). Link: https://lkml.kernel.org/r/20210401003153.97325-1-yury.norov@gmail.com Link: https://lkml.kernel.org/r/20210401003153.97325-2-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/scripts/Makefile.include | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/scripts/Makefile.include b/tools/scripts/Makefile.include index 25adfec2cb39..f9271f3ea912 100644 --- a/tools/scripts/Makefile.include +++ b/tools/scripts/Makefile.include @@ -38,6 +38,7 @@ EXTRA_WARNINGS += -Wswitch-enum EXTRA_WARNINGS += -Wundef EXTRA_WARNINGS += -Wwrite-strings EXTRA_WARNINGS += -Wformat +EXTRA_WARNINGS += -Wno-type-limits # Makefiles suck: This macro sets a default value of $(2) for the # variable named by $(1), unless the variable has been set by From e5b9252d9000fc82324af5864701c1daffeebd7e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:02:46 -0700 Subject: [PATCH 1279/1379] tools: bitmap: sync function declarations with the kernel Some functions in tools/include/linux/bitmap.h declare nbits as int. In the kernel nbits is declared as unsigned int. Link: https://lkml.kernel.org/r/20210401003153.97325-3-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/linux/bitmap.h | 8 ++++---- tools/lib/bitmap.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 477a1cae513f..7cbd23e56d48 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -30,7 +30,7 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len); #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) -static inline void bitmap_zero(unsigned long *dst, int nbits) +static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = 0UL; @@ -66,7 +66,7 @@ static inline int bitmap_full(const unsigned long *src, unsigned int nbits) return find_first_zero_bit(src, nbits) == nbits; } -static inline int bitmap_weight(const unsigned long *src, int nbits) +static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -74,7 +74,7 @@ static inline int bitmap_weight(const unsigned long *src, int nbits) } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; @@ -141,7 +141,7 @@ static inline void bitmap_free(unsigned long *bitmap) * @buf: buffer to store output * @size: size of @buf */ -size_t bitmap_scnprintf(unsigned long *bitmap, int nbits, +size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits, char *buf, size_t size); /** diff --git a/tools/lib/bitmap.c b/tools/lib/bitmap.c index 5043747ef6c5..f4e914712b6f 100644 --- a/tools/lib/bitmap.c +++ b/tools/lib/bitmap.c @@ -28,11 +28,11 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, dst[k] = bitmap1[k] | bitmap2[k]; } -size_t bitmap_scnprintf(unsigned long *bitmap, int nbits, +size_t bitmap_scnprintf(unsigned long *bitmap, unsigned int nbits, char *buf, size_t size) { /* current bit is 'cur', most recently seen range is [rbot, rtop] */ - int cur, rbot, rtop; + unsigned int cur, rbot, rtop; bool first = true; size_t ret = 0; From a719101f19d2b4f107c8a79ed8b2866832a1816f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:02:49 -0700 Subject: [PATCH 1280/1379] tools: sync BITMAP_LAST_WORD_MASK() macro with the kernel Kernel version generates better code. Link: https://lkml.kernel.org/r/20210401003153.97325-4-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/linux/bitmap.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 7cbd23e56d48..4aabc23ec747 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -20,12 +20,7 @@ int __bitmap_equal(const unsigned long *bitmap1, void bitmap_clear(unsigned long *map, unsigned int start, int len); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) - -#define BITMAP_LAST_WORD_MASK(nbits) \ -( \ - ((nbits) % BITS_PER_LONG) ? \ - (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ -) +#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) From bb8bc36ef8a9873e79c5bbde74fd493c47492c42 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:02:53 -0700 Subject: [PATCH 1281/1379] arch: rearrange headers inclusion order in asm/bitops for m68k, sh and h8300 m68k and sh include bitmap/{find,le}.h prior to ffs/fls headers. New fast-path implementation in find.h requires ffs/fls. Reordering the headers inclusion sequence helps to prevent compile-time implicit function declaration error. [yury.norov@gmail.com: h8300: rearrange headers inclusion order in asm/bitops] Link: https://lkml.kernel.org/r/20210406183625.794227-1-yury.norov@gmail.com Link: https://lkml.kernel.org/r/20210401003153.97325-5-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Geert Uytterhoeven Acked-by: Rasmus Villemoes Reviewed-by: Andy Shevchenko Tested-by: Guenter Roeck Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/h8300/include/asm/bitops.h | 8 ++++---- arch/m68k/include/asm/bitops.h | 6 +++--- arch/sh/include/asm/bitops.h | 5 +++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/h8300/include/asm/bitops.h b/arch/h8300/include/asm/bitops.h index 7aa16c732aa9..c867a80cab5b 100644 --- a/arch/h8300/include/asm/bitops.h +++ b/arch/h8300/include/asm/bitops.h @@ -9,6 +9,10 @@ #include +#include +#include +#include + #ifdef __KERNEL__ #ifndef _LINUX_BITOPS_H @@ -173,8 +177,4 @@ static inline unsigned long __ffs(unsigned long word) #endif /* __KERNEL__ */ -#include -#include -#include - #endif /* _H8300_BITOPS_H */ diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 10133a968c8e..7b414099e5fc 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -440,8 +440,6 @@ static inline unsigned long ffz(unsigned long word) #endif -#include - #ifdef __KERNEL__ #if defined(CONFIG_CPU_HAS_NO_BITFIELDS) @@ -525,10 +523,12 @@ static inline int __fls(int x) #define __clear_bit_unlock clear_bit_unlock #include -#include #include #include #include +#include #endif /* __KERNEL__ */ +#include + #endif /* _M68K_BITOPS_H */ diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h index 450b5854d7c6..3b6c7b5b7ec9 100644 --- a/arch/sh/include/asm/bitops.h +++ b/arch/sh/include/asm/bitops.h @@ -58,15 +58,16 @@ static inline unsigned long __ffs(unsigned long word) return result; } -#include #include #include #include #include -#include #include #include #include #include +#include +#include + #endif /* __ASM_SH_BITOPS_H */ From 586eaebea5988302c5a8b018096dd6c6f4564940 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:02:56 -0700 Subject: [PATCH 1282/1379] lib: extend the scope of small_const_nbits() macro find_bit would also benefit from small_const_nbits() optimizations. The detailed comment is provided by Rasmus Villemoes. Link: https://lkml.kernel.org/r/20210401003153.97325-6-yury.norov@gmail.com Suggested-by: Rasmus Villemoes Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Acked-by: Andy Shevchenko Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bitsperlong.h | 12 ++++++++++++ include/linux/bitmap.h | 8 -------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/asm-generic/bitsperlong.h b/include/asm-generic/bitsperlong.h index 3905c1c93dc2..1023e2a4bd37 100644 --- a/include/asm-generic/bitsperlong.h +++ b/include/asm-generic/bitsperlong.h @@ -23,4 +23,16 @@ #define BITS_PER_LONG_LONG 64 #endif +/* + * small_const_nbits(n) is true precisely when it is known at compile-time + * that BITMAP_SIZE(n) is 1, i.e. 1 <= n <= BITS_PER_LONG. This allows + * various bit/bitmap APIs to provide a fast inline implementation. Bitmaps + * of size 0 are very rare, and a compile-time-known-size 0 is most likely + * a sign of error. They will be handled correctly by the bit/bitmap APIs, + * but using the out-of-line functions, so that the inline implementations + * can unconditionally dereference the pointer(s). + */ +#define small_const_nbits(nbits) \ + (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0) + #endif /* __ASM_GENERIC_BITS_PER_LONG */ diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 6cbcd9d9edd2..f57f4473bbe4 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -223,14 +223,6 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf, #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) -/* - * The static inlines below do not handle constant nbits==0 correctly, - * so make such users (should any ever turn up) call the out-of-line - * versions. - */ -#define small_const_nbits(nbits) \ - (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0) - static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); From 78e48f0667ff11ee444e057c757896062b6ad06b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:00 -0700 Subject: [PATCH 1283/1379] tools: sync small_const_nbits() macro with the kernel Sync implementation with the kernel and move the macro from tools/include/linux/bitmap.h to tools/include/asm-generic/bitsperlong.h Link: https://lkml.kernel.org/r/20210401003153.97325-7-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/asm-generic/bitsperlong.h | 3 +++ tools/include/linux/bitmap.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/include/asm-generic/bitsperlong.h b/tools/include/asm-generic/bitsperlong.h index 8f2283052333..2093d56ddd11 100644 --- a/tools/include/asm-generic/bitsperlong.h +++ b/tools/include/asm-generic/bitsperlong.h @@ -18,4 +18,7 @@ #define BITS_PER_LONG_LONG 64 #endif +#define small_const_nbits(nbits) \ + (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG && (nbits) > 0) + #endif /* __ASM_GENERIC_BITS_PER_LONG */ diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 4aabc23ec747..330dbf7509cc 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -22,9 +22,6 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) -#define small_const_nbits(nbits) \ - (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) - static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) From 5c88af59f9abc202648a431428ad9d32e5d2a201 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:03 -0700 Subject: [PATCH 1284/1379] lib: inline _find_next_bit() wrappers lib/find_bit.c declares five single-line wrappers for _find_next_bit(). We may turn those wrappers to inline functions. It eliminates unneeded function calls and opens room for compile-time optimizations. Link: https://lkml.kernel.org/r/20210401003153.97325-8-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Acked-by: Andy Shevchenko Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bitops/find.h | 28 ++++++++++++---- include/asm-generic/bitops/le.h | 17 +++++++--- lib/find_bit.c | 56 ++----------------------------- 3 files changed, 37 insertions(+), 64 deletions(-) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 9fdf21302fdf..7ad70dab8e93 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -2,6 +2,10 @@ #ifndef _ASM_GENERIC_BITOPS_FIND_H_ #define _ASM_GENERIC_BITOPS_FIND_H_ +extern unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert, unsigned long le); + #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region @@ -12,8 +16,12 @@ * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_bit(const unsigned long *addr, unsigned long - size, unsigned long offset); +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); +} #endif #ifndef find_next_and_bit @@ -27,9 +35,13 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_and_bit(const unsigned long *addr1, +static inline +unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, - unsigned long offset); + unsigned long offset) +{ + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); +} #endif #ifndef find_next_zero_bit @@ -42,8 +54,12 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1, * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ -extern unsigned long find_next_zero_bit(const unsigned long *addr, unsigned - long size, unsigned long offset); +static inline +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); +} #endif #ifdef CONFIG_GENERIC_FIND_FIRST_BIT diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h index 188d3eba3ace..21305f6cea0b 100644 --- a/include/asm-generic/bitops/le.h +++ b/include/asm-generic/bitops/le.h @@ -2,6 +2,7 @@ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ +#include #include #include @@ -32,13 +33,21 @@ static inline unsigned long find_first_zero_bit_le(const void *addr, #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) #ifndef find_next_zero_bit_le -extern unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset); +static inline +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); +} #endif #ifndef find_next_bit_le -extern unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset); +static inline +unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); +} #endif #ifndef find_first_zero_bit_le diff --git a/lib/find_bit.c b/lib/find_bit.c index f67f86fd2f62..b03a101367f8 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -29,7 +29,7 @@ * searching it for one bits. * - The optional "addr2", which is anded with "addr1" if present. */ -static unsigned long _find_next_bit(const unsigned long *addr1, +unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le) { @@ -68,37 +68,7 @@ static unsigned long _find_next_bit(const unsigned long *addr1, return min(start + __ffs(tmp), nbits); } -#endif - -#ifndef find_next_bit -/* - * Find the next set bit in a memory region. - */ -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr, NULL, size, offset, 0UL, 0); -} -EXPORT_SYMBOL(find_next_bit); -#endif - -#ifndef find_next_zero_bit -unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); -} -EXPORT_SYMBOL(find_next_zero_bit); -#endif - -#if !defined(find_next_and_bit) -unsigned long find_next_and_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); -} -EXPORT_SYMBOL(find_next_and_bit); +EXPORT_SYMBOL(_find_next_bit); #endif #ifndef find_first_bit @@ -157,28 +127,6 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) EXPORT_SYMBOL(find_last_bit); #endif -#ifdef __BIG_ENDIAN - -#ifndef find_next_zero_bit_le -unsigned long find_next_zero_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); -} -EXPORT_SYMBOL(find_next_zero_bit_le); -#endif - -#ifndef find_next_bit_le -unsigned long find_next_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - return _find_next_bit(addr, NULL, size, offset, 0UL, 1); -} -EXPORT_SYMBOL(find_next_bit_le); -#endif - -#endif /* __BIG_ENDIAN */ - unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, unsigned long size, unsigned long offset) { From ea81c1ef441733ee779d776292d6269a97c5d2e1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:07 -0700 Subject: [PATCH 1285/1379] tools: sync find_next_bit implementation Sync the implementation with recent kernel changes. Link: https://lkml.kernel.org/r/20210401003153.97325-9-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/asm-generic/bitops/find.h | 27 ++++++++++--- tools/lib/find_bit.c | 52 ++++++++++--------------- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/asm-generic/bitops/find.h index 16ed1982cb34..9fe62d10b084 100644 --- a/tools/include/asm-generic/bitops/find.h +++ b/tools/include/asm-generic/bitops/find.h @@ -2,6 +2,10 @@ #ifndef _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ #define _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ +extern unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert, unsigned long le); + #ifndef find_next_bit /** * find_next_bit - find the next set bit in a memory region @@ -12,8 +16,12 @@ * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_bit(const unsigned long *addr, unsigned long - size, unsigned long offset); +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); +} #endif #ifndef find_next_and_bit @@ -27,13 +35,16 @@ extern unsigned long find_next_bit(const unsigned long *addr, unsigned long * Returns the bit number for the next set bit * If no bits are set, returns @size. */ -extern unsigned long find_next_and_bit(const unsigned long *addr1, +static inline +unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, - unsigned long offset); + unsigned long offset) +{ + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); +} #endif #ifndef find_next_zero_bit - /** * find_next_zero_bit - find the next cleared bit in a memory region * @addr: The address to base the search on @@ -43,8 +54,12 @@ extern unsigned long find_next_and_bit(const unsigned long *addr1, * Returns the bit number of the next zero bit * If no bits are zero, returns @size. */ +static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset); + unsigned long offset) +{ + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); +} #endif #ifndef find_first_bit diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index ac37022e9486..589fd2f26f94 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -28,11 +28,12 @@ * searching it for one bits. * - The optional "addr2", which is anded with "addr1" if present. */ -static inline unsigned long _find_next_bit(const unsigned long *addr1, +unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, - unsigned long start, unsigned long invert) + unsigned long start, unsigned long invert, unsigned long le) { - unsigned long tmp; + unsigned long tmp, mask; + (void) le; if (unlikely(start >= nbits)) return nbits; @@ -43,7 +44,19 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1, tmp ^= invert; /* Handle 1st word. */ - tmp &= BITMAP_FIRST_WORD_MASK(start); + mask = BITMAP_FIRST_WORD_MASK(start); + + /* + * Due to the lack of swab() in tools, and the fact that it doesn't + * need little-endian support, just comment it out + */ +#if (0) + if (le) + mask = swab(mask); +#endif + + tmp &= mask; + start = round_down(start, BITS_PER_LONG); while (!tmp) { @@ -57,18 +70,12 @@ static inline unsigned long _find_next_bit(const unsigned long *addr1, tmp ^= invert; } - return min(start + __ffs(tmp), nbits); -} +#if (0) + if (le) + tmp = swab(tmp); #endif -#ifndef find_next_bit -/* - * Find the next set bit in a memory region. - */ -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr, NULL, size, offset, 0UL); + return min(start + __ffs(tmp), nbits); } #endif @@ -105,20 +112,3 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) return size; } #endif - -#ifndef find_next_zero_bit -unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr, NULL, size, offset, ~0UL); -} -#endif - -#ifndef find_next_and_bit -unsigned long find_next_and_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long size, - unsigned long offset) -{ - return _find_next_bit(addr1, addr2, size, offset, 0UL); -} -#endif From 277a20a498d30753f5d8a607dbf967bc163552c1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:11 -0700 Subject: [PATCH 1286/1379] lib: add fast path for find_next_*_bit() Similarly to bitmap functions, find_next_*_bit() users will benefit if we'll handle a case of bitmaps that fit into a single word inline. In the very best case, the compiler may replace a function call with a few instructions. This is the quite typical find_next_bit() user: unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n + 1); } EXPORT_SYMBOL(cpumask_next); Currently, on ARM64 the generated code looks like this: 0000000000000000 : 0: a9bf7bfd stp x29, x30, [sp, #-16]! 4: 11000402 add w2, w0, #0x1 8: aa0103e0 mov x0, x1 c: d2800401 mov x1, #0x40 // #64 10: 910003fd mov x29, sp 14: 93407c42 sxtw x2, w2 18: 94000000 bl 0 1c: a8c17bfd ldp x29, x30, [sp], #16 20: d65f03c0 ret 24: d503201f nop After applying this patch: 0000000000000140 : 140: 11000400 add w0, w0, #0x1 144: 93407c00 sxtw x0, w0 148: f100fc1f cmp x0, #0x3f 14c: 54000168 b.hi 178 // b.pmore 150: f9400023 ldr x3, [x1] 154: 92800001 mov x1, #0xffffffffffffffff // #-1 158: 9ac02020 lsl x0, x1, x0 15c: 52800802 mov w2, #0x40 // #64 160: 8a030001 and x1, x0, x3 164: dac00020 rbit x0, x1 168: f100003f cmp x1, #0x0 16c: dac01000 clz x0, x0 170: 1a800040 csel w0, w2, w0, eq // eq = none 174: d65f03c0 ret 178: 52800800 mov w0, #0x40 // #64 17c: d65f03c0 ret find_next_bit() call is replaced with 6 instructions. find_next_bit() itself is 41 instructions plus function call overhead. Despite inlining, the scripts/bloat-o-meter report smaller .text size after applying the series: add/remove: 11/9 grow/shrink: 233/176 up/down: 5780/-6768 (-988) Link: https://lkml.kernel.org/r/20210401003153.97325-10-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Acked-by: Andy Shevchenko Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bitops/find.h | 30 ++++++++++++++++++++++++++++++ include/asm-generic/bitops/le.h | 21 +++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 7ad70dab8e93..4148c74a1e4d 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -20,6 +20,16 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); } #endif @@ -40,6 +50,16 @@ unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & *addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); } #endif @@ -58,6 +78,16 @@ static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); } #endif diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h index 21305f6cea0b..5a28629cbf4d 100644 --- a/include/asm-generic/bitops/le.h +++ b/include/asm-generic/bitops/le.h @@ -5,6 +5,7 @@ #include #include #include +#include #if defined(__LITTLE_ENDIAN) @@ -37,6 +38,16 @@ static inline unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); } #endif @@ -46,6 +57,16 @@ static inline unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); } #endif From 2cc7b6a44ac21d31b398b03f4845c53152070416 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:14 -0700 Subject: [PATCH 1287/1379] lib: add fast path for find_first_*_bit() and find_last_bit() Similarly to bitmap functions, users would benefit if we'll handle a case of small-size bitmaps that fit into a single word. While here, move the find_last_bit() declaration to bitops/find.h where other find_*_bit() functions sit. Link: https://lkml.kernel.org/r/20210401003153.97325-11-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Acked-by: Andy Shevchenko Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bitops/find.h | 50 ++++++++++++++++++++++++++++--- include/linux/bitops.h | 12 -------- lib/find_bit.c | 12 ++++---- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 4148c74a1e4d..0d132ee2a291 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -5,6 +5,9 @@ extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); +extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifndef find_next_bit /** @@ -102,8 +105,17 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ -extern unsigned long find_first_bit(const unsigned long *addr, - unsigned long size); +static inline +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_bit(addr, size); +} /** * find_first_zero_bit - find the first cleared bit in a memory region @@ -113,8 +125,17 @@ extern unsigned long find_first_bit(const unsigned long *addr, * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ -extern unsigned long find_first_zero_bit(const unsigned long *addr, - unsigned long size); +static inline +unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr | ~GENMASK(size - 1, 0); + + return val == ~0UL ? size : ffz(val); + } + + return _find_first_zero_bit(addr, size); +} #else /* CONFIG_GENERIC_FIND_FIRST_BIT */ #ifndef find_first_bit @@ -126,6 +147,27 @@ extern unsigned long find_first_zero_bit(const unsigned long *addr, #endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ +#ifndef find_last_bit +/** + * find_last_bit - find the last set bit in a memory region + * @addr: The address to start the search at + * @size: The number of bits to search + * + * Returns the bit number of the last set bit, or size. + */ +static inline +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __fls(val) : size; + } + + return _find_last_bit(addr, size); +} +#endif + /** * find_next_clump8 - find next 8-bit clump with set bits in a memory region * @clump: location to store copy of found clump diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a5a48303b0f1..26bf15e6cd35 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -286,17 +286,5 @@ static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, }) #endif -#ifndef find_last_bit -/** - * find_last_bit - find the last set bit in a memory region - * @addr: The address to start the search at - * @size: The number of bits to search - * - * Returns the bit number of the last set bit, or size. - */ -extern unsigned long find_last_bit(const unsigned long *addr, - unsigned long size); -#endif - #endif /* __KERNEL__ */ #endif diff --git a/lib/find_bit.c b/lib/find_bit.c index b03a101367f8..0f8e2e369b1d 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -75,7 +75,7 @@ EXPORT_SYMBOL(_find_next_bit); /* * Find the first set bit in a memory region. */ -unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) { unsigned long idx; @@ -86,14 +86,14 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) return size; } -EXPORT_SYMBOL(find_first_bit); +EXPORT_SYMBOL(_find_first_bit); #endif #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. */ -unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size) { unsigned long idx; @@ -104,11 +104,11 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) return size; } -EXPORT_SYMBOL(find_first_zero_bit); +EXPORT_SYMBOL(_find_first_zero_bit); #endif #ifndef find_last_bit -unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +unsigned long _find_last_bit(const unsigned long *addr, unsigned long size) { if (size) { unsigned long val = BITMAP_LAST_WORD_MASK(size); @@ -124,7 +124,7 @@ unsigned long find_last_bit(const unsigned long *addr, unsigned long size) } return size; } -EXPORT_SYMBOL(find_last_bit); +EXPORT_SYMBOL(_find_last_bit); #endif unsigned long find_next_clump8(unsigned long *clump, const unsigned long *addr, From eaae7841ba83bb42dcac3177dc65f8dd974e6c0b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:18 -0700 Subject: [PATCH 1288/1379] tools: sync lib/find_bit implementation Add fast paths to find_*_bit() functions as per kernel implementation. Link: https://lkml.kernel.org/r/20210401003153.97325-12-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/asm-generic/bitops/find.h | 58 +++++++++++++++++++++++-- tools/lib/find_bit.c | 4 +- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/asm-generic/bitops/find.h index 9fe62d10b084..6481fd11012a 100644 --- a/tools/include/asm-generic/bitops/find.h +++ b/tools/include/asm-generic/bitops/find.h @@ -5,6 +5,9 @@ extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); +extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); #ifndef find_next_bit /** @@ -20,6 +23,16 @@ static inline unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); } #endif @@ -40,6 +53,16 @@ unsigned long find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & *addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); } #endif @@ -58,6 +81,16 @@ static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); } #endif @@ -72,8 +105,17 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, * Returns the bit number of the first set bit. * If no bits are set, returns @size. */ -extern unsigned long find_first_bit(const unsigned long *addr, - unsigned long size); +static inline +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_bit(addr, size); +} #endif /* find_first_bit */ @@ -87,7 +129,17 @@ extern unsigned long find_first_bit(const unsigned long *addr, * Returns the bit number of the first cleared bit. * If no bits are zero, returns @size. */ -unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size); +static inline +unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr | ~GENMASK(size - 1, 0); + + return val == ~0UL ? size : ffz(val); + } + + return _find_first_zero_bit(addr, size); +} #endif #endif /*_TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ */ diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index 589fd2f26f94..109aa7ffcf97 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -83,7 +83,7 @@ unsigned long _find_next_bit(const unsigned long *addr1, /* * Find the first set bit in a memory region. */ -unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) { unsigned long idx; @@ -100,7 +100,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) /* * Find the first cleared bit in a memory region. */ -unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size) { unsigned long idx; From 550eb38bde07fb71a1d877c2ab284f0cf926d327 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 6 May 2021 18:03:22 -0700 Subject: [PATCH 1289/1379] MAINTAINERS: add entry for the bitmap API Add myself as maintainer for bitmap API and Andy and Rasmus as reviewers. Link: https://lkml.kernel.org/r/20210401003153.97325-13-yury.norov@gmail.com Signed-off-by: Yury Norov Acked-by: Andy Shevchenko Acked-by: Rasmus Villemoes Cc: Alexey Klimov Cc: Arnd Bergmann Cc: David Sterba Cc: Dennis Zhou Cc: Geert Uytterhoeven Cc: Jianpeng Ma Cc: Joe Perches Cc: John Paul Adrian Glaubitz Cc: Josh Poimboeuf Cc: Rich Felker Cc: Stefano Brivio Cc: Wei Yang Cc: Wolfram Sang Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 591897e03525..2b4f29e2aebd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3205,6 +3205,22 @@ F: Documentation/filesystems/bfs.rst F: fs/bfs/ F: include/uapi/linux/bfs_fs.h +BITMAP API +M: Yury Norov +R: Andy Shevchenko +R: Rasmus Villemoes +S: Maintained +F: include/asm-generic/bitops/find.h +F: include/linux/bitmap.h +F: lib/bitmap.c +F: lib/find_bit.c +F: lib/find_bit_benchmark.c +F: lib/test_bitmap.c +F: tools/include/asm-generic/bitops/find.h +F: tools/include/linux/bitmap.h +F: tools/lib/bitmap.c +F: tools/lib/find_bit.c + BLINKM RGB LED DRIVER M: Jan-Simon Moeller S: Maintained From 0523c6922e8bd8d31d3377a56d57730d448b85a8 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 6 May 2021 18:03:25 -0700 Subject: [PATCH 1290/1379] lib/bch.c: fix a typo in the file bch.c s/buid/build/ Link: https://lkml.kernel.org/r/20210301123129.18754-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bch.c b/lib/bch.c index 7c031ee8b93b..c8095f30f254 100644 --- a/lib/bch.c +++ b/lib/bch.c @@ -584,7 +584,7 @@ static int find_affine4_roots(struct bch_control *bch, unsigned int a, k = a_log(bch, a); rows[0] = c; - /* buid linear system to solve X^4+aX^2+bX+c = 0 */ + /* build linear system to solve X^4+aX^2+bX+c = 0 */ for (i = 0; i < m; i++) { rows[i+1] = bch->a_pow_tab[4*i]^ (a ? bch->a_pow_tab[mod_s(bch, k)] : 0)^ From b8cf20277941f6954f12a8d5a54eb334c806a6a3 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Thu, 6 May 2021 18:03:28 -0700 Subject: [PATCH 1291/1379] lib: fix inconsistent indenting in process_bit1() Smatch gives the warning: lib/decompress_unlzma.c:395 process_bit1() warn: inconsistent indenting Link: https://lkml.kernel.org/r/1614567775-4478-1-git-send-email-wangqing@vivo.com Signed-off-by: Wang Qing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/decompress_unlzma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c index 1cf409ef8d04..20a858031f12 100644 --- a/lib/decompress_unlzma.c +++ b/lib/decompress_unlzma.c @@ -391,7 +391,7 @@ static inline int INIT process_bit0(struct writer *wr, struct rc *rc, static inline int INIT process_bit1(struct writer *wr, struct rc *rc, struct cstate *cst, uint16_t *p, int pos_state, uint16_t *prob) { - int offset; + int offset; uint16_t *prob_len; int num_bits; int len; From e89b6358052de202e53e47623f50b6d28182ccdf Mon Sep 17 00:00:00 2001 From: ToastC Date: Thu, 6 May 2021 18:03:31 -0700 Subject: [PATCH 1292/1379] lib/list_sort.c: fix typo in function description Replace beautiully with beautifully Link: https://lkml.kernel.org/r/20210315090633.9759-1-mrtoastcheng@gmail.com Signed-off-by: ShihCheng Tu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index a926d96ffd44..1e1e37762799 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -137,7 +137,7 @@ static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head, * * * The merging is controlled by "count", the number of elements in the - * pending lists. This is beautiully simple code, but rather subtle. + * pending lists. This is beautifully simple code, but rather subtle. * * Each time we increment "count", we set one bit (bit k) and clear * bits k-1 .. 0. Each time this happens (except the very first time From ade29d4fdbe675d72ee6115baaf3b3382942fd12 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 6 May 2021 18:03:34 -0700 Subject: [PATCH 1293/1379] lib/genalloc.c: Fix a typo s/macthing/matching/ Link: https://lkml.kernel.org/r/20210326131530.30481-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/genalloc.c b/lib/genalloc.c index 5dcf9cdcbc46..8273760884a7 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -735,7 +735,7 @@ EXPORT_SYMBOL(gen_pool_first_fit_order_align); /** * gen_pool_best_fit - find the best fitting region of memory - * macthing the size requirement (no alignment constraint) + * matching the size requirement (no alignment constraint) * @map: The address to base the search on * @size: The bitmap size in bits * @start: The bitnumber to start searching at From e18baa7cc3598999317d6c2fe255756f6b3b7562 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 6 May 2021 18:03:37 -0700 Subject: [PATCH 1294/1379] lib: crc8: pointer to data block should be const crc8() does not change the data passed to it, so the pointer argument should be declared const. This avoids callers that receive const data having to cast it to a non-const pointer to call crc8(). Link: https://lkml.kernel.org/r/20210329122409.3291-1-rf@opensource.cirrus.com Signed-off-by: Richard Fitzgerald Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/crc8.h | 2 +- lib/crc8.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/crc8.h b/include/linux/crc8.h index 13c8dabb0441..674045c59a04 100644 --- a/include/linux/crc8.h +++ b/include/linux/crc8.h @@ -96,6 +96,6 @@ void crc8_populate_msb(u8 table[CRC8_TABLE_SIZE], u8 polynomial); * Williams, Ross N., rossross.net * (see URL http://www.ross.net/crc/download/crc_v3.txt). */ -u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc); +u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc); #endif /* __CRC8_H_ */ diff --git a/lib/crc8.c b/lib/crc8.c index 595a5a75e3cd..1ad8e501d9b6 100644 --- a/lib/crc8.c +++ b/lib/crc8.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(crc8_populate_lsb); * @nbytes: number of bytes in data buffer. * @crc: previous returned crc8 value. */ -u8 crc8(const u8 table[CRC8_TABLE_SIZE], u8 *pdata, size_t nbytes, u8 crc) +u8 crc8(const u8 table[CRC8_TABLE_SIZE], const u8 *pdata, size_t nbytes, u8 crc) { /* loop over the buffer data */ while (nbytes-- > 0) From 78564b9434878d686c5f88c4488b20cccbcc42bc Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 6 May 2021 18:03:40 -0700 Subject: [PATCH 1295/1379] lib: stackdepot: turn depot_lock spinlock to raw_spinlock In RT system, the spin_lock will be replaced by sleepable rt_mutex lock, in __call_rcu(), disable interrupts before calling kasan_record_aux_stack(), will trigger this calltrace: BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:951 in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 19, name: pgdatinit0 Call Trace: ___might_sleep.cold+0x1b2/0x1f1 rt_spin_lock+0x3b/0xb0 stack_depot_save+0x1b9/0x440 kasan_save_stack+0x32/0x40 kasan_record_aux_stack+0xa5/0xb0 __call_rcu+0x117/0x880 __exit_signal+0xafb/0x1180 release_task+0x1d6/0x480 exit_notify+0x303/0x750 do_exit+0x678/0xcf0 kthread+0x364/0x4f0 ret_from_fork+0x22/0x30 Replace spinlock with raw_spinlock. Link: https://lkml.kernel.org/r/20210329084009.27013-1-qiang.zhang@windriver.com Signed-off-by: Zqiang Reported-by: Andrew Halaney Cc: Alexander Potapenko Cc: Gustavo A. R. Silva Cc: Vijayanand Jitta Cc: Vinayak Menon Cc: Yogesh Lal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/stackdepot.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 49f67a0c6e5d..df9179f4f441 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -71,7 +71,7 @@ static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; static int depot_index; static int next_slab_inited; static size_t depot_offset; -static DEFINE_SPINLOCK(depot_lock); +static DEFINE_RAW_SPINLOCK(depot_lock); static bool init_stack_slab(void **prealloc) { @@ -305,7 +305,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, prealloc = page_address(page); } - spin_lock_irqsave(&depot_lock, flags); + raw_spin_lock_irqsave(&depot_lock, flags); found = find_stack(*bucket, entries, nr_entries, hash); if (!found) { @@ -329,7 +329,7 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, WARN_ON(!init_stack_slab(&prealloc)); } - spin_unlock_irqrestore(&depot_lock, flags); + raw_spin_unlock_irqrestore(&depot_lock, flags); exit: if (prealloc) { /* Nobody used this memory, ok to free it. */ From db65a867fd40fb33d4a7d619e95f2b796e798999 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 6 May 2021 18:03:43 -0700 Subject: [PATCH 1296/1379] lib/percpu_counter: tame kernel-doc compile warning commit 3e8f399da490 ("writeback: rework wb_[dec|inc]_stat family of functions") add some function description of percpu_counter_add_batch. but the double '*' in comments means a kernel-doc format comment which isn't right. Since the whole file of lib/percpu_counter.c has no any other kernel-doc format comments, we'd better to remove this incomplete one to tame the kernel-doc warning: lib/percpu_counter.c:83: warning: Function parameter or member 'fbc' not described in 'percpu_counter_add_batch' lib/percpu_counter.c:83: warning: Function parameter or member 'amount' not described in 'percpu_counter_add_batch' lib/percpu_counter.c:83: warning: Function parameter or member 'batch' not described in 'percpu_counter_add_batch' Link: https://lkml.kernel.org/r/20210405135505.132446-1-alexs@kernel.org Signed-off-by: Alex Shi Cc: Nikolay Borisov Cc: Stephen Boyd Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/percpu_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 00f666d94486..ed610b75dc32 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -72,7 +72,7 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) } EXPORT_SYMBOL(percpu_counter_set); -/** +/* * This function is both preempt and irq safe. The former is due to explicit * preemption disable. The latter is guaranteed by the fact that the slow path * is explicitly protected by an irq-safe spinlock whereas the fast patch uses From 9d6ecac093a2412822bdb5376b9bd434d45939af Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 6 May 2021 18:03:46 -0700 Subject: [PATCH 1297/1379] lib/genalloc: add parameter description to fix doc compile warning Commit 52fbf1134d47 ("lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk") added a new parameter 'start_addr' w/o description for it. That causes some doc compile warning: lib/genalloc.c:649: warning: Function parameter or member 'start_addr' not described in 'gen_pool_first_fit' lib/genalloc.c:667: warning: Function parameter or member 'start_addr' not described in 'gen_pool_first_fit_align' lib/genalloc.c:694: warning: Function parameter or member 'start_addr' not described in 'gen_pool_fixed_alloc' lib/genalloc.c:729: warning: Function parameter or member 'start_addr' not described in 'gen_pool_first_fit_order_align' lib/genalloc.c:752: warning: Function parameter or member 'start_addr' not described in 'gen_pool_best_fit' This fixes it by adding a parameter descriptions. Link: https://lkml.kernel.org/r/20210405132021.131231-1-alexs@kernel.org Signed-off-by: Alex Shi Cc: Alexey Skidanov Cc: Huang Shijie Cc: Alex Shi Cc: Bhaskar Chowdhury Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/genalloc.c b/lib/genalloc.c index 8273760884a7..9a57257988c7 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -642,6 +642,7 @@ EXPORT_SYMBOL(gen_pool_set_algo); * @nr: The number of zeroed bits we're looking for * @data: additional data - unused * @pool: pool to find the fit region memory from + * @start_addr: not used in this function */ unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, @@ -660,6 +661,7 @@ EXPORT_SYMBOL(gen_pool_first_fit); * @nr: The number of zeroed bits we're looking for * @data: data for alignment * @pool: pool to get order from + * @start_addr: start addr of alloction chunk */ unsigned long gen_pool_first_fit_align(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, @@ -687,6 +689,7 @@ EXPORT_SYMBOL(gen_pool_first_fit_align); * @nr: The number of zeroed bits we're looking for * @data: data for alignment * @pool: pool to get order from + * @start_addr: not used in this function */ unsigned long gen_pool_fixed_alloc(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data, @@ -721,6 +724,7 @@ EXPORT_SYMBOL(gen_pool_fixed_alloc); * @nr: The number of zeroed bits we're looking for * @data: additional data - unused * @pool: pool to find the fit region memory from + * @start_addr: not used in this function */ unsigned long gen_pool_first_fit_order_align(unsigned long *map, unsigned long size, unsigned long start, @@ -742,6 +746,7 @@ EXPORT_SYMBOL(gen_pool_first_fit_order_align); * @nr: The number of zeroed bits we're looking for * @data: additional data - unused * @pool: pool to find the fit region memory from + * @start_addr: not used in this function * * Iterate over the bitmap to find the smallest free region * which we can allocate the memory. From edd9334c8dfed7341066a25f79dcaab6893465d9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 6 May 2021 18:03:49 -0700 Subject: [PATCH 1298/1379] lib: parser: clean up kernel-doc Mark match_uint() as kernel-doc notation since it is already fully annotated as such. Use % prefix on constants in kernel-doc comments. Convert function return descriptions to use the "Return:" kernel-doc notation. Link: https://lkml.kernel.org/r/20210407034514.5651-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Alexander Viro Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/parser.c | 61 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 23 deletions(-) diff --git a/lib/parser.c b/lib/parser.c index 7a5769db389f..f1a6d90b8c34 100644 --- a/lib/parser.c +++ b/lib/parser.c @@ -98,7 +98,7 @@ static int match_one(char *s, const char *p, substring_t args[]) * locations. * * Description: Detects which if any of a set of token strings has been passed - * to it. Tokens can include up to MAX_OPT_ARGS instances of basic c-style + * to it. Tokens can include up to %MAX_OPT_ARGS instances of basic c-style * format identifiers which will be taken into account when matching the * tokens, and whose locations will be returned in the @args array. */ @@ -120,8 +120,10 @@ EXPORT_SYMBOL(match_token); * @base: base to use when converting string * * Description: Given a &substring_t and a base, attempts to parse the substring - * as a number in that base. On success, sets @result to the integer represented - * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + * as a number in that base. + * + * Return: On success, sets @result to the integer represented by the + * string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. */ static int match_number(substring_t *s, int *result, int base) { @@ -153,8 +155,10 @@ static int match_number(substring_t *s, int *result, int base) * @base: base to use when converting string * * Description: Given a &substring_t and a base, attempts to parse the substring - * as a number in that base. On success, sets @result to the integer represented - * by the string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + * as a number in that base. + * + * Return: On success, sets @result to the integer represented by the + * string and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. */ static int match_u64int(substring_t *s, u64 *result, int base) { @@ -178,9 +182,10 @@ static int match_u64int(substring_t *s, u64 *result, int base) * @s: substring_t to be scanned * @result: resulting integer on success * - * Description: Attempts to parse the &substring_t @s as a decimal integer. On - * success, sets @result to the integer represented by the string and returns 0. - * Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + * Description: Attempts to parse the &substring_t @s as a decimal integer. + * + * Return: On success, sets @result to the integer represented by the string + * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. */ int match_int(substring_t *s, int *result) { @@ -188,14 +193,15 @@ int match_int(substring_t *s, int *result) } EXPORT_SYMBOL(match_int); -/* +/** * match_uint - scan a decimal representation of an integer from a substring_t * @s: substring_t to be scanned * @result: resulting integer on success * - * Description: Attempts to parse the &substring_t @s as a decimal integer. On - * success, sets @result to the integer represented by the string and returns 0. - * Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + * Description: Attempts to parse the &substring_t @s as a decimal integer. + * + * Return: On success, sets @result to the integer represented by the string + * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. */ int match_uint(substring_t *s, unsigned int *result) { @@ -217,9 +223,10 @@ EXPORT_SYMBOL(match_uint); * @result: resulting unsigned long long on success * * Description: Attempts to parse the &substring_t @s as a long decimal - * integer. On success, sets @result to the integer represented by the - * string and returns 0. - * Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + * integer. + * + * Return: On success, sets @result to the integer represented by the string + * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. */ int match_u64(substring_t *s, u64 *result) { @@ -232,9 +239,10 @@ EXPORT_SYMBOL(match_u64); * @s: substring_t to be scanned * @result: resulting integer on success * - * Description: Attempts to parse the &substring_t @s as an octal integer. On - * success, sets @result to the integer represented by the string and returns - * 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + * Description: Attempts to parse the &substring_t @s as an octal integer. + * + * Return: On success, sets @result to the integer represented by the string + * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. */ int match_octal(substring_t *s, int *result) { @@ -248,8 +256,9 @@ EXPORT_SYMBOL(match_octal); * @result: resulting integer on success * * Description: Attempts to parse the &substring_t @s as a hexadecimal integer. - * On success, sets @result to the integer represented by the string and - * returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. + * + * Return: On success, sets @result to the integer represented by the string + * and returns 0. Returns -ENOMEM, -EINVAL, or -ERANGE on failure. */ int match_hex(substring_t *s, int *result) { @@ -263,10 +272,11 @@ EXPORT_SYMBOL(match_hex); * @str: the string to be parsed * * Description: Parse the string @str to check if matches wildcard - * pattern @pattern. The pattern may contain two type wildcardes: + * pattern @pattern. The pattern may contain two types of wildcards: * '*' - matches zero or more characters * '?' - matches one character - * If it's matched, return true, else return false. + * + * Return: If the @str matches the @pattern, return true, else return false. */ bool match_wildcard(const char *pattern, const char *str) { @@ -316,7 +326,9 @@ EXPORT_SYMBOL(match_wildcard); * * Description: Copy the characters in &substring_t @src to the * c-style string @dest. Copy no more than @size - 1 characters, plus - * the terminating NUL. Return length of @src. + * the terminating NUL. + * + * Return: length of @src. */ size_t match_strlcpy(char *dest, const substring_t *src, size_t size) { @@ -338,6 +350,9 @@ EXPORT_SYMBOL(match_strlcpy); * Description: Allocates and returns a string filled with the contents of * the &substring_t @s. The caller is responsible for freeing the returned * string with kfree(). + * + * Return: the address of the newly allocated NUL-terminated string or + * %NULL on error. */ char *match_strdup(const substring_t *s) { From e13d04ec45b07388d3c38c0e18a4d0aa4841b0c3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 6 May 2021 18:03:52 -0700 Subject: [PATCH 1299/1379] include/linux/compat.h: remove unneeded declaration from COMPAT_SYSCALL_DEFINEx() compat_sys##name is declared twice, just one line below. With this removal SYSCALL_DEFINEx() (defined in ) and COMPAT_SYSCALL_DEFINEx() look symmetrical. Link: https://lkml.kernel.org/r/20210223114924.854794-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/compat.h b/include/linux/compat.h index f0d2dd35d408..cfac4ec9a7df 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -75,7 +75,6 @@ __diag_push(); \ __diag_ignore(GCC, 8, "-Wattribute-alias", \ "Type aliasing is used to sanitize syscall arguments");\ - asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_compat_sys##name)))); \ ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ From fbe745416d11b1a17c35a7c7f0ef6f4dbe5a7573 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 6 May 2021 18:03:55 -0700 Subject: [PATCH 1300/1379] checkpatch: warn when missing newline in return sysfs_emit() formats return sysfs_emit() uses should include a newline. Suggest adding a newline when one is missing. Add one using --fix too. Link: https://lkml.kernel.org/r/aa1819fa5faf786573df298e5e2e7d357ba7d4ad.camel@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ccb412a74725..3870c8a01987 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -7198,6 +7198,17 @@ sub process { "Using $1 should generally have parentheses around the comparison\n" . $herecurr); } +# return sysfs_emit(foo, fmt, ...) fmt without newline + if ($line =~ /\breturn\s+sysfs_emit\s*\(\s*$FuncArg\s*,\s*($String)/ && + substr($rawline, $-[6], $+[6] - $-[6]) !~ /\\n"$/) { + my $offset = $+[6] - 1; + if (WARN("SYSFS_EMIT", + "return sysfs_emit(...) formats should include a terminating newline\n" . $herecurr) && + $fix) { + substr($fixed[$fixlinenr], $offset, 0) = '\\n'; + } + } + # nested likely/unlikely calls if ($line =~ /\b(?:(?:un)?likely)\s*\(\s*!?\s*(IS_ERR(?:_OR_NULL|_VALUE)?|WARN)/) { WARN("LIKELY_MISUSE", From 7b844345fc2a9c46f8bb8cdb7408c766dfcdd83d Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 6 May 2021 18:03:58 -0700 Subject: [PATCH 1301/1379] checkpatch: exclude four preprocessor sub-expressions from MACRO_ARG_REUSE __must_be_array, offsetof, sizeof_field and __stringify are all preprocessor macros and do not evaluate their arguments. As such, it is safe not to warn when arguments are being reused in those four sub-expressions. Exclude those so that they can pass checkpatch. Link: https://lkml.kernel.org/r/20210407105042.25380-1-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3870c8a01987..44b9dc330ac6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5829,7 +5829,7 @@ sub process { next if ($arg =~ /\.\.\./); next if ($arg =~ /^type$/i); my $tmp_stmt = $define_stmt; - $tmp_stmt =~ s/\b(sizeof|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; + $tmp_stmt =~ s/\b(__must_be_array|offsetof|sizeof|sizeof_field|__stringify|typeof|__typeof__|__builtin\w+|typecheck\s*\(\s*$Type\s*,|\#+)\s*\(*\s*$arg\s*\)*\b//g; $tmp_stmt =~ s/\#+\s*$arg\b//g; $tmp_stmt =~ s/\b$arg\s*\#\#//g; my $use_cnt = () = $tmp_stmt =~ /\b$arg\b/g; From 7e6cdd7fd94380a3b87b2ce087903b3722b3d0d6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 May 2021 18:04:01 -0700 Subject: [PATCH 1302/1379] checkpatch: improve ALLOC_ARRAY_ARGS test The devm_ variant of 'kcalloc()' and 'kmalloc_array()' are not tested Add the corresponding check. Link: https://lkml.kernel.org/r/205fc4847972fb6779abcc8818f39c14d1b45af1.1618595794.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 44b9dc330ac6..23697a6b1eaa 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -7006,7 +7006,7 @@ sub process { } # check for alloc argument mismatch - if ($line =~ /\b(kcalloc|kmalloc_array)\s*\(\s*sizeof\b/) { + if ($line =~ /\b((?:devm_)?(?:kcalloc|kmalloc_array))\s*\(\s*sizeof\b/) { WARN("ALLOC_ARRAY_ARGS", "$1 uses number as first arg, sizeof is generally wrong\n" . $herecurr); } From 1e3b918d1dd18bcea3df9339c2d8910ffa95686a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 6 May 2021 18:04:04 -0700 Subject: [PATCH 1303/1379] kselftest: introduce new epoll test case Patch series "fs/epoll: restore user-visible behavior upon event ready". This series tries to address a change in user visible behavior, reported in https://bugzilla.kernel.org/show_bug.cgi?id=208943. Epoll does not report an event to all the threads running epoll_wait() on the same epoll descriptor. Unsurprisingly, this was bisected back to 339ddb53d373 (fs/epoll: remove unnecessary wakeups of nested epoll), which has had various problems in the past, beyond only nested epoll usage. This patch (of 2): This incorporates the testcase originally reported in: https://bugzilla.kernel.org/show_bug.cgi?id=208943 Which ensures an event is reported to all threads blocked on the same epoll descriptor, otherwise only a single thread will receive the wakeup once the event become ready. Link: https://lkml.kernel.org/r/20210405231025.33829-1-dave@stgolabs.net Link: https://lkml.kernel.org/r/20210405231025.33829-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Jason Baron Cc: Roman Penyaev Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../filesystems/epoll/epoll_wakeup_test.c | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c index ad7fabd575f9..65ede506305c 100644 --- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c +++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c @@ -3449,4 +3449,48 @@ TEST(epoll63) close(sfd[1]); } +/* + * t0 t1 + * (ew) \ / (ew) + * e0 + * | (lt) + * s0 + */ +TEST(epoll64) +{ + pthread_t waiter[2]; + struct epoll_event e; + struct epoll_mtcontext ctx = { 0 }; + + signal(SIGUSR1, signal_handler); + + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0); + + ctx.efd[0] = epoll_create(1); + ASSERT_GE(ctx.efd[0], 0); + + e.events = EPOLLIN; + ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0); + + /* + * main will act as the emitter once both waiter threads are + * blocked and expects to both be awoken upon the ready event. + */ + ctx.main = pthread_self(); + ASSERT_EQ(pthread_create(&waiter[0], NULL, waiter_entry1a, &ctx), 0); + ASSERT_EQ(pthread_create(&waiter[1], NULL, waiter_entry1a, &ctx), 0); + + usleep(100000); + ASSERT_EQ(write(ctx.sfd[1], "w", 1), 1); + + ASSERT_EQ(pthread_join(waiter[0], NULL), 0); + ASSERT_EQ(pthread_join(waiter[1], NULL), 0); + + EXPECT_EQ(ctx.count, 2); + + close(ctx.efd[0]); + close(ctx.sfd[0]); + close(ctx.sfd[1]); +} + TEST_HARNESS_MAIN From 7fab29e356309ff93a4b30ecc466129682ec190b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 6 May 2021 18:04:07 -0700 Subject: [PATCH 1304/1379] fs/epoll: restore waking from ep_done_scan() Commit 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") changed the userspace visible behavior of exclusive waiters blocked on a common epoll descriptor upon a single event becoming ready. Previously, all tasks doing epoll_wait would awake, and now only one is awoken, potentially causing missed wakeups on applications that rely on this behavior, such as Apache Qpid. While the aforementioned commit aims at having only a wakeup single path in ep_poll_callback (with the exceptions of epoll_ctl cases), we need to restore the wakeup in what was the old ep_scan_ready_list() such that the next thread can be awoken, in a cascading style, after the waker's corresponding ep_send_events(). Link: https://lkml.kernel.org/r/20210405231025.33829-3-dave@stgolabs.net Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") Signed-off-by: Davidlohr Bueso Cc: Al Viro Cc: Jason Baron Cc: Roman Penyaev Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 73138ea68342..1e596e1d0bba 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -657,6 +657,12 @@ static void ep_done_scan(struct eventpoll *ep, */ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); + + if (!list_empty(&ep->rdllist)) { + if (waitqueue_active(&ep->wq)) + wake_up(&ep->wq); + } + write_unlock_irq(&ep->lock); } From b4ca4c01780b186a1abeff9ace665ea10c8545d3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 6 May 2021 18:04:10 -0700 Subject: [PATCH 1305/1379] isofs: fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a warning by explicitly adding a break statement instead of just letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Link: https://lkml.kernel.org/r/5b7caa73958588065fabc59032c340179b409ef5.1605896059.git.gustavoars@kernel.org Signed-off-by: Gustavo A. R. Silva Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/rock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 94ef92fe806c..4880146babaf 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -767,6 +767,7 @@ repeat: rs.cont_extent = isonum_733(rr->u.CE.extent); rs.cont_offset = isonum_733(rr->u.CE.offset); rs.cont_size = isonum_733(rr->u.CE.size); + break; default: break; } From 300563e6e01465df831b06f6b6587bfaffaf0642 Mon Sep 17 00:00:00 2001 From: Liu xuzhi Date: Thu, 6 May 2021 18:04:13 -0700 Subject: [PATCH 1306/1379] fs/nilfs2: fix misspellings using codespell tool Two typos are found out by codespell tool \ in 2217th and 2254th lines of segment.c: $ codespell ./fs/nilfs2/ ./segment.c:2217 :retured ==> returned ./segment.c:2254: retured ==> returned Fix two typos found by codespell. Link: https://lkml.kernel.org/r/1617864087-8198-1-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Liu xuzhi Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index cd4da9535aed..686c8ee7b29c 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2214,7 +2214,7 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) * nilfs_construct_segment - construct a logical segment * @sb: super block * - * Return Value: On success, 0 is retured. On errors, one of the following + * Return Value: On success, 0 is returned. On errors, one of the following * negative error code is returned. * * %-EROFS - Read only filesystem. @@ -2251,7 +2251,7 @@ int nilfs_construct_segment(struct super_block *sb) * @start: start byte offset * @end: end byte offset (inclusive) * - * Return Value: On success, 0 is retured. On errors, one of the following + * Return Value: On success, 0 is returned. On errors, one of the following * negative error code is returned. * * %-EROFS - Read only filesystem. From 312f79c486e9860ec4c2ec4ef5b89fd518d9c833 Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 6 May 2021 18:04:16 -0700 Subject: [PATCH 1307/1379] nilfs2: fix typos in comments numer -> number in fs/nilfs2/cpfile.c Decription -> Description in fs/nilfs2/ioctl.c isntance -> instance in fs/nilfs2/the_nilfs.c Link: https://lkml.kernel.org/r/1617942951-14631-1-git-send-email-konishi.ryusuke@gmail.com Link: https://lore.kernel.org/r/20210409022519.176988-1-lujialin4@huawei.com Signed-off-by: Lu Jialin Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/cpfile.c | 2 +- fs/nilfs2/ioctl.c | 4 ++-- fs/nilfs2/the_nilfs.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 025fb082575a..ce144776b4ef 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -293,7 +293,7 @@ void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, * nilfs_cpfile_delete_checkpoints - delete checkpoints * @cpfile: inode of checkpoint file * @start: start checkpoint number - * @end: end checkpoint numer + * @end: end checkpoint number * * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in * the period from @start to @end, excluding @end itself. The checkpoints diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 3fcb9357bbfd..640ac8fe891e 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1043,7 +1043,7 @@ out: * @inode: inode object * @argp: pointer on argument from userspace * - * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It + * Description: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which * performs the actual trim operation. * @@ -1085,7 +1085,7 @@ static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp) * @inode: inode object * @argp: pointer on argument from userspace * - * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit + * Description: nilfs_ioctl_set_alloc_range() function defines lower limit * of segments in bytes and upper limit of segments in bytes. * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility. * diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 221a1cc597f0..8b7b01a380ce 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -195,7 +195,7 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs, /** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released - * @sb: super block isntance used to recover past segment + * @sb: super block instance used to recover past segment * * load_nilfs() searches and load the latest super root, * attaches the last segment, and does recovery if needed. From c1e4726f4654407bfd509bb8fc7324b96f2f9285 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 6 May 2021 18:04:19 -0700 Subject: [PATCH 1308/1379] hpfs: replace one-element array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Also, this helps with the ongoing efforts to enable -Warray-bounds by fixing the following warning: CC [M] fs/hpfs/dir.o fs/hpfs/dir.c: In function `hpfs_readdir': fs/hpfs/dir.c:163:41: warning: array subscript 1 is above array bounds of `u8[1]' {aka `unsigned char[1]'} [-Warray-bounds] 163 | || de ->name[0] != 1 || de->name[1] != 1)) | ~~~~~~~~^~~ [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/109 Link: https://lkml.kernel.org/r/20210326173510.GA81212@embeddedor Signed-off-by: Gustavo A. R. Silva Cc: Mikulas Patocka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hpfs/hpfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 302f45101a96..d92c4af3e1b4 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -356,7 +356,8 @@ struct hpfs_dirent { u8 no_of_acls; /* number of ACL's (low 3 bits) */ u8 ix; /* code page index (of filename), see struct code_page_data */ - u8 namelen, name[1]; /* file name */ + u8 namelen; /* file name length */ + u8 name[]; /* file name */ /* dnode_secno down; btree down pointer, if present, follows name on next word boundary, or maybe it precedes next dirent, which is on a word boundary. */ From 5449162ac001a926ad8884882b071601df5edb44 Mon Sep 17 00:00:00 2001 From: Jim Newsome Date: Thu, 6 May 2021 18:04:22 -0700 Subject: [PATCH 1309/1379] do_wait: make PIDTYPE_PID case O(1) instead of O(n) Add a special-case when waiting on a pid (via waitpid, waitid, wait4, etc) to avoid doing an O(n) scan of children and tracees, and instead do an O(1) lookup. This improves performance when waiting on a pid from a thread group with many children and/or tracees. Time to fork and then call waitpid on the child, from a task that already has N children [1]: N | Before | After -----|---------|------ 1 | 74 us | 74 us 20 | 72 us | 75 us 100 | 83 us | 77 us 500 | 99 us | 74 us 1000 | 179 us | 75 us 5000 | 804 us | 79 us 8000 | 1268 us | 78 us [1]: https://lkml.org/lkml/2021/3/12/1567 This can make a substantial performance improvement for applications with a thread that has many children or tracees and frequently needs to wait on them. Tools that use ptrace to intercept syscalls for a large number of processes are likely to fall into this category. In particular this patch was developed while building a ptrace-based second generation of the Shadow emulator [2], for which it allows us to avoid quadratic scaling (without having to use a workaround that introduces a ~40% performance penalty) [3]. Other examples of tools that fall into this category which this patch may help include User Mode Linux [4] and DetTrace [5]. [2]: https://shadow.github.io/ [3]: https://github.com/shadow/shadow/issues/1134#issuecomment-798992292 [4]: https://en.wikipedia.org/wiki/User-mode_Linux [5]: https://github.com/dettrace/dettrace Link: https://lkml.kernel.org/r/20210314231544.9379-1-jnewsome@torproject.org Signed-off-by: James Newsome Reviewed-by: Oleg Nesterov Cc: "Eric W . Biederman" Cc: Christian Brauner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 67 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 0596526ed9ea..fd1c04193e18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1440,9 +1440,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent) TASK_INTERRUPTIBLE, p); } +static bool is_effectively_child(struct wait_opts *wo, bool ptrace, + struct task_struct *target) +{ + struct task_struct *parent = + !ptrace ? target->real_parent : target->parent; + + return current == parent || (!(wo->wo_flags & __WNOTHREAD) && + same_thread_group(current, parent)); +} + +/* + * Optimization for waiting on PIDTYPE_PID. No need to iterate through child + * and tracee lists to find the target task. + */ +static int do_wait_pid(struct wait_opts *wo) +{ + bool ptrace; + struct task_struct *target; + int retval; + + ptrace = false; + target = pid_task(wo->wo_pid, PIDTYPE_TGID); + if (target && is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + ptrace = true; + target = pid_task(wo->wo_pid, PIDTYPE_PID); + if (target && target->ptrace && + is_effectively_child(wo, ptrace, target)) { + retval = wait_consider_task(wo, ptrace, target); + if (retval) + return retval; + } + + return 0; +} + static long do_wait(struct wait_opts *wo) { - struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); @@ -1464,19 +1503,27 @@ repeat: set_current_state(TASK_INTERRUPTIBLE); read_lock(&tasklist_lock); - tsk = current; - do { - retval = do_wait_thread(wo, tsk); + + if (wo->wo_type == PIDTYPE_PID) { + retval = do_wait_pid(wo); if (retval) goto end; + } else { + struct task_struct *tsk = current; - retval = ptrace_do_wait(wo, tsk); - if (retval) - goto end; + do { + retval = do_wait_thread(wo, tsk); + if (retval) + goto end; - if (wo->wo_flags & __WNOTHREAD) - break; - } while_each_thread(current, tsk); + retval = ptrace_do_wait(wo, tsk); + if (retval) + goto end; + + if (wo->wo_flags & __WNOTHREAD) + break; + } while_each_thread(current, tsk); + } read_unlock(&tasklist_lock); notask: From a6895399380ab58d9efd0a0bec2fcb98d77e20bd Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Thu, 6 May 2021 18:04:25 -0700 Subject: [PATCH 1310/1379] kernel/fork.c: simplify copy_mm() All this can happen without a single goto. Link: https://lkml.kernel.org/r/2072685.XptgVkyDqn@devpool47 Signed-off-by: Rolf Eike Beer Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 771e0ea90499..784f7ca7c17e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1396,7 +1396,6 @@ fail_nomem: static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; - int retval; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; @@ -1423,21 +1422,15 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; - goto good_mm; + } else { + mm = dup_mm(tsk, current->mm); + if (!mm) + return -ENOMEM; } - retval = -ENOMEM; - mm = dup_mm(tsk, current->mm); - if (!mm) - goto fail_nomem; - -good_mm: tsk->mm = mm; tsk->active_mm = mm; return 0; - -fail_nomem: - return retval; } static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) From a8ca6b1388a91c79dad257a7cc0bc14c009312fe Mon Sep 17 00:00:00 2001 From: Xiaofeng Cao Date: Thu, 6 May 2021 18:04:28 -0700 Subject: [PATCH 1311/1379] kernel/fork.c: fix typos change 'ancestoral' to 'ancestral' change 'reuseable' to 'reusable' delete 'do' grammatically Link: https://lkml.kernel.org/r/20210317082031.11692-1-caoxiaofeng@yulong.com Signed-off-by: Xiaofeng Cao Reviewed-by: Christian Brauner Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 784f7ca7c17e..dc06afd725cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1145,7 +1145,7 @@ void mmput_async(struct mm_struct *mm) * invocations: in mmput() nobody alive left, in execve task is single * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the * mm->exe_file, but does so without using set_mm_exe_file() in order - * to do avoid the need for any locks. + * to avoid the need for any locks. */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { @@ -1736,7 +1736,7 @@ static int pidfd_release(struct inode *inode, struct file *file) * /proc//status where Pid and NSpid are always shown relative to * the pid namespace of the procfs instance. The difference becomes * obvious when sending around a pidfd between pid namespaces from a - * different branch of the tree, i.e. where no ancestoral relation is + * different branch of the tree, i.e. where no ancestral relation is * present between the pid namespaces: * - create two new pid namespaces ns1 and ns2 in the initial pid * namespace (also take care to create new mount namespaces in the @@ -2728,8 +2728,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) return false; /* - * - make the CLONE_DETACHED bit reuseable for clone3 - * - make the CSIGNAL bits reuseable for clone3 + * - make the CLONE_DETACHED bit reusable for clone3 + * - make the CSIGNAL bits reusable for clone3 */ if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) return false; From a119b4e5186c283ee13850b65004de6d746a81be Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Thu, 6 May 2021 18:04:35 -0700 Subject: [PATCH 1312/1379] kexec: Add kexec reboot string The purpose is to notify the kernel module for fast reboot. Upstream a patch from the SONiC network operating system [1]. [1]: https://github.com/Azure/sonic-linux-kernel/pull/46 Link: https://lkml.kernel.org/r/20210304124626.13927-1-pmenzel@molgen.mpg.de Signed-off-by: Joe LeVeque Signed-off-by: Paul Menzel Acked-by: Baoquan He Cc: Guohan Lu Cc: Joe LeVeque Cc: Paul Menzel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index a0b6780740c8..f04d04d1b855 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1165,7 +1165,7 @@ int kernel_kexec(void) #endif { kexec_in_progress = true; - kernel_restart_prepare(NULL); + kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); /* From 31d82c2c787d5cf65fedd35ebbc0c1bd95c1a679 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Thu, 6 May 2021 18:04:38 -0700 Subject: [PATCH 1313/1379] kernel: kexec_file: fix error return code of kexec_calculate_store_digests() When vzalloc() returns NULL to sha_regions, no error return code of kexec_calculate_store_digests() is assigned. To fix this bug, ret is assigned with -ENOMEM in this case. Link: https://lkml.kernel.org/r/20210309083904.24321-1-baijiaju1990@gmail.com Fixes: a43cac0d9dc2 ("kexec: split kexec_file syscall code to kexec_file.c") Signed-off-by: Jia-Ju Bai Reported-by: TOTE Robot Acked-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 5c3447cf7ad5..33400ff051a8 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -740,8 +740,10 @@ static int kexec_calculate_store_digests(struct kimage *image) sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); sha_regions = vzalloc(sha_region_sz); - if (!sha_regions) + if (!sha_regions) { + ret = -ENOMEM; goto out_free_desc; + } desc->tfm = tfm; From b2075dbb15d7ae952aeb01331198f4dc45a7e46a Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 May 2021 18:04:41 -0700 Subject: [PATCH 1314/1379] kexec: dump kmessage before machine_kexec kmsg_dump(KMSG_DUMP_SHUTDOWN) is called before machine_restart(), machine_halt(), and machine_power_off(). The only one that is missing is machine_kexec(). The dmesg output that it contains can be used to study the shutdown performance of both kernel and systemd during kexec reboot. Here is example of dmesg data collected after kexec: root@dplat-cp22:~# cat /sys/fs/pstore/dmesg-ramoops-0 | tail ... [ 70.914592] psci: CPU3 killed (polled 0 ms) [ 70.915705] CPU4: shutdown [ 70.916643] psci: CPU4 killed (polled 4 ms) [ 70.917715] CPU5: shutdown [ 70.918725] psci: CPU5 killed (polled 0 ms) [ 70.919704] CPU6: shutdown [ 70.920726] psci: CPU6 killed (polled 4 ms) [ 70.921642] CPU7: shutdown [ 70.922650] psci: CPU7 killed (polled 0 ms) Link: https://lkml.kernel.org/r/20210319192326.146000-2-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Reviewed-by: Bhupesh Sharma Acked-by: Baoquan He Reviewed-by: Tyler Hicks Cc: James Morris Cc: Sasha Levin Cc: Eric W. Biederman Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index f04d04d1b855..f099baee3578 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1179,6 +1180,7 @@ int kernel_kexec(void) machine_shutdown(); } + kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP From 7a1d55b987dfcbddecdb67eecc76fe555d4348ba Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 May 2021 18:04:45 -0700 Subject: [PATCH 1315/1379] gcov: combine common code There's a lot of duplicated code between gcc and clang implementations, move it over to fs.c to simplify the code, there's no reason to believe that for small data like this one would not just implement the simple convert_to_gcda() function. Link: https://lkml.kernel.org/r/20210315235453.e3fbb86e99a0.I08a3ee6dbe47ea3e8024956083f162884a958e40@changeid Signed-off-by: Johannes Berg Acked-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/base.c | 49 +++++++++++++ kernel/gcov/clang.c | 167 +----------------------------------------- kernel/gcov/fs.c | 116 +++++++++++++++++++++++++++++ kernel/gcov/gcc_4_7.c | 167 +----------------------------------------- kernel/gcov/gcov.h | 14 +--- 5 files changed, 171 insertions(+), 342 deletions(-) diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 0ffe9f194080..073a3738c5e6 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -49,6 +49,55 @@ void gcov_enable_events(void) mutex_unlock(&gcov_lock); } +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + #ifdef CONFIG_MODULES /* Update list and generate events when modules are unloaded. */ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index c466c7fbdece..1741c65b4fb2 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include "gcov.h" @@ -449,71 +448,6 @@ void gcov_info_free(struct gcov_info *info) } #endif -#define ITER_STRIDE PAGE_SIZE - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @buffer: buffer containing file data - * @size: size of buffer - * @pos: current position in file - */ -struct gcov_iterator { - struct gcov_info *info; - void *buffer; - size_t size; - loff_t pos; -}; - -/** - * store_gcov_u32 - store 32 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't - * store anything. - */ -static size_t store_gcov_u32(void *buffer, size_t off, u32 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - *data = v; - } - - return sizeof(*data); -} - -/** - * store_gcov_u64 - store 64 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store - * anything. - */ -static size_t store_gcov_u64(void *buffer, size_t off, u64 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - } - - return sizeof(*data) * 2; -} - /** * convert_to_gcda - convert profiling data set to gcda file format * @buffer: the buffer to store file data or %NULL if no data should be stored @@ -521,7 +455,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v) * * Returns the number of bytes that were/would have been stored into the buffer. */ -static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +size_t convert_to_gcda(char *buffer, struct gcov_info *info) { struct gcov_fn_info *fi_ptr; size_t pos = 0; @@ -558,102 +492,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info) return pos; } - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); - if (!iter) - goto err_free; - - iter->info = info; - /* Dry-run to get the actual buffer size. */ - iter->size = convert_to_gcda(NULL, info); - iter->buffer = vmalloc(iter->size); - if (!iter->buffer) - goto err_free; - - convert_to_gcda(iter->buffer, info); - - return iter; - -err_free: - kfree(iter); - return NULL; -} - - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - vfree(iter->buffer); - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - iter->pos = 0; -} - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - if (iter->pos < iter->size) - iter->pos += ITER_STRIDE; - - if (iter->pos >= iter->size) - return -EINVAL; - - return 0; -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - size_t len; - - if (iter->pos >= iter->size) - return -EINVAL; - - len = ITER_STRIDE; - if (iter->pos + len > iter->size) - len = iter->size - iter->pos; - - seq_write(seq, iter->buffer + iter->pos, len); - - return 0; -} diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 82babf5aa077..2d29e1d1225d 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "gcov.h" /** @@ -85,6 +86,121 @@ static int __init gcov_persist_setup(char *str) } __setup("gcov_persist=", gcov_persist_setup); +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +static struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. */ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_free - free iterator data + * @iter: file iterator + */ +static void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +static struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +static void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +static int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +static int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} + /* * seq_file.start() implementation for gcov data files. Note that the * gcov_iterator interface is designed to be more restrictive than seq_file diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index c53408a00d0b..1251f2434e90 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "gcov.h" @@ -363,71 +362,6 @@ free_info: kfree(info); } -#define ITER_STRIDE PAGE_SIZE - -/** - * struct gcov_iterator - specifies current file position in logical records - * @info: associated profiling data - * @buffer: buffer containing file data - * @size: size of buffer - * @pos: current position in file - */ -struct gcov_iterator { - struct gcov_info *info; - void *buffer; - size_t size; - loff_t pos; -}; - -/** - * store_gcov_u32 - store 32 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't - * store anything. - */ -static size_t store_gcov_u32(void *buffer, size_t off, u32 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - *data = v; - } - - return sizeof(*data); -} - -/** - * store_gcov_u64 - store 64 bit number in gcov format to buffer - * @buffer: target buffer or NULL - * @off: offset into the buffer - * @v: value to be stored - * - * Number format defined by gcc: numbers are recorded in the 32 bit - * unsigned binary form of the endianness of the machine generating the - * file. 64 bit numbers are stored as two 32 bit numbers, the low part - * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store - * anything. - */ -static size_t store_gcov_u64(void *buffer, size_t off, u64 v) -{ - u32 *data; - - if (buffer) { - data = buffer + off; - - data[0] = (v & 0xffffffffUL); - data[1] = (v >> 32); - } - - return sizeof(*data) * 2; -} - /** * convert_to_gcda - convert profiling data set to gcda file format * @buffer: the buffer to store file data or %NULL if no data should be stored @@ -435,7 +369,7 @@ static size_t store_gcov_u64(void *buffer, size_t off, u64 v) * * Returns the number of bytes that were/would have been stored into the buffer. */ -static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +size_t convert_to_gcda(char *buffer, struct gcov_info *info) { struct gcov_fn_info *fi_ptr; struct gcov_ctr_info *ci_ptr; @@ -481,102 +415,3 @@ static size_t convert_to_gcda(char *buffer, struct gcov_info *info) return pos; } - -/** - * gcov_iter_new - allocate and initialize profiling data iterator - * @info: profiling data set to be iterated - * - * Return file iterator on success, %NULL otherwise. - */ -struct gcov_iterator *gcov_iter_new(struct gcov_info *info) -{ - struct gcov_iterator *iter; - - iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); - if (!iter) - goto err_free; - - iter->info = info; - /* Dry-run to get the actual buffer size. */ - iter->size = convert_to_gcda(NULL, info); - iter->buffer = vmalloc(iter->size); - if (!iter->buffer) - goto err_free; - - convert_to_gcda(iter->buffer, info); - - return iter; - -err_free: - kfree(iter); - return NULL; -} - - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -void gcov_iter_free(struct gcov_iterator *iter) -{ - vfree(iter->buffer); - kfree(iter); -} - -/** - * gcov_iter_get_info - return profiling data set for given file iterator - * @iter: file iterator - */ -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) -{ - return iter->info; -} - -/** - * gcov_iter_start - reset file iterator to starting position - * @iter: file iterator - */ -void gcov_iter_start(struct gcov_iterator *iter) -{ - iter->pos = 0; -} - -/** - * gcov_iter_next - advance file iterator to next logical record - * @iter: file iterator - * - * Return zero if new position is valid, non-zero if iterator has reached end. - */ -int gcov_iter_next(struct gcov_iterator *iter) -{ - if (iter->pos < iter->size) - iter->pos += ITER_STRIDE; - - if (iter->pos >= iter->size) - return -EINVAL; - - return 0; -} - -/** - * gcov_iter_write - write data for current pos to seq_file - * @iter: file iterator - * @seq: seq_file handle - * - * Return zero on success, non-zero otherwise. - */ -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) -{ - size_t len; - - if (iter->pos >= iter->size) - return -EINVAL; - - len = ITER_STRIDE; - if (iter->pos + len > iter->size) - len = iter->size - iter->pos; - - seq_write(seq, iter->buffer + iter->pos, len); - - return 0; -} diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 6ab2c1808c9d..912b8ea01d33 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -48,6 +48,7 @@ struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); bool gcov_info_within_module(struct gcov_info *info, struct module *mod); +size_t convert_to_gcda(char *buffer, struct gcov_info *info); /* Base interface. */ enum gcov_action { @@ -58,16 +59,9 @@ enum gcov_action { void gcov_event(enum gcov_action action, struct gcov_info *info); void gcov_enable_events(void); -/* Iterator control. */ -struct seq_file; -struct gcov_iterator; - -struct gcov_iterator *gcov_iter_new(struct gcov_info *info); -void gcov_iter_free(struct gcov_iterator *iter); -void gcov_iter_start(struct gcov_iterator *iter); -int gcov_iter_next(struct gcov_iterator *iter); -int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq); -struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter); +/* writing helpers */ +size_t store_gcov_u32(void *buffer, size_t off, u32 v); +size_t store_gcov_u64(void *buffer, size_t off, u64 v); /* gcov_info control. */ void gcov_info_reset(struct gcov_info *info); From 3180c44fe1baf14fc876a4cdad77ea7b51ddc387 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 May 2021 18:04:48 -0700 Subject: [PATCH 1316/1379] gcov: simplify buffer allocation Use just a single vmalloc() with struct_size() instead of a separate kmalloc() for the iter struct. Link: https://lkml.kernel.org/r/20210315235453.b6de4a92096e.Iac40a5166589cefbff8449e466bd1b38ea7a17af@changeid Signed-off-by: Johannes Berg Cc: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/fs.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 2d29e1d1225d..40ea81c0475b 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -97,9 +97,9 @@ __setup("gcov_persist=", gcov_persist_setup); */ struct gcov_iterator { struct gcov_info *info; - void *buffer; size_t size; loff_t pos; + char buffer[]; }; /** @@ -111,25 +111,20 @@ struct gcov_iterator { static struct gcov_iterator *gcov_iter_new(struct gcov_info *info) { struct gcov_iterator *iter; + size_t size; - iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + /* Dry-run to get the actual buffer size. */ + size = convert_to_gcda(NULL, info); + + iter = vmalloc(struct_size(iter, buffer, size)); if (!iter) - goto err_free; + return NULL; iter->info = info; - /* Dry-run to get the actual buffer size. */ - iter->size = convert_to_gcda(NULL, info); - iter->buffer = vmalloc(iter->size); - if (!iter->buffer) - goto err_free; - + iter->size = size; convert_to_gcda(iter->buffer, info); return iter; - -err_free: - kfree(iter); - return NULL; } @@ -139,8 +134,7 @@ err_free: */ static void gcov_iter_free(struct gcov_iterator *iter) { - vfree(iter->buffer); - kfree(iter); + vfree(iter); } /** From 1391efa952e8b22088f8626fc63ade26767b92d6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 May 2021 18:04:51 -0700 Subject: [PATCH 1317/1379] gcov: use kvmalloc() Using vmalloc() in gcov is really quite wasteful, many of the objects allocated are really small (e.g. I've seen 24 bytes.) Use kvmalloc() to automatically pick the better of kmalloc() or vmalloc() depending on the size. [johannes.berg@intel.com: fix clang-11+ build] Link: https://lkml.kernel.org/r/20210412214210.6e1ecca9cdc5.I24459763acf0591d5e6b31c7e3a59890d802f79c@changeid Link: https://lkml.kernel.org/r/20210315235453.799e7a9d627d.I741d0db096c6f312910f7f1bcdfde0fda20801a4@changeid Signed-off-by: Johannes Berg Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Cc: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/clang.c | 12 ++++++------ kernel/gcov/fs.c | 6 +++--- kernel/gcov/gcc_4_7.c | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index 1741c65b4fb2..d43ffd0c5ddb 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -49,7 +49,7 @@ #include #include #include -#include +#include #include "gcov.h" typedef void (*llvm_gcov_callback)(void); @@ -333,8 +333,8 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) { size_t cv_size; /* counter values size */ - struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), - GFP_KERNEL); + struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), GFP_KERNEL); + if (!fn_dup) return NULL; INIT_LIST_HEAD(&fn_dup->head); @@ -344,7 +344,7 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) goto err_name; cv_size = fn->num_counters * sizeof(fn->counters[0]); - fn_dup->counters = vmalloc(cv_size); + fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL); if (!fn_dup->counters) goto err_counters; memcpy(fn_dup->counters, fn->counters, cv_size); @@ -368,7 +368,7 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) INIT_LIST_HEAD(&fn_dup->head); cv_size = fn->num_counters * sizeof(fn->counters[0]); - fn_dup->counters = vmalloc(cv_size); + fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL); if (!fn_dup->counters) { kfree(fn_dup); return NULL; @@ -439,7 +439,7 @@ void gcov_info_free(struct gcov_info *info) struct gcov_fn_info *fn, *tmp; list_for_each_entry_safe(fn, tmp, &info->functions, head) { - vfree(fn->counters); + kvfree(fn->counters); list_del(&fn->head); kfree(fn); } diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 40ea81c0475b..5c3086cad8f9 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "gcov.h" /** @@ -116,7 +116,7 @@ static struct gcov_iterator *gcov_iter_new(struct gcov_info *info) /* Dry-run to get the actual buffer size. */ size = convert_to_gcda(NULL, info); - iter = vmalloc(struct_size(iter, buffer, size)); + iter = kvmalloc(struct_size(iter, buffer, size), GFP_KERNEL); if (!iter) return NULL; @@ -134,7 +134,7 @@ static struct gcov_iterator *gcov_iter_new(struct gcov_info *info) */ static void gcov_iter_free(struct gcov_iterator *iter) { - vfree(iter); + kvfree(iter); } /** diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 1251f2434e90..460c12b7dfea 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "gcov.h" #if (__GNUC__ >= 10) @@ -309,7 +309,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info) cv_size = sizeof(gcov_type) * sci_ptr->num; - dci_ptr->values = vmalloc(cv_size); + dci_ptr->values = kvmalloc(cv_size, GFP_KERNEL); if (!dci_ptr->values) goto err_free; @@ -351,7 +351,7 @@ void gcov_info_free(struct gcov_info *info) ci_ptr = info->functions[fi_idx]->ctrs; for (ct_idx = 0; ct_idx < active; ct_idx++, ci_ptr++) - vfree(ci_ptr->values); + kvfree(ci_ptr->values); kfree(info->functions[fi_idx]); } From 9b472e85d098a40b84dd8b33fbf8a15ab1452025 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 6 May 2021 18:04:54 -0700 Subject: [PATCH 1318/1379] gcov: clang: drop support for clang-10 and older LLVM changed the expected function signatures for llvm_gcda_start_file() and llvm_gcda_emit_function() in the clang-11 release. Drop the older implementations and require folks to upgrade their compiler if they're interested in GCOV support. Link: https://reviews.llvm.org/rGcdd683b516d147925212724b09ec6fb792a40041 Link: https://reviews.llvm.org/rG13a633b438b6500ecad9e4f936ebadf3411d0f44 Link: https://lkml.kernel.org/r/20210312224132.3413602-3-ndesaulniers@google.com Link: https://lkml.kernel.org/r/20210413183113.2977432-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Suggested-by: Nathan Chancellor Acked-by: Peter Oberparleiter Reviewed-by: Nathan Chancellor Reviewed-by: Fangrui Song Cc: Prasad Sodagudi Cc: Johannes Berg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/Kconfig | 1 + kernel/gcov/clang.c | 103 -------------------------------------------- 2 files changed, 1 insertion(+), 103 deletions(-) diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index f62de2dea8a3..58f87a3092f3 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,6 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS + depends on !CC_IS_CLANG || CLANG_VERSION >= 110000 select CONSTRUCTORS default n help diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index d43ffd0c5ddb..cbb0bed958ab 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -69,16 +69,10 @@ struct gcov_fn_info { u32 ident; u32 checksum; -#if CONFIG_CLANG_VERSION < 110000 - u8 use_extra_checksum; -#endif u32 cfg_checksum; u32 num_counters; u64 *counters; -#if CONFIG_CLANG_VERSION < 110000 - const char *function_name; -#endif }; static struct gcov_info *current_info; @@ -108,16 +102,6 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) } EXPORT_SYMBOL(llvm_gcov_init); -#if CONFIG_CLANG_VERSION < 110000 -void llvm_gcda_start_file(const char *orig_filename, const char version[4], - u32 checksum) -{ - current_info->filename = orig_filename; - memcpy(¤t_info->version, version, sizeof(current_info->version)); - current_info->checksum = checksum; -} -EXPORT_SYMBOL(llvm_gcda_start_file); -#else void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum) { current_info->filename = orig_filename; @@ -125,28 +109,7 @@ void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum) current_info->checksum = checksum; } EXPORT_SYMBOL(llvm_gcda_start_file); -#endif -#if CONFIG_CLANG_VERSION < 110000 -void llvm_gcda_emit_function(u32 ident, const char *function_name, - u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) -{ - struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); - - if (!info) - return; - - INIT_LIST_HEAD(&info->head); - info->ident = ident; - info->checksum = func_checksum; - info->use_extra_checksum = use_extra_checksum; - info->cfg_checksum = cfg_checksum; - if (function_name) - info->function_name = kstrdup(function_name, GFP_KERNEL); - - list_add_tail(&info->head, ¤t_info->functions); -} -#else void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum) { struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); @@ -160,7 +123,6 @@ void llvm_gcda_emit_function(u32 ident, u32 func_checksum, u32 cfg_checksum) info->cfg_checksum = cfg_checksum; list_add_tail(&info->head, ¤t_info->functions); } -#endif EXPORT_SYMBOL(llvm_gcda_emit_function); void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) @@ -291,16 +253,8 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) !list_is_last(&fn_ptr2->head, &info2->functions)) { if (fn_ptr1->checksum != fn_ptr2->checksum) return false; -#if CONFIG_CLANG_VERSION < 110000 - if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) - return false; - if (fn_ptr1->use_extra_checksum && - fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) - return false; -#else if (fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) return false; -#endif fn_ptr1 = list_next_entry(fn_ptr1, head); fn_ptr2 = list_next_entry(fn_ptr2, head); } @@ -329,35 +283,6 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) } } -#if CONFIG_CLANG_VERSION < 110000 -static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) -{ - size_t cv_size; /* counter values size */ - struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), GFP_KERNEL); - - if (!fn_dup) - return NULL; - INIT_LIST_HEAD(&fn_dup->head); - - fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); - if (!fn_dup->function_name) - goto err_name; - - cv_size = fn->num_counters * sizeof(fn->counters[0]); - fn_dup->counters = kvmalloc(cv_size, GFP_KERNEL); - if (!fn_dup->counters) - goto err_counters; - memcpy(fn_dup->counters, fn->counters, cv_size); - - return fn_dup; - -err_counters: - kfree(fn_dup->function_name); -err_name: - kfree(fn_dup); - return NULL; -} -#else static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) { size_t cv_size; /* counter values size */ @@ -378,7 +303,6 @@ static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) return fn_dup; } -#endif /** * gcov_info_dup - duplicate profiling data set @@ -419,21 +343,6 @@ err: * gcov_info_free - release memory for profiling data set duplicate * @info: profiling data set duplicate to free */ -#if CONFIG_CLANG_VERSION < 110000 -void gcov_info_free(struct gcov_info *info) -{ - struct gcov_fn_info *fn, *tmp; - - list_for_each_entry_safe(fn, tmp, &info->functions, head) { - kfree(fn->function_name); - vfree(fn->counters); - list_del(&fn->head); - kfree(fn); - } - kfree(info->filename); - kfree(info); -} -#else void gcov_info_free(struct gcov_info *info) { struct gcov_fn_info *fn, *tmp; @@ -446,7 +355,6 @@ void gcov_info_free(struct gcov_info *info) kfree(info->filename); kfree(info); } -#endif /** * convert_to_gcda - convert profiling data set to gcda file format @@ -469,21 +377,10 @@ size_t convert_to_gcda(char *buffer, struct gcov_info *info) u32 i; pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); -#if CONFIG_CLANG_VERSION < 110000 - pos += store_gcov_u32(buffer, pos, - fi_ptr->use_extra_checksum ? 3 : 2); -#else pos += store_gcov_u32(buffer, pos, 3); -#endif pos += store_gcov_u32(buffer, pos, fi_ptr->ident); pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); -#if CONFIG_CLANG_VERSION < 110000 - if (fi_ptr->use_extra_checksum) - pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); -#else pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); -#endif - pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); for (i = 0; i < fi_ptr->num_counters; i++) From 6f1f942cd5fbbe308f912fc84e3f10fbc8113a68 Mon Sep 17 00:00:00 2001 From: He Ying Date: Thu, 6 May 2021 18:04:57 -0700 Subject: [PATCH 1319/1379] smp: kernel/panic.c - silence warnings We found these warnings in kernel/panic.c by using sparse tool: warning: symbol 'panic_smp_self_stop' was not declared. warning: symbol 'nmi_panic_self_stop' was not declared. warning: symbol 'crash_smp_send_stop' was not declared. To avoid them, add declarations for these three functions in include/linux/smp.h. Link: https://lkml.kernel.org/r/20210316084150.75201-1-heying24@huawei.com Signed-off-by: He Ying Reported-by: Hulk Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/smp.h b/include/linux/smp.h index 84a0b4828f66..669e35c03be2 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -55,6 +55,14 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, int smp_call_function_single_async(int cpu, call_single_data_t *csd); +/* + * Cpus stopping functions in panic. All have default weak definitions. + * Architecture-dependent code may override them. + */ +void panic_smp_self_stop(void); +void nmi_panic_self_stop(struct pt_regs *regs); +void crash_smp_send_stop(void); + /* * Call a function on all processors */ From 3d1c7fd97e4c5e54034231cd11319079dfaed60e Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 6 May 2021 18:05:00 -0700 Subject: [PATCH 1320/1379] delayacct: clear right task's flag after blkio completes When I was implementing a latency analyzer tool by using task->delays and other things, I found an issue in delayacct. The issue is it should clear the target's flag instead of current's in delayacct_blkio_end(). When I git blame delayacct, I found there're some similar issues we have fixed in delayacct_blkio_end(). - Commit c96f5471ce7d ("delayacct: Account blkio completion on the correct task") fixed the issue that it should account blkio completion on the target task instead of current. - Commit b512719f771a ("delayacct: fix crash in delayacct_blkio_end() after delayacct init failure") fixed the issue that it should check target task's delays instead of current task'. It seems that delayacct_blkio_{begin, end} are error prone. So I introduce a new paratmeter - the target task 'p' - to these helpers. After that change, the callsite will specifilly set the right task, which should make it less error prone. Link: https://lkml.kernel.org/r/20210414083720.24083-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Cc: Tejun Heo Cc: Josh Snyder Cc: Jens Axboe Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 20 ++++++++++---------- mm/memory.c | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 2d3bdcccf5eb..21651f946751 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -82,16 +82,16 @@ static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) return 0; } -static inline void delayacct_set_flag(int flag) +static inline void delayacct_set_flag(struct task_struct *p, int flag) { - if (current->delays) - current->delays->flags |= flag; + if (p->delays) + p->delays->flags |= flag; } -static inline void delayacct_clear_flag(int flag) +static inline void delayacct_clear_flag(struct task_struct *p, int flag) { - if (current->delays) - current->delays->flags &= ~flag; + if (p->delays) + p->delays->flags &= ~flag; } static inline void delayacct_tsk_init(struct task_struct *tsk) @@ -114,7 +114,7 @@ static inline void delayacct_tsk_free(struct task_struct *tsk) static inline void delayacct_blkio_start(void) { - delayacct_set_flag(DELAYACCT_PF_BLKIO); + delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } @@ -123,7 +123,7 @@ static inline void delayacct_blkio_end(struct task_struct *p) { if (p->delays) __delayacct_blkio_end(p); - delayacct_clear_flag(DELAYACCT_PF_BLKIO); + delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } static inline int delayacct_add_tsk(struct taskstats *d, @@ -166,9 +166,9 @@ static inline void delayacct_thrashing_end(void) } #else -static inline void delayacct_set_flag(int flag) +static inline void delayacct_set_flag(struct task_struct *p, int flag) {} -static inline void delayacct_clear_flag(int flag) +static inline void delayacct_clear_flag(struct task_struct *p, int flag) {} static inline void delayacct_init(void) {} diff --git a/mm/memory.c b/mm/memory.c index cbdc2cd9cedb..8c491f813687 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3339,7 +3339,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } - delayacct_set_flag(DELAYACCT_PF_SWAPIN); + delayacct_set_flag(current, DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; @@ -3388,7 +3388,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto unlock; } @@ -3402,13 +3402,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; From 23921540d2c0a4d8530078f6f64fc3e28444ca9d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 May 2021 18:05:03 -0700 Subject: [PATCH 1321/1379] gdb: lx-symbols: store the abspath() If we store the relative path, the user might later cd to a different directory, and that would break the automatic symbol resolving that happens when a module is loaded into the target kernel. Fix this by storing the abspath() of each path given, just like we already do for the cwd (os.getcwd() is absolute.) Link: https://lkml.kernel.org/r/20201217091747.bf4332cf2b35.I10ebbdb7e9b80ab1a5cddebf53d073be8232d656@changeid Signed-off-by: Johannes Berg Reviewed-by: Jan Kiszka Cc: Kieran Bingham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/gdb/linux/symbols.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py index 1be9763cf8bb..08d264ac328b 100644 --- a/scripts/gdb/linux/symbols.py +++ b/scripts/gdb/linux/symbols.py @@ -164,7 +164,8 @@ lx-symbols command.""" saved_state['breakpoint'].enabled = saved_state['enabled'] def invoke(self, arg, from_tty): - self.module_paths = [os.path.expanduser(p) for p in arg.split()] + self.module_paths = [os.path.abspath(os.path.expanduser(p)) + for p in arg.split()] self.module_paths.append(os.getcwd()) # enforce update From dc9586823f3e06867344e6cf88741688c2c7737f Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 6 May 2021 18:05:06 -0700 Subject: [PATCH 1322/1379] scripts/gdb: document lx_current is only supported by x86 Patch series "scripts/gdb: clarify the platforms supporting lx_current and add arm64 support", v2. lx_current depends on per_cpu current_task variable which exists on x86 only. so it actually works on x86 only. the 1st patch documents this clearly; the 2nd patch adds support for arm64. This patch (of 2): x86 is the only architecture which has per_cpu current_task: arch$ git grep current_task | grep -i per_cpu x86/include/asm/current.h:DECLARE_PER_CPU(struct task_struct *, current_task); x86/kernel/cpu/common.c:DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = x86/kernel/cpu/common.c:EXPORT_PER_CPU_SYMBOL(current_task); x86/kernel/cpu/common.c:DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; x86/kernel/cpu/common.c:EXPORT_PER_CPU_SYMBOL(current_task); x86/kernel/smpboot.c: per_cpu(current_task, cpu) = idle; On other architectures, lx_current() will lead to a python exception: (gdb) p $lx_current().pid Python Exception No symbol "current_task" in current context.: Error occurred in Python: No symbol "current_task" in current context. To avoid more people struggling and wasting time in other architectures, document it. Link: https://lkml.kernel.org/r/20210314203444.15188-1-song.bao.hua@hisilicon.com Link: https://lkml.kernel.org/r/20210314203444.15188-2-song.bao.hua@hisilicon.com Signed-off-by: Barry Song Cc: Jan Kiszka Cc: Kieran Bingham Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/gdb-kernel-debugging.rst | 2 +- scripts/gdb/linux/cpus.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Documentation/dev-tools/gdb-kernel-debugging.rst b/Documentation/dev-tools/gdb-kernel-debugging.rst index 4756f6b3a04e..1586901b683c 100644 --- a/Documentation/dev-tools/gdb-kernel-debugging.rst +++ b/Documentation/dev-tools/gdb-kernel-debugging.rst @@ -114,7 +114,7 @@ Examples of using the Linux-provided gdb helpers [ 0.000000] BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved .... -- Examine fields of the current task struct:: +- Examine fields of the current task struct(supported by x86 only):: (gdb) p $lx_current().pid $1 = 4998 diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index 008e62f3190d..f382762509d3 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -156,6 +156,13 @@ Note that VAR has to be quoted as string.""" PerCpu() +def get_current_task(cpu): + if utils.is_target_arch("x86"): + var_ptr = gdb.parse_and_eval("¤t_task") + return per_cpu(var_ptr, cpu).dereference() + else: + raise gdb.GdbError("Sorry, obtaining the current task is not yet " + "supported with this arch") class LxCurrentFunc(gdb.Function): """Return current task. @@ -167,8 +174,7 @@ number. If CPU is omitted, the CPU of the current context is used.""" super(LxCurrentFunc, self).__init__("lx_current") def invoke(self, cpu=-1): - var_ptr = gdb.parse_and_eval("¤t_task") - return per_cpu(var_ptr, cpu).dereference() + return get_current_task(cpu) LxCurrentFunc() From 526940e3962620f1a24d5e30c3dac7358194d963 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 6 May 2021 18:05:09 -0700 Subject: [PATCH 1323/1379] scripts/gdb: add lx_current support for arm64 arm64 uses SP_EL0 to save the current task_struct address. While running in EL0, SP_EL0 is clobbered by userspace. So if the upper bit is not 1 (not TTBR1), the current address is invalid. This patch checks the upper bit of SP_EL0, if the upper bit is 1, lx_current() of arm64 will return the derefrence of current task. Otherwise, lx_current() will tell users they are running in userspace(EL0). While arm64 is running in EL0, it is actually pointless to print current task as the memory of kernel space is not accessible in EL0. Link: https://lkml.kernel.org/r/20210314203444.15188-3-song.bao.hua@hisilicon.com Signed-off-by: Barry Song Cc: Jan Kiszka Cc: Jonathan Corbet Cc: Kieran Bingham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/gdb-kernel-debugging.rst | 2 +- scripts/gdb/linux/cpus.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Documentation/dev-tools/gdb-kernel-debugging.rst b/Documentation/dev-tools/gdb-kernel-debugging.rst index 1586901b683c..8e0f1fe8d17a 100644 --- a/Documentation/dev-tools/gdb-kernel-debugging.rst +++ b/Documentation/dev-tools/gdb-kernel-debugging.rst @@ -114,7 +114,7 @@ Examples of using the Linux-provided gdb helpers [ 0.000000] BIOS-e820: [mem 0x000000000009fc00-0x000000000009ffff] reserved .... -- Examine fields of the current task struct(supported by x86 only):: +- Examine fields of the current task struct(supported by x86 and arm64 only):: (gdb) p $lx_current().pid $1 = 4998 diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index f382762509d3..15fc4626d236 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -16,6 +16,9 @@ import gdb from linux import tasks, utils +task_type = utils.CachedType("struct task_struct") + + MAX_CPUS = 4096 @@ -157,9 +160,19 @@ Note that VAR has to be quoted as string.""" PerCpu() def get_current_task(cpu): + task_ptr_type = task_type.get_type().pointer() + if utils.is_target_arch("x86"): var_ptr = gdb.parse_and_eval("¤t_task") return per_cpu(var_ptr, cpu).dereference() + elif utils.is_target_arch("aarch64"): + current_task_addr = gdb.parse_and_eval("$SP_EL0") + if((current_task_addr >> 63) != 0): + current_task = current_task_addr.cast(task_ptr_type) + return current_task.dereference() + else: + raise gdb.GdbError("Sorry, obtaining the current task is not allowed " + "while running in userspace(EL0)") else: raise gdb.GdbError("Sorry, obtaining the current task is not yet " "supported with this arch") From 97f61c8f44ec9020708b97a51188170add4f3084 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 May 2021 18:05:12 -0700 Subject: [PATCH 1324/1379] kernel/resource: make walk_system_ram_res() find all busy IORESOURCE_SYSTEM_RAM resources Patch series "kernel/resource: make walk_system_ram_res() and walk_mem_res() search the whole tree", v2. Playing with kdump+virtio-mem I noticed that kexec_file_load() does not consider System RAM added via dax/kmem and virtio-mem when preparing the elf header for kdump. Looking into the details, the logic used in walk_system_ram_res() and walk_mem_res() seems to be outdated. walk_system_ram_range() already does the right thing, let's change walk_system_ram_res() and walk_mem_res(), and clean up. Loading a kdump kernel via "kexec -p -s" ... will result in the kdump kernel to also dump dax/kmem and virtio-mem added System RAM now. Note: kexec-tools on x86-64 also have to be updated to consider this memory in the kexec_load() case when processing /proc/iomem. This patch (of 3): It used to be true that we can have system RAM (IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY) only on the first level in the resource tree. However, this is no longer holds for driver-managed system RAM (i.e., added via dax/kmem and virtio-mem), which gets added on lower levels, for example, inside device containers. We have two users of walk_system_ram_res(), which currently only consideres the first level: a) kernel/kexec_file.c:kexec_walk_resources() -- We properly skip IORESOURCE_SYSRAM_DRIVER_MANAGED resources via locate_mem_hole_callback(), so even after this change, we won't be placing kexec images onto dax/kmem and virtio-mem added memory. No change. b) arch/x86/kernel/crash.c:fill_up_crash_elf_data() -- we're currently not adding relevant ranges to the crash elf header, resulting in them not getting dumped via kdump. This change fixes loading a crashkernel via kexec_file_load() and including dax/kmem and virtio-mem added System RAM in the crashdump on x86-64. Note that e.g,, arm64 relies on memblock data and, therefore, always considers all added System RAM already. Let's find all IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY resources, making the function behave like walk_system_ram_range(). Link: https://lkml.kernel.org/r/20210325115326.7826-1-david@redhat.com Link: https://lkml.kernel.org/r/20210325115326.7826-2-david@redhat.com Fixes: ebf71552bb0e ("virtio-mem: Add parent resource for all added "System RAM"") Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM") Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Acked-by: Baoquan He Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Daniel Vetter Cc: Andy Shevchenko Cc: Mauro Carvalho Chehab Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Hansen Cc: Keith Busch Cc: Michal Hocko Cc: Qian Cai Cc: Oscar Salvador Cc: Eric Biederman Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Tom Lendacky Cc: Brijesh Singh Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/resource.c b/kernel/resource.c index 627e61b0c124..4efd6e912279 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -457,7 +457,7 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, false, arg, func); } From 3c9c797534364593b73ba6ab060a014af8934721 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 May 2021 18:05:16 -0700 Subject: [PATCH 1325/1379] kernel/resource: make walk_mem_res() find all busy IORESOURCE_MEM resources It used to be true that we can have system RAM (IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY) only on the first level in the resource tree. However, this is no longer holds for driver-managed system RAM (i.e., added via dax/kmem and virtio-mem), which gets added on lower levels, for example, inside device containers. IORESOURCE_SYSTEM_RAM is defined as IORESOURCE_MEM | IORESOURCE_SYSRAM and just a special type of IORESOURCE_MEM. The function walk_mem_res() only considers the first level and is used in arch/x86/mm/ioremap.c:__ioremap_check_mem() only. We currently fail to identify System RAM added by dax/kmem and virtio-mem as "IORES_MAP_SYSTEM_RAM", for example, allowing for remapping of such "normal RAM" in __ioremap_caller(). Let's find all IORESOURCE_MEM | IORESOURCE_BUSY resources, making the function behave similar to walk_system_ram_res(). Link: https://lkml.kernel.org/r/20210325115326.7826-3-david@redhat.com Fixes: ebf71552bb0e ("virtio-mem: Add parent resource for all added "System RAM"") Fixes: c221c0b0308f ("device-dax: "Hotplug" persistent memory for use like normal RAM") Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Daniel Vetter Cc: Andy Shevchenko Cc: Mauro Carvalho Chehab Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Hansen Cc: Keith Busch Cc: Michal Hocko Cc: Qian Cai Cc: Oscar Salvador Cc: Eric Biederman Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Tom Lendacky Cc: Brijesh Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/resource.c b/kernel/resource.c index 4efd6e912279..16e0c7e8ed24 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -470,7 +470,7 @@ int walk_mem_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true, + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, false, arg, func); } From 97523a4edb7b9dc2be48a24a2387fb1328b29521 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 May 2021 18:05:20 -0700 Subject: [PATCH 1326/1379] kernel/resource: remove first_lvl / siblings_only logic All functions that search for IORESOURCE_SYSTEM_RAM or IORESOURCE_MEM resources now properly consider the whole resource tree, not just the first level. Let's drop the unused first_lvl / siblings_only logic. Remove documentation that indicates that some functions behave differently, all consider the full resource tree now. Link: https://lkml.kernel.org/r/20210325115326.7826-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Reviewed-by: Andy Shevchenko Cc: Greg Kroah-Hartman Cc: Dan Williams Cc: Daniel Vetter Cc: Andy Shevchenko Cc: Mauro Carvalho Chehab Cc: Dave Young Cc: Baoquan He Cc: Vivek Goyal Cc: Dave Hansen Cc: Keith Busch Cc: Michal Hocko Cc: Qian Cai Cc: Oscar Salvador Cc: Eric Biederman Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Tom Lendacky Cc: Brijesh Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 45 ++++++++++++--------------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 16e0c7e8ed24..7e00239a023a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -64,12 +64,8 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static struct resource *next_resource(struct resource *p, bool sibling_only) +static struct resource *next_resource(struct resource *p) { - /* Caller wants to traverse through siblings only */ - if (sibling_only) - return p->sibling; - if (p->child) return p->child; while (!p->sibling && p->parent) @@ -81,7 +77,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; (*pos)++; - return (void *)next_resource(p, false); + return (void *)next_resource(p); } #ifdef CONFIG_PROC_FS @@ -330,14 +326,10 @@ EXPORT_SYMBOL(release_resource); * of the resource that's within [@start..@end]; if none is found, returns * -ENODEV. Returns -EINVAL for invalid parameters. * - * This function walks the whole tree and not just first level children - * unless @first_lvl is true. - * * @start: start address of the resource searched for * @end: end address of same resource * @flags: flags which the resource must have * @desc: descriptor the resource must have - * @first_lvl: walk only the first level children, if set * @res: return ptr, if resource found * * The caller must specify @start, @end, @flags, and @desc @@ -345,9 +337,8 @@ EXPORT_SYMBOL(release_resource); */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_lvl, struct resource *res) + struct resource *res) { - bool siblings_only = true; struct resource *p; if (!res) @@ -358,7 +349,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, read_lock(&resource_lock); - for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) { + for (p = iomem_resource.child; p; p = next_resource(p)) { /* If we passed the resource we are looking for, stop */ if (p->start > end) { p = NULL; @@ -369,13 +360,6 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, if (p->end < start) continue; - /* - * Now that we found a range that matches what we look for, - * check the flags and the descriptor. If we were not asked to - * use only the first level, start looking at children as well. - */ - siblings_only = first_lvl; - if ((p->flags & flags) != flags) continue; if ((desc != IORES_DESC_NONE) && (desc != p->desc)) @@ -402,14 +386,14 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end, static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, - bool first_lvl, void *arg, + void *arg, int (*func)(struct resource *, void *)) { struct resource res; int ret = -EINVAL; while (start < end && - !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { + !find_next_iomem_res(start, end, flags, desc, &res)) { ret = (*func)(&res, arg); if (ret) break; @@ -431,7 +415,6 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, * @arg: function argument for the callback @func * @func: callback function that is called for each qualifying resource area * - * This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and * desc are valid candidates. * @@ -441,7 +424,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)) { - return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func); + return __walk_iomem_res_desc(start, end, flags, desc, arg, func); } EXPORT_SYMBOL_GPL(walk_iomem_res_desc); @@ -457,8 +440,8 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, false, - arg, func); + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, + func); } /* @@ -470,17 +453,14 @@ int walk_mem_res(u64 start, u64 end, void *arg, { unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; - return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, false, - arg, func); + return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, arg, + func); } /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * It is to be used only for System RAM. - * - * This will find System RAM ranges that are children of top-level resources - * in addition to top-level System RAM resources. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -495,8 +475,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; while (start < end && - !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, - false, &res)) { + !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, &res)) { pfn = PFN_UP(res.start); end_pfn = PFN_DOWN(res.end + 1); if (end_pfn > pfn) From d486ccb2522fc22f04f191cac99a844f92d56a7e Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 6 May 2021 18:05:24 -0700 Subject: [PATCH 1327/1379] kernel/resource: allow region_intersects users to hold resource_lock Introduce a version of region_intersects() that can be called with the resource_lock already held. This will be used in a future fix to __request_free_mem_region(). [akpm@linux-foundation.org: make __region_intersects static] Link: https://lkml.kernel.org/r/20210419070109.4780-1-apopple@nvidia.com Signed-off-by: Alistair Popple Cc: David Hildenbrand Cc: Daniel Vetter Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: John Hubbard Cc: Jerome Glisse Cc: Balbir Singh Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 52 ++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index 7e00239a023a..f4aeeda7ee28 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -502,6 +502,34 @@ int __weak page_is_ram(unsigned long pfn) } EXPORT_SYMBOL_GPL(page_is_ram); +static int __region_intersects(resource_size_t start, size_t size, + unsigned long flags, unsigned long desc) +{ + struct resource res; + int type = 0; int other = 0; + struct resource *p; + + res.start = start; + res.end = start + size - 1; + + for (p = iomem_resource.child; p ; p = p->sibling) { + bool is_type = (((p->flags & flags) == flags) && + ((desc == IORES_DESC_NONE) || + (desc == p->desc))); + + if (resource_overlaps(p, &res)) + is_type ? type++ : other++; + } + + if (type == 0) + return REGION_DISJOINT; + + if (other == 0) + return REGION_INTERSECTS; + + return REGION_MIXED; +} + /** * region_intersects() - determine intersection of region with known resources * @start: region start address @@ -525,31 +553,13 @@ EXPORT_SYMBOL_GPL(page_is_ram); int region_intersects(resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - struct resource res; - int type = 0; int other = 0; - struct resource *p; - - res.start = start; - res.end = start + size - 1; + int ret; read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = (((p->flags & flags) == flags) && - ((desc == IORES_DESC_NONE) || - (desc == p->desc))); - - if (resource_overlaps(p, &res)) - is_type ? type++ : other++; - } + ret = __region_intersects(start, size, flags, desc); read_unlock(&resource_lock); - if (type == 0) - return REGION_DISJOINT; - - if (other == 0) - return REGION_INTERSECTS; - - return REGION_MIXED; + return ret; } EXPORT_SYMBOL_GPL(region_intersects); From 63cdafe0af982e7da9ded37ccf21109a02bc6832 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 6 May 2021 18:05:27 -0700 Subject: [PATCH 1328/1379] kernel/resource: refactor __request_region to allow external locking Refactor the portion of __request_region() done whilst holding the resource_lock into a separate function to allow callers to hold the lock. Link: https://lkml.kernel.org/r/20210419070109.4780-2-apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: David Hildenbrand Cc: Balbir Singh Cc: Daniel Vetter Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: Jerome Glisse Cc: John Hubbard Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 52 +++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index f4aeeda7ee28..c8e473b68f17 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1160,31 +1160,16 @@ struct address_space *iomem_get_mapping(void) return smp_load_acquire(&iomem_inode)->i_mapping; } -/** - * __request_region - create a new busy resource region - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * @name: reserving caller's ID string - * @flags: IO resource flags - */ -struct resource * __request_region(struct resource *parent, +static int __request_region_locked(struct resource *res, struct resource *parent, resource_size_t start, resource_size_t n, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = alloc_resource(GFP_KERNEL); - struct resource *orig_parent = parent; - - if (!res) - return NULL; res->name = name; res->start = start; res->end = start + n - 1; - write_lock(&resource_lock); - for (;;) { struct resource *conflict; @@ -1220,13 +1205,40 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. */ - free_resource(res); - res = NULL; - break; + return -EBUSY; } + + return 0; +} + +/** + * __request_region - create a new busy resource region + * @parent: parent resource descriptor + * @start: resource start address + * @n: resource region size + * @name: reserving caller's ID string + * @flags: IO resource flags + */ +struct resource *__request_region(struct resource *parent, + resource_size_t start, resource_size_t n, + const char *name, int flags) +{ + struct resource *res = alloc_resource(GFP_KERNEL); + int ret; + + if (!res) + return NULL; + + write_lock(&resource_lock); + ret = __request_region_locked(res, parent, start, n, name, flags); write_unlock(&resource_lock); - if (res && orig_parent == &iomem_resource) + if (ret) { + free_resource(res); + return NULL; + } + + if (parent == &iomem_resource) revoke_iomem(res); return res; From 56fd94919b8bfdbe162f78920b4ebc72b4ce2f39 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 6 May 2021 18:05:30 -0700 Subject: [PATCH 1329/1379] kernel/resource: fix locking in request_free_mem_region request_free_mem_region() is used to find an empty range of physical addresses for hotplugging ZONE_DEVICE memory. It does this by iterating over the range of possible addresses using region_intersects() to see if the range is free before calling request_mem_region() to allocate the region. However the resource_lock is dropped between these two calls meaning by the time request_mem_region() is called in request_free_mem_region() another thread may have already reserved the requested region. This results in unexpected failures and a message in the kernel log from hitting this condition: /* * mm/hmm.c reserves physical addresses which then * become unavailable to other users. Conflicts are * not expected. Warn to aid debugging if encountered. */ if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { pr_warn("Unaddressable device %s %pR conflicts with %pR", conflict->name, conflict, res); These unexpected failures can be corrected by holding resource_lock across the two calls. This also requires memory allocation to be performed prior to taking the lock. Link: https://lkml.kernel.org/r/20210419070109.4780-3-apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: David Hildenbrand Cc: Balbir Singh Cc: Daniel Vetter Cc: Dan Williams Cc: Greg Kroah-Hartman Cc: Jerome Glisse Cc: John Hubbard Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index c8e473b68f17..028a5ab18818 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1780,25 +1780,56 @@ static struct resource *__request_free_mem_region(struct device *dev, { resource_size_t end, addr; struct resource *res; + struct region_devres *dr = NULL; size = ALIGN(size, 1UL << PA_SECTION_SHIFT); end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); addr = end - size + 1UL; + res = alloc_resource(GFP_KERNEL); + if (!res) + return ERR_PTR(-ENOMEM); + + if (dev) { + dr = devres_alloc(devm_region_release, + sizeof(struct region_devres), GFP_KERNEL); + if (!dr) { + free_resource(res); + return ERR_PTR(-ENOMEM); + } + } + + write_lock(&resource_lock); for (; addr > size && addr >= base->start; addr -= size) { - if (region_intersects(addr, size, 0, IORES_DESC_NONE) != + if (__region_intersects(addr, size, 0, IORES_DESC_NONE) != REGION_DISJOINT) continue; - if (dev) - res = devm_request_mem_region(dev, addr, size, name); - else - res = request_mem_region(addr, size, name); - if (!res) - return ERR_PTR(-ENOMEM); + if (!__request_region_locked(res, &iomem_resource, addr, size, + name, 0)) + break; + + if (dev) { + dr->parent = &iomem_resource; + dr->start = addr; + dr->n = size; + devres_add(dev, dr); + } + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + write_unlock(&resource_lock); + + /* + * A driver is claiming this region so revoke any mappings. + */ + revoke_iomem(res); return res; } + write_unlock(&resource_lock); + + free_resource(res); + if (dr) + devres_free(dr); return ERR_PTR(-ERANGE); } From 9c39c6ffe0c2945c7cf814814c096bc23b63f53d Mon Sep 17 00:00:00 2001 From: Zhang Yunkai Date: Thu, 6 May 2021 18:05:33 -0700 Subject: [PATCH 1330/1379] selftests: remove duplicate include 'assert.h' included in 'sparsebit.c' is duplicated. It is also included in the 161th line. 'string.h' included in 'mincore_selftest.c' is duplicated. It is also included in the 15th line. 'sched.h' included in 'tlbie_test.c' is duplicated. It is also included in the 33th line. Link: https://lkml.kernel.org/r/20210316073336.426255-1-zhang.yunkai@zte.com.cn Signed-off-by: Zhang Yunkai Cc: Paolo Bonzini Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/kvm/lib/sparsebit.c | 1 - tools/testing/selftests/mincore/mincore_selftest.c | 1 - tools/testing/selftests/powerpc/mm/tlbie_test.c | 1 - 3 files changed, 3 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c index 031ba3c932ed..a0d0c83d83de 100644 --- a/tools/testing/selftests/kvm/lib/sparsebit.c +++ b/tools/testing/selftests/kvm/lib/sparsebit.c @@ -1890,7 +1890,6 @@ void sparsebit_validate_internal(struct sparsebit *s) */ #include -#include struct range { sparsebit_idx_t first, last; diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c index 5a1e85ff5d32..e54106643337 100644 --- a/tools/testing/selftests/mincore/mincore_selftest.c +++ b/tools/testing/selftests/mincore/mincore_selftest.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "../kselftest.h" #include "../kselftest_harness.h" diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c b/tools/testing/selftests/powerpc/mm/tlbie_test.c index f85a0938ab25..48344a74b212 100644 --- a/tools/testing/selftests/powerpc/mm/tlbie_test.c +++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include From 07416af11dd85ca61abe60155ace37ced1233617 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 6 May 2021 18:05:36 -0700 Subject: [PATCH 1331/1379] kernel/async.c: stop guarding pr_debug() statements It's currently nigh impossible to get these pr_debug()s to print something. Being guarded by initcall_debug means one has to enable tons of other debug output during boot, and the system_state condition further means it's impossible to get them when loading modules later. Also, the compiler can't know that these global conditions do not change, so there are W=2 warnings kernel/async.c:125:9: warning: `calltime' may be used uninitialized in this function [-Wmaybe-uninitialized] kernel/async.c:300:9: warning: `starttime' may be used uninitialized in this function [-Wmaybe-uninitialized] Make it possible, for a DYNAMIC_DEBUG kernel, to get these to print their messages by booting with appropriate 'dyndbg="file async.c +p"' command line argument. For a non-DYNAMIC_DEBUG kernel, pr_debug() compiles to nothing. This does cost doing an unconditional ktime_get() for the starttime value, but the corresponding ktime_get for the end time can be elided by factoring it into a function which only gets called if the printk() arguments end up being evaluated. Link: https://lkml.kernel.org/r/20210309151723.1907838-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/async.c | 48 ++++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/kernel/async.c b/kernel/async.c index 45a867b8644a..4b5971142922 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -78,6 +78,12 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done); static atomic_t entry_count; +static long long microseconds_since(ktime_t start) +{ + ktime_t now = ktime_get(); + return ktime_to_ns(ktime_sub(now, start)) >> 10; +} + static async_cookie_t lowest_in_progress(struct async_domain *domain) { struct async_entry *first = NULL; @@ -111,24 +117,18 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t calltime, delta, rettime; + ktime_t calltime; /* 1) run (and print duration) */ - if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pS @ %i\n", - (long long)entry->cookie, - entry->func, task_pid_nr(current)); - calltime = ktime_get(); - } + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, + entry->func, task_pid_nr(current)); + calltime = ktime_get(); + entry->func(entry->data, entry->cookie); - if (initcall_debug && system_state < SYSTEM_RUNNING) { - rettime = ktime_get(); - delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pS returned after %lld usecs\n", - (long long)entry->cookie, - entry->func, - (long long)ktime_to_ns(delta) >> 10); - } + + pr_debug("initcall %lli_%pS returned after %lld usecs\n", + (long long)entry->cookie, entry->func, + microseconds_since(calltime)); /* 2) remove self from the pending queues */ spin_lock_irqsave(&async_lock, flags); @@ -287,23 +287,15 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); */ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain) { - ktime_t starttime, delta, endtime; + ktime_t starttime; - if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("async_waiting @ %i\n", task_pid_nr(current)); - starttime = ktime_get(); - } + pr_debug("async_waiting @ %i\n", task_pid_nr(current)); + starttime = ktime_get(); wait_event(async_done, lowest_in_progress(domain) >= cookie); - if (initcall_debug && system_state < SYSTEM_RUNNING) { - endtime = ktime_get(); - delta = ktime_sub(endtime, starttime); - - pr_debug("async_continuing @ %i after %lli usec\n", - task_pid_nr(current), - (long long)ktime_to_ns(delta) >> 10); - } + pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current), + microseconds_since(starttime)); } EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain); From a065c0faacb1e472cd4e048986407d1b177373a2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 6 May 2021 18:05:39 -0700 Subject: [PATCH 1332/1379] kernel/async.c: remove async_unregister_domain() No callers in the tree. Link: https://lkml.kernel.org/r/20210309151723.1907838-2-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/async.h | 1 - kernel/async.c | 18 ------------------ 2 files changed, 19 deletions(-) diff --git a/include/linux/async.h b/include/linux/async.h index 0a17cd27f348..cce4ad31e8fc 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -112,7 +112,6 @@ async_schedule_dev_domain(async_func_t func, struct device *dev, return async_schedule_node_domain(func, dev, dev_to_node(dev), domain); } -void async_unregister_domain(struct async_domain *domain); extern void async_synchronize_full(void); extern void async_synchronize_full_domain(struct async_domain *domain); extern void async_synchronize_cookie(async_cookie_t cookie); diff --git a/kernel/async.c b/kernel/async.c index 4b5971142922..b8d7a663497f 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -245,24 +245,6 @@ void async_synchronize_full(void) } EXPORT_SYMBOL_GPL(async_synchronize_full); -/** - * async_unregister_domain - ensure no more anonymous waiters on this domain - * @domain: idle domain to flush out of any async_synchronize_full instances - * - * async_synchronize_{cookie|full}_domain() are not flushed since callers - * of these routines should know the lifetime of @domain - * - * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing - */ -void async_unregister_domain(struct async_domain *domain) -{ - spin_lock_irq(&async_lock); - WARN_ON(!domain->registered || !list_empty(&domain->pending)); - domain->registered = 0; - spin_unlock_irq(&async_lock); -} -EXPORT_SYMBOL_GPL(async_unregister_domain); - /** * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain * @domain: the domain to synchronize From e7cb072eb988e46295512617c39d004f9e1c26f8 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 6 May 2021 18:05:42 -0700 Subject: [PATCH 1333/1379] init/initramfs.c: do unpacking asynchronously Patch series "background initramfs unpacking, and CONFIG_MODPROBE_PATH", v3. These two patches are independent, but better-together. The second is a rather trivial patch that simply allows the developer to change "/sbin/modprobe" to something else - e.g. the empty string, so that all request_module() during early boot return -ENOENT early, without even spawning a usermode helper, needlessly synchronizing with the initramfs unpacking. The first patch delegates decompressing the initramfs to a worker thread, allowing do_initcalls() in main.c to proceed to the device_ and late_ initcalls without waiting for that decompression (and populating of rootfs) to finish. Obviously, some of those later calls may rely on the initramfs being available, so I've added synchronization points in the firmware loader and usermodehelper paths - there might be other places that would need this, but so far no one has been able to think of any places I have missed. There's not much to win if most of the functionality needed during boot is only available as modules. But systems with a custom-made .config and initramfs can boot faster, partly due to utilizing more than one cpu earlier, partly by avoiding known-futile modprobe calls (which would still trigger synchronization with the initramfs unpacking, thus eliminating most of the first benefit). This patch (of 2): Most of the boot process doesn't actually need anything from the initramfs, until of course PID1 is to be executed. So instead of doing the decompressing and populating of the initramfs synchronously in populate_rootfs() itself, push that off to a worker thread. This is primarily motivated by an embedded ppc target, where unpacking even the rather modest sized initramfs takes 0.6 seconds, which is long enough that the external watchdog becomes unhappy that it doesn't get attention soon enough. By doing the initramfs decompression in a worker thread, we get to do the device_initcalls and hence start petting the watchdog much sooner. Normal desktops might benefit as well. On my mostly stock Ubuntu kernel, my initramfs is a 26M xz-compressed blob, decompressing to around 126M. That takes almost two seconds: [ 0.201454] Trying to unpack rootfs image as initramfs... [ 1.976633] Freeing initrd memory: 29416K Before this patch, these lines occur consecutively in dmesg. With this patch, the timestamps on these two lines is roughly the same as above, but with 172 lines inbetween - so more than one cpu has been kept busy doing work that would otherwise only happen after the populate_rootfs() finished. Should one of the initcalls done after rootfs_initcall time (i.e., device_ and late_ initcalls) need something from the initramfs (say, a kernel module or a firmware blob), it will simply wait for the initramfs unpacking to be done before proceeding, which should in theory make this completely safe. But if some driver pokes around in the filesystem directly and not via one of the official kernel interfaces (i.e. request_firmware*(), call_usermodehelper*) that theory may not hold - also, I certainly might have missed a spot when sprinkling wait_for_initramfs(). So there is an escape hatch in the form of an initramfs_async= command line parameter. Link: https://lkml.kernel.org/r/20210313212528.2956377-1-linux@rasmusvillemoes.dk Link: https://lkml.kernel.org/r/20210313212528.2956377-2-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Reviewed-by: Luis Chamberlain Cc: Jessica Yu Cc: Borislav Petkov Cc: Jonathan Corbet Cc: Greg Kroah-Hartman Cc: Nick Desaulniers Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 12 ++++++ drivers/base/firmware_loader/main.c | 2 + include/linux/initrd.h | 2 + init/initramfs.c | 38 ++++++++++++++++++- init/main.c | 1 + kernel/umh.c | 2 + 6 files changed, 56 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d93fbc1c1917..7866cc1bd4a9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1833,6 +1833,18 @@ initcall functions. Useful for debugging built-in modules and initcalls. + initramfs_async= [KNL] + Format: + Default: 1 + This parameter controls whether the initramfs + image is unpacked asynchronously, concurrently + with devices being probed and + initialized. This should normally just work, + but as a debugging aid, one can get the + historical behaviour of the initramfs + unpacking being completed before device_ and + late_ initcalls. + initrd= [BOOT] Specify the location of the initial ramdisk initrdmem= [KNL] Specify a physical address and size from which to diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 78355095e00d..4fdb8219cd08 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -504,6 +505,7 @@ fw_get_filesystem_firmware(struct device *device, struct fw_priv *fw_priv, if (!path) return -ENOMEM; + wait_for_initramfs(); for (i = 0; i < ARRAY_SIZE(fw_path); i++) { size_t file_size = 0; size_t *file_size_ptr = NULL; diff --git a/include/linux/initrd.h b/include/linux/initrd.h index 85c15717af34..1bbe9af48dc3 100644 --- a/include/linux/initrd.h +++ b/include/linux/initrd.h @@ -20,8 +20,10 @@ extern void free_initrd_mem(unsigned long, unsigned long); #ifdef CONFIG_BLK_DEV_INITRD extern void __init reserve_initrd_mem(void); +extern void wait_for_initramfs(void); #else static inline void __init reserve_initrd_mem(void) {} +static inline void wait_for_initramfs(void) {} #endif extern phys_addr_t phys_initrd_start; diff --git a/init/initramfs.c b/init/initramfs.c index d677e8e717f1..af27abc59643 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -541,6 +542,14 @@ static int __init keepinitrd_setup(char *__unused) __setup("keepinitrd", keepinitrd_setup); #endif +static bool __initdata initramfs_async = true; +static int __init initramfs_async_setup(char *str) +{ + strtobool(str, &initramfs_async); + return 1; +} +__setup("initramfs_async=", initramfs_async_setup); + extern char __initramfs_start[]; extern unsigned long __initramfs_size; #include @@ -658,7 +667,7 @@ static void __init populate_initrd_image(char *err) } #endif /* CONFIG_BLK_DEV_RAM */ -static int __init populate_rootfs(void) +static void __init do_populate_rootfs(void *unused, async_cookie_t cookie) { /* Load the built in initramfs */ char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); @@ -693,6 +702,33 @@ done: initrd_end = 0; flush_delayed_fput(); +} + +static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain); +static async_cookie_t initramfs_cookie; + +void wait_for_initramfs(void) +{ + if (!initramfs_cookie) { + /* + * Something before rootfs_initcall wants to access + * the filesystem/initramfs. Probably a bug. Make a + * note, avoid deadlocking the machine, and let the + * caller's access fail as it used to. + */ + pr_warn_once("wait_for_initramfs() called before rootfs_initcalls\n"); + return; + } + async_synchronize_cookie_domain(initramfs_cookie + 1, &initramfs_domain); +} +EXPORT_SYMBOL_GPL(wait_for_initramfs); + +static int __init populate_rootfs(void) +{ + initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL, + &initramfs_domain); + if (!initramfs_async) + wait_for_initramfs(); return 0; } rootfs_initcall(populate_rootfs); diff --git a/init/main.c b/init/main.c index dd11bfd10ead..11d34ccf5786 100644 --- a/init/main.c +++ b/init/main.c @@ -1561,6 +1561,7 @@ static noinline void __init kernel_init_freeable(void) kunit_run_all_tests(); + wait_for_initramfs(); console_on_rootfs(); /* diff --git a/kernel/umh.c b/kernel/umh.c index 3f646613a9d3..61f6b82c354b 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -107,6 +108,7 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); + wait_for_initramfs(); retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); From 17652f4240f7a501ecc13e9fdb06982569cde51f Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 6 May 2021 18:05:45 -0700 Subject: [PATCH 1334/1379] modules: add CONFIG_MODPROBE_PATH Allow the developer to specifiy the initial value of the modprobe_path[] string. This can be used to set it to the empty string initially, thus effectively disabling request_module() during early boot until userspace writes a new value via the /proc/sys/kernel/modprobe interface. [1] When building a custom kernel (often for an embedded target), it's normal to build everything into the kernel that is needed for booting, and indeed the initramfs often contains no modules at all, so every such request_module() done before userspace init has mounted the real rootfs is a waste of time. This is particularly useful when combined with the previous patch, which made the initramfs unpacking asynchronous - for that to work, it had to make any usermodehelper call wait for the unpacking to finish before attempting to invoke the userspace helper. By eliminating all such (known-to-be-futile) calls of usermodehelper, the initramfs unpacking and the {device,late}_initcalls can proceed in parallel for much longer. For a relatively slow ppc board I'm working on, the two patches combined lead to 0.2s faster boot - but more importantly, the fact that the initramfs unpacking proceeds completely in the background while devices get probed means I get to handle the gpio watchdog in time without getting reset. [1] __request_module() already has an early -ENOENT return when modprobe_path is the empty string. Link: https://lkml.kernel.org/r/20210313212528.2956377-3-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Reviewed-by: Greg Kroah-Hartman Acked-by: Jessica Yu Acked-by: Luis Chamberlain Cc: Borislav Petkov Cc: Jonathan Corbet Cc: Nick Desaulniers Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 12 ++++++++++++ kernel/kmod.c | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 1413413fcb9f..d83cb634c24f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2299,6 +2299,18 @@ config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS If unsure, say N. +config MODPROBE_PATH + string "Path to modprobe binary" + default "/sbin/modprobe" + help + When kernel code requests a module, it does so by calling + the "modprobe" userspace utility. This option allows you to + set the path where that binary is found. This can be changed + at runtime via the sysctl file + /proc/sys/kernel/modprobe. Setting this to the empty string + removes the kernel's ability to request modules (but + userspace can still load modules explicitly). + config TRIM_UNUSED_KSYMS bool "Trim unused exported kernel symbols" if EXPERT depends on !COMPILE_TEST diff --git a/kernel/kmod.c b/kernel/kmod.c index 3cd075ce2a1e..b717134ebe17 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -58,7 +58,7 @@ static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); /* modprobe_path is set via /proc/sys. */ -char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH; static void free_modprobe_argv(struct subprocess_info *info) { From b1989a3db45a6e8a5f1178bab621e8b9b8838602 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 6 May 2021 18:05:48 -0700 Subject: [PATCH 1335/1379] ipc/sem.c: mundane typo fixes s/runtine/runtime/ s/AQUIRE/ACQUIRE/ s/seperately/separately/ s/wont/won\'t/ s/succesfull/successful/ Link: https://lkml.kernel.org/r/20210326022240.26375-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index f6c30a85dadf..0897dac27f43 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -36,7 +36,7 @@ * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO. * - undo adjustments at process exit are limited to 0..SEMVMX. * - namespace are supported. - * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtine by writing + * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing * to /proc/sys/kernel/sem. * - statistics about the usage are reported in /proc/sysvipc/sem. * @@ -224,7 +224,7 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * Setting it to a result code is a RELEASE, this is ensured by both a * smp_store_release() (for case a) and while holding sem_lock() * (for case b). - * The AQUIRE when reading the result code without holding sem_lock() is + * The ACQUIRE when reading the result code without holding sem_lock() is * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep(). * (case a above). * Reading the result code while holding sem_lock() needs no further barriers, @@ -821,7 +821,7 @@ static inline int check_restart(struct sem_array *sma, struct sem_queue *q) /* It is impossible that someone waits for the new value: * - complex operations always restart. - * - wait-for-zero are handled seperately. + * - wait-for-zero are handled separately. * - q is a previously sleeping simple operation that * altered the array. It must be a decrement, because * simple increments never sleep. @@ -1046,7 +1046,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop * - No complex ops, thus all sleeping ops are * decrease. * - if we decreased the value, then any sleeping - * semaphore ops wont be able to run: If the + * semaphore ops won't be able to run: If the * previous value was too small, then the new * value will be too small, too. */ @@ -2108,7 +2108,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, queue.dupsop = dupsop; error = perform_atomic_semop(sma, &queue); - if (error == 0) { /* non-blocking succesfull path */ + if (error == 0) { /* non-blocking successful path */ DEFINE_WAKE_Q(wake_q); /* From cb152a1a95606aadd81df7a537dde9ef16da4b80 Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Thu, 6 May 2021 18:05:51 -0700 Subject: [PATCH 1336/1379] mm: fix some typos and code style problems fix some typos and code style problems in mm. gfp.h: s/MAXNODES/MAX_NUMNODES mmzone.h: s/then/than rmap.c: s/__vma_split()/__vma_adjust() swap.c: s/__mod_zone_page_stat/__mod_zone_page_state, s/is is/is swap_state.c: s/whoes/whose z3fold.c: code style problem fix in z3fold_unregister_migration zsmalloc.c: s/of/or, s/give/given Link: https://lkml.kernel.org/r/20210419083057.64820-1-luoshijie1@huawei.com Signed-off-by: Shijie Luo Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- include/linux/mmzone.h | 2 +- mm/rmap.c | 2 +- mm/swap.c | 4 ++-- mm/swap_state.c | 2 +- mm/z3fold.c | 2 +- mm/zsmalloc.c | 4 ++-- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8a5f6c3d7dba..11da8af06704 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -490,7 +490,7 @@ static inline int gfp_zonelist(gfp_t flags) /* * We get the zone list from the current node and the gfp_mask. - * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones. + * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 917bd6c604d5..0d53eba1c383 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -55,7 +55,7 @@ enum migratetype { * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. What is important though * is that a range of pageblocks must be aligned to - * MAX_ORDER_NR_PAGES should biggest page be bigger then + * MAX_ORDER_NR_PAGES should biggest page be bigger than * a single pageblock. */ MIGRATE_CMA, diff --git a/mm/rmap.c b/mm/rmap.c index b0fc27e77d6d..693a610e181d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -257,7 +257,7 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * - * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and + * anon_vma_clone() is called by __vma_adjust(), __split_vma(), copy_vma() and * anon_vma_fork(). The first three want an exact copy of src, while the last * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, diff --git a/mm/swap.c b/mm/swap.c index a75a8265302b..dfb48cf9c2c9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -496,7 +496,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, if (unlikely(unevictable) && !TestSetPageMlocked(page)) { int nr_pages = thp_nr_pages(page); /* - * We use the irq-unsafe __mod_zone_page_stat because this + * We use the irq-unsafe __mod_zone_page_state because this * counter is not modified from interrupt context, and the pte * lock is held(spinlock), which implies preemption disabled. */ @@ -808,7 +808,7 @@ inline void __lru_add_drain_all(bool force_all_cpus) * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. - * Assume CPU #z is is in the middle of the for_each_online_cpu loop + * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). diff --git a/mm/swap_state.c b/mm/swap_state.c index 3a1259c13f3b..272ea2108c9d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -792,7 +792,7 @@ static void swap_ra_info(struct vm_fault *vmf, * * Returns the struct page for entry and addr, after queueing swapin. * - * Primitive swap readahead code. We simply read in a few pages whoes + * Primitive swap readahead code. We simply read in a few pages whose * virtual addresses are around the fault address in the same vma. * * Caller must hold read mmap_lock if vmf->vma is not NULL. diff --git a/mm/z3fold.c b/mm/z3fold.c index 9d889ad2bb86..7fe7adaaad01 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -391,7 +391,7 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool) { if (pool->inode) iput(pool->inode); - } +} /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 58697f7a43f8..5004c176b045 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -61,7 +61,7 @@ #define ZSPAGE_MAGIC 0x58 /* - * This must be power of 2 and greater than of equal to sizeof(link_free). + * This must be power of 2 and greater than or equal to sizeof(link_free). * These two conditions ensure that any 'struct link_free' itself doesn't * span more than 1 page which avoids complex case of mapping 2 pages simply * to restore link_free pointer values. @@ -530,7 +530,7 @@ static void set_zspage_mapping(struct zspage *zspage, * class maintains a list of zspages where each zspage is divided * into equal sized chunks. Each allocation falls into one of these * classes depending on its size. This function returns index of the - * size class which has chunk size big enough to hold the give size. + * size class which has chunk size big enough to hold the given size. */ static int get_size_class_index(int size) { From bbcd53c960713507ae764bf81970651b5577b95a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 May 2021 18:05:55 -0700 Subject: [PATCH 1337/1379] drivers/char: remove /dev/kmem for good Patch series "drivers/char: remove /dev/kmem for good". Exploring /dev/kmem and /dev/mem in the context of memory hot(un)plug and memory ballooning, I started questioning the existence of /dev/kmem. Comparing it with the /proc/kcore implementation, it does not seem to be able to deal with things like a) Pages unmapped from the direct mapping (e.g., to be used by secretmem) -> kern_addr_valid(). virt_addr_valid() is not sufficient. b) Special cases like gart aperture memory that is not to be touched -> mem_pfn_is_ram() Unless I am missing something, it's at least broken in some cases and might fault/crash the machine. Looks like its existence has been questioned before in 2005 and 2010 [1], after ~11 additional years, it might make sense to revive the discussion. CONFIG_DEVKMEM is only enabled in a single defconfig (on purpose or by mistake?). All distributions disable it: in Ubuntu it has been disabled for more than 10 years, in Debian since 2.6.31, in Fedora at least starting with FC3, in RHEL starting with RHEL4, in SUSE starting from 15sp2, and OpenSUSE has it disabled as well. 1) /dev/kmem was popular for rootkits [2] before it got disabled basically everywhere. Ubuntu documents [3] "There is no modern user of /dev/kmem any more beyond attackers using it to load kernel rootkits.". RHEL documents in a BZ [5] "it served no practical purpose other than to serve as a potential security problem or to enable binary module drivers to access structures/functions they shouldn't be touching" 2) /proc/kcore is a decent interface to have a controlled way to read kernel memory for debugging puposes. (will need some extensions to deal with memory offlining/unplug, memory ballooning, and poisoned pages, though) 3) It might be useful for corner case debugging [1]. KDB/KGDB might be a better fit, especially, to write random memory; harder to shoot yourself into the foot. 4) "Kernel Memory Editor" [4] hasn't seen any updates since 2000 and seems to be incompatible with 64bit [1]. For educational purposes, /proc/kcore might be used to monitor value updates -- or older kernels can be used. 5) It's broken on arm64, and therefore, completely disabled there. Looks like it's essentially unused and has been replaced by better suited interfaces for individual tasks (/proc/kcore, KDB/KGDB). Let's just remove it. [1] https://lwn.net/Articles/147901/ [2] https://www.linuxjournal.com/article/10505 [3] https://wiki.ubuntu.com/Security/Features#A.2Fdev.2Fkmem_disabled [4] https://sourceforge.net/projects/kme/ [5] https://bugzilla.redhat.com/show_bug.cgi?id=154796 Link: https://lkml.kernel.org/r/20210324102351.6932-1-david@redhat.com Link: https://lkml.kernel.org/r/20210324102351.6932-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Kees Cook Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: "Alexander A. Klimov" Cc: Alexander Viro Cc: Alexandre Belloni Cc: Andrew Lunn Cc: Andrey Zhizhikin Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Brian Cain Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Corentin Labbe Cc: "David S. Miller" Cc: "Eric W. Biederman" Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Greentime Hu Cc: Gregory Clement Cc: Heiko Carstens Cc: Helge Deller Cc: Hillf Danton Cc: huang ying Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: "James E.J. Bottomley" Cc: James Troup Cc: Jiaxun Yang Cc: Jonas Bonn Cc: Jonathan Corbet Cc: Kairui Song Cc: Krzysztof Kozlowski Cc: Kuninori Morimoto Cc: Liviu Dudau Cc: Lorenzo Pieralisi Cc: Luc Van Oostenryck Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport Cc: Mikulas Patocka Cc: Minchan Kim Cc: Niklas Schnelle Cc: Oleksiy Avramchenko Cc: openrisc@lists.librecores.org Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: "Pavel Machek (CIP)" Cc: Pavel Machek Cc: "Peter Zijlstra (Intel)" Cc: Pierre Morel Cc: Randy Dunlap Cc: Richard Henderson Cc: Rich Felker Cc: Robert Richter Cc: Rob Herring Cc: Russell King Cc: Sam Ravnborg Cc: Sebastian Andrzej Siewior Cc: Sebastian Hesselbarth Cc: sparclinux@vger.kernel.org Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Steven Rostedt Cc: Sudeep Holla Cc: Theodore Dubois Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Viresh Kumar Cc: William Cohen Cc: Xiaoming Ni Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/devices.txt | 2 +- arch/arm/configs/dove_defconfig | 1 - arch/arm/configs/magician_defconfig | 1 - arch/arm/configs/moxart_defconfig | 1 - arch/arm/configs/mps2_defconfig | 1 - arch/arm/configs/mvebu_v5_defconfig | 1 - arch/arm/configs/xcep_defconfig | 1 - arch/hexagon/configs/comet_defconfig | 1 - arch/m68k/configs/amcore_defconfig | 1 - arch/openrisc/configs/or1ksim_defconfig | 1 - arch/sh/configs/edosk7705_defconfig | 1 - arch/sh/configs/se7206_defconfig | 1 - arch/sh/configs/sh2007_defconfig | 1 - arch/sh/configs/sh7724_generic_defconfig | 1 - arch/sh/configs/sh7770_generic_defconfig | 1 - arch/sh/configs/sh7785lcr_32bit_defconfig | 1 - arch/sparc/configs/sparc64_defconfig | 1 - arch/xtensa/configs/xip_kc705_defconfig | 1 - drivers/char/Kconfig | 10 - drivers/char/mem.c | 231 ---------------------- include/linux/fs.h | 2 +- include/linux/vmalloc.h | 2 +- kernel/configs/android-base.config | 1 - mm/ksm.c | 2 +- mm/vmalloc.c | 2 +- 25 files changed, 5 insertions(+), 264 deletions(-) diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt index ef41f77cb979..9c2be821c225 100644 --- a/Documentation/admin-guide/devices.txt +++ b/Documentation/admin-guide/devices.txt @@ -4,7 +4,7 @@ 1 char Memory devices 1 = /dev/mem Physical memory access - 2 = /dev/kmem Kernel virtual memory access + 2 = /dev/kmem OBSOLETE - replaced by /proc/kcore 3 = /dev/null Null device 4 = /dev/port I/O port access 5 = /dev/zero Null byte source diff --git a/arch/arm/configs/dove_defconfig b/arch/arm/configs/dove_defconfig index e70c997d5f4c..b935162a8bba 100644 --- a/arch/arm/configs/dove_defconfig +++ b/arch/arm/configs/dove_defconfig @@ -63,7 +63,6 @@ CONFIG_INPUT_EVDEV=y # CONFIG_MOUSE_PS2 is not set # CONFIG_SERIO is not set CONFIG_LEGACY_PTY_COUNT=16 -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_RUNTIME_UARTS=2 diff --git a/arch/arm/configs/magician_defconfig b/arch/arm/configs/magician_defconfig index b4670d42f378..abde1fb23b20 100644 --- a/arch/arm/configs/magician_defconfig +++ b/arch/arm/configs/magician_defconfig @@ -72,7 +72,6 @@ CONFIG_INPUT_TOUCHSCREEN=y CONFIG_INPUT_MISC=y CONFIG_INPUT_UINPUT=m # CONFIG_SERIO is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_PXA=y # CONFIG_LEGACY_PTYS is not set # CONFIG_HW_RANDOM is not set diff --git a/arch/arm/configs/moxart_defconfig b/arch/arm/configs/moxart_defconfig index 6834e97af348..eacc089d86c5 100644 --- a/arch/arm/configs/moxart_defconfig +++ b/arch/arm/configs/moxart_defconfig @@ -79,7 +79,6 @@ CONFIG_INPUT_EVBUG=y # CONFIG_SERIO is not set # CONFIG_VT is not set # CONFIG_LEGACY_PTYS is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_NR_UARTS=1 diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig index 1d923dbb9928..89f4a6ff30bd 100644 --- a/arch/arm/configs/mps2_defconfig +++ b/arch/arm/configs/mps2_defconfig @@ -69,7 +69,6 @@ CONFIG_SMSC911X=y # CONFIG_VT is not set # CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_NONSTANDARD=y -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_MPS2_UART_CONSOLE=y CONFIG_SERIAL_MPS2_UART=y # CONFIG_HW_RANDOM is not set diff --git a/arch/arm/configs/mvebu_v5_defconfig b/arch/arm/configs/mvebu_v5_defconfig index 4f16716bfc32..d57ff30dabff 100644 --- a/arch/arm/configs/mvebu_v5_defconfig +++ b/arch/arm/configs/mvebu_v5_defconfig @@ -100,7 +100,6 @@ CONFIG_INPUT_EVDEV=y CONFIG_KEYBOARD_GPIO=y # CONFIG_INPUT_MOUSE is not set CONFIG_LEGACY_PTY_COUNT=16 -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_RUNTIME_UARTS=2 diff --git a/arch/arm/configs/xcep_defconfig b/arch/arm/configs/xcep_defconfig index f1fbdfc5c8c6..4d8e7f2eaef7 100644 --- a/arch/arm/configs/xcep_defconfig +++ b/arch/arm/configs/xcep_defconfig @@ -53,7 +53,6 @@ CONFIG_NET_ETHERNET=y # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_PXA=y CONFIG_SERIAL_PXA_CONSOLE=y # CONFIG_LEGACY_PTYS is not set diff --git a/arch/hexagon/configs/comet_defconfig b/arch/hexagon/configs/comet_defconfig index f19ae2ab0aaa..c5a214716a38 100644 --- a/arch/hexagon/configs/comet_defconfig +++ b/arch/hexagon/configs/comet_defconfig @@ -34,7 +34,6 @@ CONFIG_NET_ETHERNET=y # CONFIG_SERIO is not set # CONFIG_CONSOLE_TRANSLATIONS is not set CONFIG_LEGACY_PTY_COUNT=64 -# CONFIG_DEVKMEM is not set # CONFIG_HW_RANDOM is not set CONFIG_SPI=y CONFIG_SPI_DEBUG=y diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig index 3a84f24d41c8..6d9ed2198170 100644 --- a/arch/m68k/configs/amcore_defconfig +++ b/arch/m68k/configs/amcore_defconfig @@ -60,7 +60,6 @@ CONFIG_DM9000=y # CONFIG_VT is not set # CONFIG_UNIX98_PTYS is not set # CONFIG_DEVMEM is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_MCF=y CONFIG_SERIAL_MCF_BAUDRATE=115200 CONFIG_SERIAL_MCF_CONSOLE=y diff --git a/arch/openrisc/configs/or1ksim_defconfig b/arch/openrisc/configs/or1ksim_defconfig index 75f2da324d0e..6e1e004047c7 100644 --- a/arch/openrisc/configs/or1ksim_defconfig +++ b/arch/openrisc/configs/or1ksim_defconfig @@ -43,7 +43,6 @@ CONFIG_MICREL_PHY=y # CONFIG_SERIO is not set # CONFIG_VT is not set # CONFIG_LEGACY_PTYS is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y diff --git a/arch/sh/configs/edosk7705_defconfig b/arch/sh/configs/edosk7705_defconfig index ef7cc31997b1..9ee35269bee2 100644 --- a/arch/sh/configs/edosk7705_defconfig +++ b/arch/sh/configs/edosk7705_defconfig @@ -23,7 +23,6 @@ CONFIG_SH_PCLK_FREQ=31250000 # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set -# CONFIG_DEVKMEM is not set # CONFIG_UNIX98_PTYS is not set # CONFIG_LEGACY_PTYS is not set # CONFIG_HW_RANDOM is not set diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index 315b04a8dd2f..601d062250d1 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -71,7 +71,6 @@ CONFIG_SMC91X=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_NR_UARTS=4 CONFIG_SERIAL_SH_SCI_CONSOLE=y diff --git a/arch/sh/configs/sh2007_defconfig b/arch/sh/configs/sh2007_defconfig index 99975db461d8..79f02f1c0dc8 100644 --- a/arch/sh/configs/sh2007_defconfig +++ b/arch/sh/configs/sh2007_defconfig @@ -75,7 +75,6 @@ CONFIG_INPUT_FF_MEMLESS=y # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set CONFIG_VT_HW_CONSOLE_BINDING=y -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_CONSOLE=y # CONFIG_LEGACY_PTYS is not set diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig index 2c46c0004780..cbc9389a89a8 100644 --- a/arch/sh/configs/sh7724_generic_defconfig +++ b/arch/sh/configs/sh7724_generic_defconfig @@ -18,7 +18,6 @@ CONFIG_CPU_IDLE=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_NR_UARTS=6 CONFIG_SERIAL_SH_SCI_CONSOLE=y diff --git a/arch/sh/configs/sh7770_generic_defconfig b/arch/sh/configs/sh7770_generic_defconfig index 88193153e51b..ee2357deba0f 100644 --- a/arch/sh/configs/sh7770_generic_defconfig +++ b/arch/sh/configs/sh7770_generic_defconfig @@ -20,7 +20,6 @@ CONFIG_CPU_IDLE=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_NR_UARTS=6 CONFIG_SERIAL_SH_SCI_CONSOLE=y diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig index 9b885c14c400..5c725c75fcef 100644 --- a/arch/sh/configs/sh7785lcr_32bit_defconfig +++ b/arch/sh/configs/sh7785lcr_32bit_defconfig @@ -66,7 +66,6 @@ CONFIG_INPUT_FF_MEMLESS=m CONFIG_INPUT_EVDEV=y CONFIG_INPUT_EVBUG=m CONFIG_VT_HW_CONSOLE_BINDING=y -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_NR_UARTS=6 CONFIG_SERIAL_SH_SCI_CONSOLE=y diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig index 12a4fb0bd52a..18099099583e 100644 --- a/arch/sparc/configs/sparc64_defconfig +++ b/arch/sparc/configs/sparc64_defconfig @@ -122,7 +122,6 @@ CONFIG_INPUT_SPARCSPKR=y # CONFIG_SERIO_SERPORT is not set CONFIG_SERIO_PCIPS2=m CONFIG_SERIO_RAW=m -# CONFIG_DEVKMEM is not set CONFIG_SERIAL_SUNSU=y CONFIG_SERIAL_SUNSU_CONSOLE=y CONFIG_SERIAL_SUNSAB=y diff --git a/arch/xtensa/configs/xip_kc705_defconfig b/arch/xtensa/configs/xip_kc705_defconfig index 4f1ff9531f6a..062148e17135 100644 --- a/arch/xtensa/configs/xip_kc705_defconfig +++ b/arch/xtensa/configs/xip_kc705_defconfig @@ -72,7 +72,6 @@ CONFIG_MARVELL_PHY=y # CONFIG_INPUT_KEYBOARD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_SERIO is not set -CONFIG_DEVKMEM=y CONFIG_SERIAL_8250=y # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set CONFIG_SERIAL_8250_CONSOLE=y diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index d229a2d0c017..b151e0fcdeb5 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -334,16 +334,6 @@ config DEVMEM memory. When in doubt, say "Y". -config DEVKMEM - bool "/dev/kmem virtual device support" - # On arm64, VMALLOC_START < PAGE_OFFSET, which confuses kmem read/write - depends on !ARM64 - help - Say Y here if you want to support the /dev/kmem device. The - /dev/kmem device is rarely used, but can be used for certain - kind of kernel debugging operations. - When in doubt, say "N". - config NVRAM tristate "/dev/nvram support" depends on X86 || HAVE_ARCH_NVRAM_OPS diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 869b9f5e8e03..15dc54fa1d47 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -403,221 +403,6 @@ static int mmap_mem(struct file *file, struct vm_area_struct *vma) return 0; } -static int mmap_kmem(struct file *file, struct vm_area_struct *vma) -{ - unsigned long pfn; - - /* Turn a kernel-virtual address into a physical page frame */ - pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT; - - /* - * RED-PEN: on some architectures there is more mapped memory than - * available in mem_map which pfn_valid checks for. Perhaps should add a - * new macro here. - * - * RED-PEN: vmalloc is not supported right now. - */ - if (!pfn_valid(pfn)) - return -EIO; - - vma->vm_pgoff = pfn; - return mmap_mem(file, vma); -} - -/* - * This function reads the *virtual* memory as seen by the kernel. - */ -static ssize_t read_kmem(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t low_count, read, sz; - char *kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ - int err = 0; - - read = 0; - if (p < (unsigned long) high_memory) { - low_count = count; - if (count > (unsigned long)high_memory - p) - low_count = (unsigned long)high_memory - p; - -#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED - /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE && low_count > 0) { - sz = size_inside_page(p, low_count); - if (clear_user(buf, sz)) - return -EFAULT; - buf += sz; - p += sz; - read += sz; - low_count -= sz; - count -= sz; - } -#endif - while (low_count > 0) { - sz = size_inside_page(p, low_count); - - /* - * On ia64 if a page has been mapped somewhere as - * uncached, then it must also be accessed uncached - * by the kernel or data corruption may occur - */ - kbuf = xlate_dev_kmem_ptr((void *)p); - if (!virt_addr_valid(kbuf)) - return -ENXIO; - - if (copy_to_user(buf, kbuf, sz)) - return -EFAULT; - buf += sz; - p += sz; - read += sz; - low_count -= sz; - count -= sz; - if (should_stop_iteration()) { - count = 0; - break; - } - } - } - - if (count > 0) { - kbuf = (char *)__get_free_page(GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - while (count > 0) { - sz = size_inside_page(p, count); - if (!is_vmalloc_or_module_addr((void *)p)) { - err = -ENXIO; - break; - } - sz = vread(kbuf, (char *)p, sz); - if (!sz) - break; - if (copy_to_user(buf, kbuf, sz)) { - err = -EFAULT; - break; - } - count -= sz; - buf += sz; - read += sz; - p += sz; - if (should_stop_iteration()) - break; - } - free_page((unsigned long)kbuf); - } - *ppos = p; - return read ? read : err; -} - - -static ssize_t do_write_kmem(unsigned long p, const char __user *buf, - size_t count, loff_t *ppos) -{ - ssize_t written, sz; - unsigned long copied; - - written = 0; -#ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED - /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE) { - sz = size_inside_page(p, count); - /* Hmm. Do something? */ - buf += sz; - p += sz; - count -= sz; - written += sz; - } -#endif - - while (count > 0) { - void *ptr; - - sz = size_inside_page(p, count); - - /* - * On ia64 if a page has been mapped somewhere as uncached, then - * it must also be accessed uncached by the kernel or data - * corruption may occur. - */ - ptr = xlate_dev_kmem_ptr((void *)p); - if (!virt_addr_valid(ptr)) - return -ENXIO; - - copied = copy_from_user(ptr, buf, sz); - if (copied) { - written += sz - copied; - if (written) - break; - return -EFAULT; - } - buf += sz; - p += sz; - count -= sz; - written += sz; - if (should_stop_iteration()) - break; - } - - *ppos += written; - return written; -} - -/* - * This function writes to the *virtual* memory as seen by the kernel. - */ -static ssize_t write_kmem(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t wrote = 0; - ssize_t virtr = 0; - char *kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ - int err = 0; - - if (p < (unsigned long) high_memory) { - unsigned long to_write = min_t(unsigned long, count, - (unsigned long)high_memory - p); - wrote = do_write_kmem(p, buf, to_write, ppos); - if (wrote != to_write) - return wrote; - p += wrote; - buf += wrote; - count -= wrote; - } - - if (count > 0) { - kbuf = (char *)__get_free_page(GFP_KERNEL); - if (!kbuf) - return wrote ? wrote : -ENOMEM; - while (count > 0) { - unsigned long sz = size_inside_page(p, count); - unsigned long n; - - if (!is_vmalloc_or_module_addr((void *)p)) { - err = -ENXIO; - break; - } - n = copy_from_user(kbuf, buf, sz); - if (n) { - err = -EFAULT; - break; - } - vwrite(kbuf, (char *)p, sz); - count -= sz; - buf += sz; - virtr += sz; - p += sz; - if (should_stop_iteration()) - break; - } - free_page((unsigned long)kbuf); - } - - *ppos = p; - return virtr + wrote ? : err; -} - static ssize_t read_port(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -855,7 +640,6 @@ static int open_port(struct inode *inode, struct file *filp) #define write_zero write_null #define write_iter_zero write_iter_null #define open_mem open_port -#define open_kmem open_mem static const struct file_operations __maybe_unused mem_fops = { .llseek = memory_lseek, @@ -869,18 +653,6 @@ static const struct file_operations __maybe_unused mem_fops = { #endif }; -static const struct file_operations __maybe_unused kmem_fops = { - .llseek = memory_lseek, - .read = read_kmem, - .write = write_kmem, - .mmap = mmap_kmem, - .open = open_kmem, -#ifndef CONFIG_MMU - .get_unmapped_area = get_unmapped_area_mem, - .mmap_capabilities = memory_mmap_capabilities, -#endif -}; - static const struct file_operations null_fops = { .llseek = null_lseek, .read = read_null, @@ -924,9 +696,6 @@ static const struct memdev { } devlist[] = { #ifdef CONFIG_DEVMEM [DEVMEM_MINOR] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, -#endif -#ifdef CONFIG_DEVKMEM - [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET }, #endif [3] = { "null", 0666, &null_fops, 0 }, #ifdef CONFIG_DEVPORT diff --git a/include/linux/fs.h b/include/linux/fs.h index acef282b97c6..c3c88fdb9b2a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -145,7 +145,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)0x1000) -/* File is huge (eg. /dev/kmem): treat loff_t as unsigned */ +/* File is huge (eg. /dev/mem): treat loff_t as unsigned */ #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) /* File is opened with O_PATH; almost nothing can be done with it */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 394d03cc0e92..f31ba59fb1ef 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -227,7 +227,7 @@ static inline void set_vm_flush_reset_perms(void *addr) } #endif -/* for /dev/kmem */ +/* for /proc/kcore */ extern long vread(char *buf, char *addr, unsigned long count); extern long vwrite(char *buf, char *addr, unsigned long count); diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config index d3fd428f4b92..eb701b2ac72f 100644 --- a/kernel/configs/android-base.config +++ b/kernel/configs/android-base.config @@ -1,5 +1,4 @@ # KEEP ALPHABETICALLY SORTED -# CONFIG_DEVKMEM is not set # CONFIG_DEVMEM is not set # CONFIG_FHANDLE is not set # CONFIG_INET_LRO is not set diff --git a/mm/ksm.c b/mm/ksm.c index b321a67ebaa9..b7cbcc7d4977 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -459,7 +459,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm) * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP - * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + * mmap of /dev/mem, where we would not want to touch it. * * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9c539f0730a5..2868692c6807 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3219,7 +3219,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) * Note: In usual ops, vread() is never necessary because the caller * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without - * any information, as /dev/kmem. + * any information, as /proc/kcore. * * Return: number of bytes for which addr and buf should be increased * (same number as @count) or %0 if [addr...addr+count) doesn't From f2e762bab9f5ec74cc9860fc24f01b7f58c98659 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 May 2021 18:06:01 -0700 Subject: [PATCH 1338/1379] mm: remove xlate_dev_kmem_ptr() Since /dev/kmem has been removed, let's remove the xlate_dev_kmem_ptr() leftovers. Link: https://lkml.kernel.org/r/20210324102351.6932-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Geert Uytterhoeven Acked-by: Michal Hocko Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Russell King Cc: Brian Cain Cc: Geert Uytterhoeven Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Arnd Bergmann Cc: David Hildenbrand Cc: Krzysztof Kozlowski Cc: Mikulas Patocka Cc: Luc Van Oostenryck Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Luis Chamberlain Cc: Greentime Hu Cc: Sebastian Andrzej Siewior Cc: Randy Dunlap Cc: Jiaxun Yang Cc: "Peter Zijlstra (Intel)" Cc: Christophe Leroy Cc: Gerald Schaefer Cc: Niklas Schnelle Cc: Pierre Morel Cc: Ingo Molnar Cc: Kuninori Morimoto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/io.h | 5 ----- arch/arm/include/asm/io.h | 5 ----- arch/hexagon/include/asm/io.h | 1 - arch/ia64/include/asm/io.h | 1 - arch/ia64/include/asm/uaccess.h | 18 ------------------ arch/m68k/include/asm/io_mm.h | 5 ----- arch/mips/include/asm/io.h | 5 ----- arch/parisc/include/asm/io.h | 5 ----- arch/powerpc/include/asm/io.h | 5 ----- arch/s390/include/asm/io.h | 5 ----- arch/sh/include/asm/io.h | 5 ----- arch/sparc/include/asm/io_64.h | 5 ----- include/asm-generic/io.h | 11 ----------- 13 files changed, 76 deletions(-) diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index 1f6a909d1fa5..0fab5ac90775 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -602,11 +602,6 @@ extern void outsl (unsigned long port, const void *src, unsigned long count); */ #define xlate_dev_mem_ptr(p) __va(p) -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - #endif /* __KERNEL__ */ #endif /* __ALPHA_IO_H */ diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index fc748122f1e0..f74944c6fe8d 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -430,11 +430,6 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr); */ #define xlate_dev_mem_ptr(p) __va(p) -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - #include #ifdef CONFIG_MMU diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h index bda2a9c2df78..c33241425a5c 100644 --- a/arch/hexagon/include/asm/io.h +++ b/arch/hexagon/include/asm/io.h @@ -64,7 +64,6 @@ static inline void *phys_to_virt(unsigned long address) * convert a physical pointer to a virtual kernel pointer for * /dev/mem access. */ -#define xlate_dev_kmem_ptr(p) __va(p) #define xlate_dev_mem_ptr(p) __va(p) /* diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 3d666a11a2de..6d93b923b379 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -277,7 +277,6 @@ extern void memset_io(volatile void __iomem *s, int c, long n); #define memcpy_fromio memcpy_fromio #define memcpy_toio memcpy_toio #define memset_io memset_io -#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr #define xlate_dev_mem_ptr xlate_dev_mem_ptr #include #undef PCI_IOBASE diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h index 179243c3dfc7..e19d2dcc0ced 100644 --- a/arch/ia64/include/asm/uaccess.h +++ b/arch/ia64/include/asm/uaccess.h @@ -272,22 +272,4 @@ xlate_dev_mem_ptr(phys_addr_t p) return ptr; } -/* - * Convert a virtual cached kernel memory pointer to an uncached pointer - */ -static __inline__ void * -xlate_dev_kmem_ptr(void *p) -{ - struct page *page; - void *ptr; - - page = virt_to_page((unsigned long)p); - if (PageUncached(page)) - ptr = (void *)__pa(p) + __IA64_UNCACHED_OFFSET; - else - ptr = p; - - return ptr; -} - #endif /* _ASM_IA64_UACCESS_H */ diff --git a/arch/m68k/include/asm/io_mm.h b/arch/m68k/include/asm/io_mm.h index 819f611dccf2..d41fa488453b 100644 --- a/arch/m68k/include/asm/io_mm.h +++ b/arch/m68k/include/asm/io_mm.h @@ -397,11 +397,6 @@ static inline void isa_delay(void) */ #define xlate_dev_mem_ptr(p) __va(p) -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - #define readb_relaxed(addr) readb(addr) #define readw_relaxed(addr) readw(addr) #define readl_relaxed(addr) readl(addr) diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index 2c138450ad3b..6f5c86d2bab4 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -564,11 +564,6 @@ extern void (*_dma_cache_inv)(unsigned long start, unsigned long size); */ #define xlate_dev_mem_ptr(p) __va(p) -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - void __ioread64_copy(void *to, const void __iomem *from, size_t count); #endif /* _ASM_IO_H */ diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h index 8a11b8cf4719..0b5259102319 100644 --- a/arch/parisc/include/asm/io.h +++ b/arch/parisc/include/asm/io.h @@ -316,11 +316,6 @@ extern void iowrite64be(u64 val, void __iomem *addr); */ #define xlate_dev_mem_ptr(p) __va(p) -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - extern int devmem_is_allowed(unsigned long pfn); #endif diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 273edd208ec5..f130783c8301 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -662,11 +662,6 @@ static inline void name at \ */ #define xlate_dev_mem_ptr(p) __va(p) -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - /* * We don't do relaxed operations yet, at least not with this semantic */ diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 28664ee0abc1..e3882b012bfa 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -20,11 +20,6 @@ void *xlate_dev_mem_ptr(phys_addr_t phys); #define unxlate_dev_mem_ptr unxlate_dev_mem_ptr void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - #define IO_SPACE_LIMIT 0 void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot); diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index 6d5c6463bc07..cf9a3ec32406 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -283,11 +283,6 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, */ #define xlate_dev_mem_ptr(p) __va(p) -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - #define ARCH_HAS_VALID_PHYS_ADDR_RANGE int valid_phys_addr_range(phys_addr_t addr, size_t size); int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); diff --git a/arch/sparc/include/asm/io_64.h b/arch/sparc/include/asm/io_64.h index 9fbfc9574432..5ffa820dcd4d 100644 --- a/arch/sparc/include/asm/io_64.h +++ b/arch/sparc/include/asm/io_64.h @@ -454,11 +454,6 @@ void sbus_set_sbus64(struct device *, int); */ #define xlate_dev_mem_ptr(p) __va(p) -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#define xlate_dev_kmem_ptr(p) p - #endif #endif /* !(__SPARC64_IO_H) */ diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 76d456c516a1..e93375c710b9 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1064,17 +1064,6 @@ static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p) #endif #endif /* CONFIG_GENERIC_IOMAP */ -/* - * Convert a virtual cached pointer to an uncached pointer - */ -#ifndef xlate_dev_kmem_ptr -#define xlate_dev_kmem_ptr xlate_dev_kmem_ptr -static inline void *xlate_dev_kmem_ptr(void *addr) -{ - return addr; -} -#endif - #ifndef xlate_dev_mem_ptr #define xlate_dev_mem_ptr xlate_dev_mem_ptr static inline void *xlate_dev_mem_ptr(phys_addr_t addr) From f7c8ce44ebb113b83135ada6e496db33d8a535e3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 6 May 2021 18:06:06 -0700 Subject: [PATCH 1339/1379] mm/vmalloc: remove vwrite() The last user (/dev/kmem) is gone. Let's drop it. Link: https://lkml.kernel.org/r/20210324102351.6932-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Hillf Danton Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Cc: Minchan Kim Cc: huang ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 - mm/nommu.c | 10 ---- mm/vmalloc.c | 116 +--------------------------------------- 3 files changed, 1 insertion(+), 126 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index f31ba59fb1ef..b6ff16393bf6 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -229,7 +229,6 @@ static inline void set_vm_flush_reset_perms(void *addr) /* for /proc/kcore */ extern long vread(char *buf, char *addr, unsigned long count); -extern long vwrite(char *buf, char *addr, unsigned long count); /* * Internals. Dont't use.. diff --git a/mm/nommu.c b/mm/nommu.c index 5c9ab799c0e6..85a3a68dffb6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -210,16 +210,6 @@ long vread(char *buf, char *addr, unsigned long count) return count; } -long vwrite(char *buf, char *addr, unsigned long count) -{ - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - - memcpy(addr, buf, count); - return count; -} - /* * vmalloc - allocate virtually contiguous memory * diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2868692c6807..a7f318c9e426 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3146,10 +3146,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) * kmap() and get small overhead in this access function. */ if (p) { - /* - * we can expect USER0 is not used (see vread/vwrite's - * function description) - */ + /* We can expect USER0 is not used -- see vread() */ void *map = kmap_atomic(p); memcpy(buf, map + offset, length); kunmap_atomic(map); @@ -3164,43 +3161,6 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) return copied; } -static int aligned_vwrite(char *buf, char *addr, unsigned long count) -{ - struct page *p; - int copied = 0; - - while (count) { - unsigned long offset, length; - - offset = offset_in_page(addr); - length = PAGE_SIZE - offset; - if (length > count) - length = count; - p = vmalloc_to_page(addr); - /* - * To do safe access to this _mapped_ area, we need - * lock. But adding lock here means that we need to add - * overhead of vmalloc()/vfree() calles for this _debug_ - * interface, rarely used. Instead of that, we'll use - * kmap() and get small overhead in this access function. - */ - if (p) { - /* - * we can expect USER0 is not used (see vread/vwrite's - * function description) - */ - void *map = kmap_atomic(p); - memcpy(map + offset, buf, length); - kunmap_atomic(map); - } - addr += length; - buf += length; - copied += length; - count -= length; - } - return copied; -} - /** * vread() - read vmalloc area in a safe way. * @buf: buffer for reading data @@ -3283,80 +3243,6 @@ finished: return buflen; } -/** - * vwrite() - write vmalloc area in a safe way. - * @buf: buffer for source data - * @addr: vm address. - * @count: number of bytes to be read. - * - * This function checks that addr is a valid vmalloc'ed area, and - * copy data from a buffer to the given addr. If specified range of - * [addr...addr+count) includes some valid address, data is copied from - * proper area of @buf. If there are memory holes, no copy to hole. - * IOREMAP area is treated as memory hole and no copy is done. - * - * If [addr...addr+count) doesn't includes any intersects with alive - * vm_struct area, returns 0. @buf should be kernel's buffer. - * - * Note: In usual ops, vwrite() is never necessary because the caller - * should know vmalloc() area is valid and can use memcpy(). - * This is for routines which have to access vmalloc area without - * any information, as /dev/kmem. - * - * Return: number of bytes for which addr and buf should be - * increased (same number as @count) or %0 if [addr...addr+count) - * doesn't include any intersection with valid vmalloc area - */ -long vwrite(char *buf, char *addr, unsigned long count) -{ - struct vmap_area *va; - struct vm_struct *vm; - char *vaddr; - unsigned long n, buflen; - int copied = 0; - - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - buflen = count; - - spin_lock(&vmap_area_lock); - list_for_each_entry(va, &vmap_area_list, list) { - if (!count) - break; - - if (!va->vm) - continue; - - vm = va->vm; - vaddr = (char *) vm->addr; - if (addr >= vaddr + get_vm_area_size(vm)) - continue; - while (addr < vaddr) { - if (count == 0) - goto finished; - buf++; - addr++; - count--; - } - n = vaddr + get_vm_area_size(vm) - addr; - if (n > count) - n = count; - if (!(vm->flags & VM_IOREMAP)) { - aligned_vwrite(buf, addr, n); - copied++; - } - buf += n; - addr += n; - count -= n; - } -finished: - spin_unlock(&vmap_area_lock); - if (!copied) - return 0; - return buflen; -} - /** * remap_vmalloc_range_partial - map vmalloc pages to userspace * @vma: vma to cover From 5aa6b70ed182549cae9c7ebb48820c42ffaf2eb1 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Thu, 6 May 2021 18:06:09 -0700 Subject: [PATCH 1340/1379] arm: print alloc free paths for address in registers In case of a use after free kernel oops, the freeing path of the object is required to debug futher. In most of cases the object address is present in one of the registers. Thus check the register's address and if it belongs to slab, print its alloc and free path. e.g. in the below issue register r6 belongs to slab, and a use after free issue occurred on one of its dereferenced values: Unable to handle kernel paging request at virtual address 6b6b6b6f .... pc : [] lr : [] psr: 60000013 sp : c8927d40 ip : ffffefff fp : c8aa8020 r10: c8927e10 r9 : 00000001 r8 : 00400cc0 r7 : 00000000 r6 : c8ab0180 r5 : c1804a80 r4 : c8aa8008 r3 : c1a5661c r2 : 00000000 r1 : 6b6b6b6b r0 : c139bf48 ..... Register r6 information: slab kmalloc-64 start c8ab0140 data offset 64 pointer offset 0 size 64 allocated at meminfo_proc_show+0x40/0x4fc meminfo_proc_show+0x40/0x4fc seq_read_iter+0x18c/0x4c4 proc_reg_read_iter+0x84/0xac generic_file_splice_read+0xe8/0x17c splice_direct_to_actor+0xb8/0x290 do_splice_direct+0xa0/0xe0 do_sendfile+0x2d0/0x438 sys_sendfile64+0x12c/0x140 ret_fast_syscall+0x0/0x58 0xbeeacde4 Free path: meminfo_proc_show+0x5c/0x4fc seq_read_iter+0x18c/0x4c4 proc_reg_read_iter+0x84/0xac generic_file_splice_read+0xe8/0x17c splice_direct_to_actor+0xb8/0x290 do_splice_direct+0xa0/0xe0 do_sendfile+0x2d0/0x438 sys_sendfile64+0x12c/0x140 ret_fast_syscall+0x0/0x58 0xbeeacde4 Link: https://lkml.kernel.org/r/1615891032-29160-3-git-send-email-maninder1.s@samsung.com Co-developed-by: Vaneet Narang Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Cc: Al Viro Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Joonsoo Kim Cc: Paul E. McKenney Cc: Pekka Enberg Cc: Russell King Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/bug.h | 1 + arch/arm/kernel/process.c | 11 +++++++++++ arch/arm/kernel/traps.c | 1 + 3 files changed, 13 insertions(+) diff --git a/arch/arm/include/asm/bug.h b/arch/arm/include/asm/bug.h index 673c7dd75ab9..ba8d9d7d242b 100644 --- a/arch/arm/include/asm/bug.h +++ b/arch/arm/include/asm/bug.h @@ -88,5 +88,6 @@ extern asmlinkage void c_backtrace(unsigned long fp, int pmode, struct mm_struct; void show_pte(const char *lvl, struct mm_struct *mm, unsigned long addr); extern void __show_regs(struct pt_regs *); +extern void __show_regs_alloc_free(struct pt_regs *regs); #endif diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 5199a2bb4111..6324f4db9b02 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -92,6 +92,17 @@ void arch_cpu_idle_exit(void) ledtrig_cpu(CPU_LED_IDLE_END); } +void __show_regs_alloc_free(struct pt_regs *regs) +{ + int i; + + /* check for r0 - r12 only */ + for (i = 0; i < 13; i++) { + pr_alert("Register r%d information:", i); + mem_dump_obj((void *)regs->uregs[i]); + } +} + void __show_regs(struct pt_regs *regs) { unsigned long flags; diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 17d5a785df28..64308e3a5d0c 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -287,6 +287,7 @@ static int __die(const char *str, int err, struct pt_regs *regs) print_modules(); __show_regs(regs); + __show_regs_alloc_free(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), end_of_stack(tsk)); From 702850a45a7798031aa06baa46f9fc2cdd1e747e Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Thu, 6 May 2021 18:06:12 -0700 Subject: [PATCH 1341/1379] scripts/spelling.txt: add "overlfow" Add typo "overlfow" for "overflow". This typo was found and fixed in net/sctp/tsnmap.c. Link: https://lore.kernel.org/netdev/20210304055548.56829-1-drew@beagleboard.org/ Link: https://lkml.kernel.org/r/20210304072657.64577-1-drew@beagleboard.org Signed-off-by: Drew Fustini Suggested-by: Kees Cook Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 7beb4262f719..8fe0283aae4d 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1027,6 +1027,7 @@ oustanding||outstanding overaall||overall overhread||overhead overlaping||overlapping +overlfow||overflow overide||override overrided||overridden overriden||overridden From a4799be53775bf2fdc810b897fb89dd0c81e6913 Mon Sep 17 00:00:00 2001 From: zuoqilin Date: Thu, 6 May 2021 18:06:15 -0700 Subject: [PATCH 1342/1379] scripts/spelling.txt: Add "diabled" typo Increase "diabled" spelling error check. Link: https://lkml.kernel.org/r/20210304070106.2313-1-zuoqilin1@163.com Signed-off-by: zuoqilin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 8fe0283aae4d..2e6a23088e8e 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -480,6 +480,7 @@ devided||divided deviece||device devision||division diable||disable +diabled||disabled dicline||decline dictionnary||dictionary didnt||didn't From d4e3e52b4dd57b1cfd4b43a20976385463e16126 Mon Sep 17 00:00:00 2001 From: Drew Fustini Date: Thu, 6 May 2021 18:06:18 -0700 Subject: [PATCH 1343/1379] scripts/spelling.txt: add "overflw" Add typo "overflw" for "overflow". This typo was found and fixed in drivers/clocksource/timer-pistachio.c. Link: https://lore.kernel.org/lkml/20210305090315.384547-1-drew@beagleboard.org/ Link: https://lkml.kernel.org/r/20210305095151.388182-1-drew@beagleboard.org Signed-off-by: Drew Fustini Suggested-by: Gustavo A. R. Silva Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 2e6a23088e8e..7b6a01291598 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -1028,6 +1028,7 @@ oustanding||outstanding overaall||overall overhread||overhead overlaping||overlapping +overflw||overflow overlfow||overflow overide||override overrided||overridden From 80d015587a62f7de0495f2e84c9a584322453ac6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 6 May 2021 18:06:21 -0700 Subject: [PATCH 1344/1379] mm/slab.c: fix spelling mistake "disired" -> "desired" There is a spelling mistake in a comment. Fix it. Link: https://lkml.kernel.org/r/20210317094158.5762-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab.c b/mm/slab.c index df45c437b394..d56607a80fa6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2284,7 +2284,7 @@ void __kmem_cache_release(struct kmem_cache *cachep) * Because if it is the case, that means we defer the creation of * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point. * And we eventually call down to __kmem_cache_create(), which - * in turn looks up in the kmalloc_{dma,}_caches for the disired-size one. + * in turn looks up in the kmalloc_{dma,}_caches for the desired-size one. * This is a "chicken-and-egg" problem. * * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches, From 2eb70aab25dd9b0013a0035b416dbe0e81e6ad48 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 6 May 2021 18:06:24 -0700 Subject: [PATCH 1345/1379] include/linux/pgtable.h: few spelling fixes Few spelling fixes throughout the file. Link: https://lkml.kernel.org/r/20210318201404.6380-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pgtable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2194a9cd885c..46b13780c2c8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -426,7 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres /* * On some architectures hardware does not set page access bit when accessing - * memory page, it is responsibilty of software setting this bit. It brings + * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms @@ -519,7 +519,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we - * can't race with CPU which sets these bits and non-atomic aproach is fine. + * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) @@ -852,7 +852,7 @@ static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the - * pte; the appropriate pte lock must be held over the transation. + * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely @@ -1281,13 +1281,13 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) * * The complete check uses is_pmd_migration_entry() in linux/swapops.h * But using that requires moving current function and pmd_trans_unstable() - * to linux/swapops.h to resovle dependency, which is too much code move. + * to linux/swapops.h to resolve dependency, which is too much code move. * * !pmd_present() is equivalent to is_pmd_migration_entry() currently, * because !pmd_present() pages can only be under migration not swapped * out. * - * pmd_none() is preseved for future condition checks on pmd migration + * pmd_none() is preserved for future condition checks on pmd migration * entries and not confusing with this function name, although it is * redundant with !pmd_present(). */ From 48207f7d41c8bdae94d2aae11620ed76fee95d45 Mon Sep 17 00:00:00 2001 From: zhouchuangao Date: Thu, 6 May 2021 18:06:27 -0700 Subject: [PATCH 1346/1379] kernel/umh.c: fix some spelling mistakes Fix some spelling mistakes, and modify the order of the parameter comments to be consistent with the order of the parameters passed to the function. Link: https://lkml.kernel.org/r/1615636139-4076-1-git-send-email-zhouchuangao@vivo.com Signed-off-by: zhouchuangao Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/umh.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/umh.c b/kernel/umh.c index 61f6b82c354b..36c123360ab8 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -338,8 +338,8 @@ static void helper_unlock(void) * @argv: arg vector for process * @envp: environment for process * @gfp_mask: gfp mask for memory allocation - * @cleanup: a cleanup function * @init: an init function + * @cleanup: a cleanup function * @data: arbitrary context sensitive data * * Returns either %NULL on allocation failure, or a subprocess_info @@ -350,7 +350,7 @@ static void helper_unlock(void) * exec. A non-zero return code causes the process to error out, exit, * and return the failure to the calling process * - * The cleanup function is just before ethe subprocess_info is about to + * The cleanup function is just before the subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. @@ -386,7 +386,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); /** * call_usermodehelper_exec - start a usermode application - * @sub_info: information about the subprocessa + * @sub_info: information about the subprocess * @wait: wait for the application to finish and return status. * when UMH_NO_WAIT don't wait at all, but you get no useful error back * when the program couldn't be exec'ed. This makes it safe to call From a12f4f85bc5a70ff5b74a274d3074f12e1122913 Mon Sep 17 00:00:00 2001 From: Xiaofeng Cao Date: Thu, 6 May 2021 18:06:30 -0700 Subject: [PATCH 1347/1379] kernel/user_namespace.c: fix typos change 'verifing' to 'verifying' change 'certaint' to 'certain' change 'approprpiate' to 'appropriate' Link: https://lkml.kernel.org/r/20210317100129.12440-1-caoxiaofeng@yulong.com Signed-off-by: Xiaofeng Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9a4b980d695b..8d62863721b0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -85,7 +85,7 @@ int create_user_ns(struct cred *new) /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, - * by verifing that the root directory is at the root of the + * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; @@ -1014,7 +1014,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, goto out; ret = -EINVAL; } - /* Be very certaint the new map actually exists */ + /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; @@ -1169,7 +1169,7 @@ static bool new_idmap_permitted(const struct file *file, /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. - * And the opener of the id file also had the approprpiate capability. + * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) From f0fffaff0b8960c9a110211510269744af1f1d1e Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 6 May 2021 18:06:33 -0700 Subject: [PATCH 1348/1379] kernel/up.c: fix typo s/condtions/conditions/ Link: https://lkml.kernel.org/r/20210317032732.3260835-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/up.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/up.c b/kernel/up.c index bf20b4a9af60..df50828cc2f0 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -38,7 +38,7 @@ EXPORT_SYMBOL(smp_call_function_single_async); /* * Preemption is disabled here to make sure the cond_func is called under the - * same condtions in UP and SMP. + * same conditions in UP and SMP. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) From 5afe69c2ccd069112fd299b573d30d6b14528b6c Mon Sep 17 00:00:00 2001 From: Xiaofeng Cao Date: Thu, 6 May 2021 18:06:36 -0700 Subject: [PATCH 1349/1379] kernel/sys.c: fix typo change 'infite' to 'infinite' change 'concurent' to 'concurrent' change 'memvers' to 'members' change 'decendants' to 'descendants' change 'argumets' to 'arguments' Link: https://lkml.kernel.org/r/20210316112904.10661-1-cxfcosmos@gmail.com Signed-off-by: Xiaofeng Cao Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 3d62c9599dc0..3a583a29815f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1590,7 +1590,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, /* * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not - * infite. In case of RLIM_INFINITY the posix CPU timer code + * infinite. In case of RLIM_INFINITY the posix CPU timer code * ignores the rlimit. */ if (!retval && new_rlim && resource == RLIMIT_CPU && @@ -2029,7 +2029,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } /* - * arg_lock protects concurent updates but we still need mmap_lock for + * arg_lock protects concurrent updates but we still need mmap_lock for * read to exclude races with sys_brk. */ mmap_read_lock(mm); @@ -2041,7 +2041,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * output in procfs mostly, except * * - @start_brk/@brk which are used in do_brk_flags but kernel lookups - * for VMAs when updating these memvers so anything wrong written + * for VMAs when updating these members so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself */ @@ -2143,7 +2143,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; /* - * arg_lock protects concurent updates of arg boundaries, we need + * arg_lock protects concurrent updates of arg boundaries, we need * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation. */ @@ -2210,7 +2210,7 @@ static int prctl_set_mm(int opt, unsigned long addr, * If command line arguments and environment * are placed somewhere else on stack, we can * set them up here, ARG_START/END to setup - * command line argumets and ENV_START/END + * command line arguments and ENV_START/END * for environment. */ case PR_SET_MM_START_STACK: @@ -2258,8 +2258,8 @@ static int prctl_get_tid_address(struct task_struct *me, int __user * __user *ti static int propagate_has_child_subreaper(struct task_struct *p, void *data) { /* - * If task has has_child_subreaper - all its decendants - * already have these flag too and new decendants will + * If task has has_child_subreaper - all its descendants + * already have these flag too and new descendants will * inherit it on fork, skip them. * * If we've found child_reaper - skip descendants in From a109ae2a0252308aa46ce77067e751295b9beb87 Mon Sep 17 00:00:00 2001 From: dingsenjie Date: Thu, 6 May 2021 18:06:39 -0700 Subject: [PATCH 1350/1379] fs: fat: fix spelling typo of values vaules -> values Link: https://lkml.kernel.org/r/20210302034817.30384-1-dingsenjie@163.com Signed-off-by: dingsenjie Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/fatent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index f7e3304b7802..860e884e56e8 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -771,7 +771,7 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range) /* * FAT data is organized as clusters, trim at the granulary of cluster. * - * fstrim_range is in byte, convert vaules to cluster index. + * fstrim_range is in byte, convert values to cluster index. * Treat sectors before data region as all used, not to trim them. */ ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT); From 7497835f7e8dae01c4850ce7204f6a8a7f58f2e5 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Thu, 6 May 2021 18:06:41 -0700 Subject: [PATCH 1351/1379] ipc/sem.c: spelling fix s/purpuse/purpose/ Link: https://lkml.kernel.org/r/20210319221432.26631-1-unixbhaskar@gmail.com Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/sem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ipc/sem.c b/ipc/sem.c index 0897dac27f43..e0ec239680cb 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -786,7 +786,7 @@ static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, { get_task_struct(q->sleeper); - /* see SEM_BARRIER_2 for purpuse/pairing */ + /* see SEM_BARRIER_2 for purpose/pairing */ smp_store_release(&q->status, error); wake_q_add_safe(wake_q, q->sleeper); From fa60ce2cb4506701c43bd4cf3ca23d970daf1b9c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 6 May 2021 18:06:44 -0700 Subject: [PATCH 1352/1379] treewide: remove editor modelines and cruft The section "19) Editor modelines and other cruft" in Documentation/process/coding-style.rst clearly says, "Do not include any of these in source files." I recently receive a patch to explicitly add a new one. Let's do treewide cleanups, otherwise some people follow the existing code and attempt to upstream their favoriate editor setups. It is even nicer if scripts/checkpatch.pl can check it. If we like to impose coding style in an editor-independent manner, I think editorconfig (patch [1]) is a saner solution. [1] https://lore.kernel.org/lkml/20200703073143.423557-1-danny@kdrag0n.dev/ Link: https://lkml.kernel.org/r/20210324054457.1477489-1-masahiroy@kernel.org Signed-off-by: Masahiro Yamada Acked-by: Geert Uytterhoeven Reviewed-by: Miguel Ojeda [auxdisplay] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m68k/atari/time.c | 7 ------- arch/parisc/include/asm/pdc_chassis.h | 1 - arch/um/drivers/cow.h | 7 ------- drivers/auxdisplay/panel.c | 7 ------- drivers/gpu/drm/qxl/qxl_drv.c | 1 - drivers/media/usb/pwc/pwc-uncompress.c | 3 --- drivers/net/ethernet/adaptec/starfire.c | 8 -------- drivers/net/ethernet/amd/atarilance.c | 8 -------- drivers/net/ethernet/amd/pcnet32.c | 7 ------- .../intersil/orinoco/orinoco_nortel.c | 8 -------- .../wireless/intersil/orinoco/orinoco_pci.c | 8 -------- .../wireless/intersil/orinoco/orinoco_plx.c | 8 -------- .../wireless/intersil/orinoco/orinoco_tmd.c | 8 -------- drivers/parport/parport_ip32.c | 12 ----------- drivers/platform/x86/dell/dell_rbu.c | 3 --- drivers/scsi/53c700.c | 1 - drivers/scsi/53c700.h | 1 - drivers/scsi/ch.c | 6 ------ drivers/scsi/ips.c | 20 ------------------- drivers/scsi/ips.h | 20 ------------------- drivers/scsi/lasi700.c | 1 - drivers/scsi/megaraid/mbox_defs.h | 2 -- drivers/scsi/megaraid/mega_common.h | 2 -- drivers/scsi/megaraid/megaraid_mbox.c | 2 -- drivers/scsi/megaraid/megaraid_mbox.h | 2 -- drivers/scsi/qla1280.c | 12 ----------- drivers/scsi/sni_53c710.c | 1 - drivers/video/fbdev/matrox/matroxfb_base.c | 9 --------- drivers/video/fbdev/vga16fb.c | 10 ---------- fs/configfs/configfs_internal.h | 4 +--- fs/configfs/dir.c | 4 +--- fs/configfs/file.c | 4 +--- fs/configfs/inode.c | 4 +--- fs/configfs/item.c | 4 +--- fs/configfs/mount.c | 4 +--- fs/configfs/symlink.c | 4 +--- fs/nfs/dir.c | 7 ------- fs/nfs/nfs4proc.c | 6 ------ fs/nfs/nfs4renewd.c | 6 ------ fs/nfs/nfs4state.c | 6 ------ fs/nfs/nfs4xdr.c | 6 ------ fs/nfsd/nfs4proc.c | 6 ------ fs/nfsd/nfs4xdr.c | 6 ------ fs/nfsd/xdr4.h | 6 ------ fs/ocfs2/acl.c | 4 +--- fs/ocfs2/acl.h | 4 +--- fs/ocfs2/alloc.c | 4 +--- fs/ocfs2/alloc.h | 4 +--- fs/ocfs2/aops.c | 4 +--- fs/ocfs2/aops.h | 4 +--- fs/ocfs2/blockcheck.c | 4 +--- fs/ocfs2/blockcheck.h | 4 +--- fs/ocfs2/buffer_head_io.c | 4 +--- fs/ocfs2/buffer_head_io.h | 4 +--- fs/ocfs2/cluster/heartbeat.c | 4 +--- fs/ocfs2/cluster/heartbeat.h | 4 +--- fs/ocfs2/cluster/masklog.c | 4 +--- fs/ocfs2/cluster/masklog.h | 4 +--- fs/ocfs2/cluster/netdebug.c | 4 +--- fs/ocfs2/cluster/nodemanager.c | 4 +--- fs/ocfs2/cluster/nodemanager.h | 4 +--- fs/ocfs2/cluster/ocfs2_heartbeat.h | 4 +--- fs/ocfs2/cluster/ocfs2_nodemanager.h | 4 +--- fs/ocfs2/cluster/quorum.c | 4 +--- fs/ocfs2/cluster/quorum.h | 4 +--- fs/ocfs2/cluster/sys.c | 4 +--- fs/ocfs2/cluster/sys.h | 4 +--- fs/ocfs2/cluster/tcp.c | 4 +--- fs/ocfs2/cluster/tcp.h | 4 +--- fs/ocfs2/cluster/tcp_internal.h | 4 +--- fs/ocfs2/dcache.c | 4 +--- fs/ocfs2/dcache.h | 4 +--- fs/ocfs2/dir.c | 4 +--- fs/ocfs2/dir.h | 4 +--- fs/ocfs2/dlm/dlmapi.h | 4 +--- fs/ocfs2/dlm/dlmast.c | 4 +--- fs/ocfs2/dlm/dlmcommon.h | 4 +--- fs/ocfs2/dlm/dlmconvert.c | 4 +--- fs/ocfs2/dlm/dlmconvert.h | 4 +--- fs/ocfs2/dlm/dlmdebug.c | 4 +--- fs/ocfs2/dlm/dlmdebug.h | 4 +--- fs/ocfs2/dlm/dlmdomain.c | 4 +--- fs/ocfs2/dlm/dlmdomain.h | 4 +--- fs/ocfs2/dlm/dlmlock.c | 4 +--- fs/ocfs2/dlm/dlmmaster.c | 4 +--- fs/ocfs2/dlm/dlmrecovery.c | 4 +--- fs/ocfs2/dlm/dlmthread.c | 4 +--- fs/ocfs2/dlm/dlmunlock.c | 4 +--- fs/ocfs2/dlmfs/dlmfs.c | 4 +--- fs/ocfs2/dlmfs/userdlm.c | 4 +--- fs/ocfs2/dlmfs/userdlm.h | 4 +--- fs/ocfs2/dlmglue.c | 4 +--- fs/ocfs2/dlmglue.h | 4 +--- fs/ocfs2/export.c | 4 +--- fs/ocfs2/export.h | 4 +--- fs/ocfs2/extent_map.c | 4 +--- fs/ocfs2/extent_map.h | 4 +--- fs/ocfs2/file.c | 4 +--- fs/ocfs2/file.h | 4 +--- fs/ocfs2/filecheck.c | 4 +--- fs/ocfs2/filecheck.h | 4 +--- fs/ocfs2/heartbeat.c | 4 +--- fs/ocfs2/heartbeat.h | 4 +--- fs/ocfs2/inode.c | 4 +--- fs/ocfs2/inode.h | 4 +--- fs/ocfs2/journal.c | 4 +--- fs/ocfs2/journal.h | 4 +--- fs/ocfs2/localalloc.c | 4 +--- fs/ocfs2/localalloc.h | 4 +--- fs/ocfs2/locks.c | 4 +--- fs/ocfs2/locks.h | 4 +--- fs/ocfs2/mmap.c | 4 +--- fs/ocfs2/move_extents.c | 4 +--- fs/ocfs2/move_extents.h | 4 +--- fs/ocfs2/namei.c | 4 +--- fs/ocfs2/namei.h | 4 +--- fs/ocfs2/ocfs1_fs_compat.h | 4 +--- fs/ocfs2/ocfs2.h | 4 +--- fs/ocfs2/ocfs2_fs.h | 4 +--- fs/ocfs2/ocfs2_ioctl.h | 4 +--- fs/ocfs2/ocfs2_lockid.h | 4 +--- fs/ocfs2/ocfs2_lockingver.h | 4 +--- fs/ocfs2/refcounttree.c | 4 +--- fs/ocfs2/refcounttree.h | 4 +--- fs/ocfs2/reservations.c | 4 +--- fs/ocfs2/reservations.h | 4 +--- fs/ocfs2/resize.c | 4 +--- fs/ocfs2/resize.h | 4 +--- fs/ocfs2/slot_map.c | 4 +--- fs/ocfs2/slot_map.h | 4 +--- fs/ocfs2/stack_o2cb.c | 4 +--- fs/ocfs2/stack_user.c | 4 +--- fs/ocfs2/stackglue.c | 4 +--- fs/ocfs2/stackglue.h | 4 +--- fs/ocfs2/suballoc.c | 4 +--- fs/ocfs2/suballoc.h | 4 +--- fs/ocfs2/super.c | 4 +--- fs/ocfs2/super.h | 4 +--- fs/ocfs2/symlink.c | 4 +--- fs/ocfs2/symlink.h | 4 +--- fs/ocfs2/sysfile.c | 4 +--- fs/ocfs2/sysfile.h | 4 +--- fs/ocfs2/uptodate.c | 4 +--- fs/ocfs2/uptodate.h | 4 +--- fs/ocfs2/xattr.c | 4 +--- fs/ocfs2/xattr.h | 4 +--- fs/reiserfs/procfs.c | 10 ---------- include/linux/configfs.h | 4 +--- include/linux/genl_magic_func.h | 1 - include/linux/genl_magic_struct.h | 1 - include/uapi/linux/if_bonding.h | 11 ---------- include/uapi/linux/nfs4.h | 6 ------ include/xen/interface/elfnote.h | 10 ---------- include/xen/interface/hvm/hvm_vcpu.h | 10 ---------- include/xen/interface/io/xenbus.h | 10 ---------- samples/configfs/configfs_sample.c | 2 -- tools/usb/hcd-tests.sh | 2 -- 157 files changed, 110 insertions(+), 627 deletions(-) diff --git a/arch/m68k/atari/time.c b/arch/m68k/atari/time.c index 1068670cb741..7e44d0e9d0f8 100644 --- a/arch/m68k/atari/time.c +++ b/arch/m68k/atari/time.c @@ -317,10 +317,3 @@ int atari_tt_hwclk( int op, struct rtc_time *t ) return( 0 ); } - -/* - * Local variables: - * c-indent-level: 4 - * tab-width: 8 - * End: - */ diff --git a/arch/parisc/include/asm/pdc_chassis.h b/arch/parisc/include/asm/pdc_chassis.h index ae3e108d22ad..d6d82f53d3d0 100644 --- a/arch/parisc/include/asm/pdc_chassis.h +++ b/arch/parisc/include/asm/pdc_chassis.h @@ -365,4 +365,3 @@ void parisc_pdc_chassis_init(void); PDC_CHASSIS_EOM_SET ) #endif /* _PARISC_PDC_CHASSIS_H */ -/* vim: set ts=8 */ diff --git a/arch/um/drivers/cow.h b/arch/um/drivers/cow.h index 103adac691ed..9a67c017000f 100644 --- a/arch/um/drivers/cow.h +++ b/arch/um/drivers/cow.h @@ -24,10 +24,3 @@ extern void cow_sizes(int version, __u64 size, int sectorsize, int align, int *data_offset_out); #endif - -/* - * --------------------------------------------------------------------------- - * Local variables: - * c-file-style: "linux" - * End: - */ diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index ff5755ee5694..eba04c0de7eb 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -1737,10 +1737,3 @@ module_init(panel_init_module); module_exit(panel_cleanup_module); MODULE_AUTHOR("Willy Tarreau"); MODULE_LICENSE("GPL"); - -/* - * Local variables: - * c-indent-level: 4 - * tab-width: 8 - * End: - */ diff --git a/drivers/gpu/drm/qxl/qxl_drv.c b/drivers/gpu/drm/qxl/qxl_drv.c index 1864467f1063..6754f578fed2 100644 --- a/drivers/gpu/drm/qxl/qxl_drv.c +++ b/drivers/gpu/drm/qxl/qxl_drv.c @@ -1,4 +1,3 @@ -/* vim: set ts=8 sw=8 tw=78 ai noexpandtab */ /* qxl_drv.c -- QXL driver -*- linux-c -*- * * Copyright 2011 Red Hat, Inc. diff --git a/drivers/media/usb/pwc/pwc-uncompress.c b/drivers/media/usb/pwc/pwc-uncompress.c index abfc88391036..68bc3829c6b3 100644 --- a/drivers/media/usb/pwc/pwc-uncompress.c +++ b/drivers/media/usb/pwc/pwc-uncompress.c @@ -9,9 +9,6 @@ Please send bug reports and support requests to . The decompression routines have been implemented by reverse-engineering the Nemosoft binary pwcx module. Caveat emptor. - - - vim: set ts=8: */ #include diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c index 555299737b51..7965e5e3c985 100644 --- a/drivers/net/ethernet/adaptec/starfire.c +++ b/drivers/net/ethernet/adaptec/starfire.c @@ -2070,11 +2070,3 @@ static void __exit starfire_cleanup (void) module_init(starfire_init); module_exit(starfire_cleanup); - - -/* - * Local variables: - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/ethernet/amd/atarilance.c b/drivers/net/ethernet/amd/atarilance.c index 961796abab35..c1eab916438f 100644 --- a/drivers/net/ethernet/amd/atarilance.c +++ b/drivers/net/ethernet/amd/atarilance.c @@ -1156,11 +1156,3 @@ static void __exit atarilance_module_exit(void) module_init(atarilance_module_init); module_exit(atarilance_module_exit); #endif /* MODULE */ - - -/* - * Local variables: - * c-indent-level: 4 - * tab-width: 4 - * End: - */ diff --git a/drivers/net/ethernet/amd/pcnet32.c b/drivers/net/ethernet/amd/pcnet32.c index aa412506832d..4100ab07e6b7 100644 --- a/drivers/net/ethernet/amd/pcnet32.c +++ b/drivers/net/ethernet/amd/pcnet32.c @@ -3029,10 +3029,3 @@ static void __exit pcnet32_cleanup_module(void) module_init(pcnet32_init_module); module_exit(pcnet32_cleanup_module); - -/* - * Local variables: - * c-indent-level: 4 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c b/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c index 96a03d10a080..18bd0d9876c2 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c +++ b/drivers/net/wireless/intersil/orinoco/orinoco_nortel.c @@ -312,11 +312,3 @@ static void __exit orinoco_nortel_exit(void) module_init(orinoco_nortel_init); module_exit(orinoco_nortel_exit); - -/* - * Local variables: - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_pci.c b/drivers/net/wireless/intersil/orinoco/orinoco_pci.c index f3c86b07b1b9..7e3a6dd60c15 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco_pci.c +++ b/drivers/net/wireless/intersil/orinoco/orinoco_pci.c @@ -255,11 +255,3 @@ static void __exit orinoco_pci_exit(void) module_init(orinoco_pci_init); module_exit(orinoco_pci_exit); - -/* - * Local variables: - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_plx.c b/drivers/net/wireless/intersil/orinoco/orinoco_plx.c index 16dada94c774..73e6ae124013 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco_plx.c +++ b/drivers/net/wireless/intersil/orinoco/orinoco_plx.c @@ -360,11 +360,3 @@ static void __exit orinoco_plx_exit(void) module_init(orinoco_plx_init); module_exit(orinoco_plx_exit); - -/* - * Local variables: - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c b/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c index 9a9d335611ac..939d5a1dce97 100644 --- a/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c +++ b/drivers/net/wireless/intersil/orinoco/orinoco_tmd.c @@ -235,11 +235,3 @@ static void __exit orinoco_tmd_exit(void) module_init(orinoco_tmd_init); module_exit(orinoco_tmd_exit); - -/* - * Local variables: - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c index 48b084e86dc6..0919ed99ba94 100644 --- a/drivers/parport/parport_ip32.c +++ b/drivers/parport/parport_ip32.c @@ -2224,15 +2224,3 @@ MODULE_PARM_DESC(features, ", bit 2: hardware SPP mode" ", bit 3: hardware EPP mode" ", bit 4: hardware ECP mode"); - -/*--- Inform (X)Emacs about preferred coding style ---------------------*/ -/* - * Local Variables: - * mode: c - * c-file-style: "linux" - * indent-tabs-mode: t - * tab-width: 8 - * fill-column: 78 - * ispell-local-dictionary: "american" - * End: - */ diff --git a/drivers/platform/x86/dell/dell_rbu.c b/drivers/platform/x86/dell/dell_rbu.c index 03c3ff34bcf5..085ad0a0d22e 100644 --- a/drivers/platform/x86/dell/dell_rbu.c +++ b/drivers/platform/x86/dell/dell_rbu.c @@ -675,6 +675,3 @@ static __exit void dcdrbu_exit(void) module_exit(dcdrbu_exit); module_init(dcdrbu_init); - -/* vim:noet:ts=8:sw=8 -*/ diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c index ab42feab233f..77ccb96e5ed4 100644 --- a/drivers/scsi/53c700.c +++ b/drivers/scsi/53c700.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8 -*- */ /* NCR (or Symbios) 53c700 and 53c700-66 Driver * diff --git a/drivers/scsi/53c700.h b/drivers/scsi/53c700.h index c9f8c497babb..2df347ca91af 100644 --- a/drivers/scsi/53c700.h +++ b/drivers/scsi/53c700.h @@ -1,5 +1,4 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* -*- mode: c; c-basic-offset: 8 -*- */ /* Driver for 53c700 and 53c700-66 chips from NCR and Symbios * diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c index cb74ab1ae5a4..9b89c26ccfdb 100644 --- a/drivers/scsi/ch.c +++ b/drivers/scsi/ch.c @@ -1058,9 +1058,3 @@ static void __exit exit_ch_module(void) module_init(init_ch_module); module_exit(exit_ch_module); - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/drivers/scsi/ips.c b/drivers/scsi/ips.c index 1a3c534826ba..bc33d54a4011 100644 --- a/drivers/scsi/ips.c +++ b/drivers/scsi/ips.c @@ -7099,23 +7099,3 @@ ips_init_phase2(int index) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IBM ServeRAID Adapter Driver " IPS_VER_STRING); MODULE_VERSION(IPS_VER_STRING); - - -/* - * Overrides for Emacs so that we almost follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 2 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -2 - * c-argdecl-indent: 2 - * c-label-offset: -2 - * c-continued-statement-offset: 2 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff --git a/drivers/scsi/ips.h b/drivers/scsi/ips.h index 6c0678fb9a67..65edf000e447 100644 --- a/drivers/scsi/ips.h +++ b/drivers/scsi/ips.h @@ -1211,23 +1211,3 @@ typedef struct { IPS_COMPAT_TAMPA, \ IPS_COMPAT_KEYWEST \ } - - -/* - * Overrides for Emacs so that we almost follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 2 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -2 - * c-argdecl-indent: 2 - * c-label-offset: -2 - * c-continued-statement-offset: 2 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff --git a/drivers/scsi/lasi700.c b/drivers/scsi/lasi700.c index de71d240a56f..6d14a7a94d0b 100644 --- a/drivers/scsi/lasi700.c +++ b/drivers/scsi/lasi700.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8 -*- */ /* PARISC LASI driver for the 53c700 chip * diff --git a/drivers/scsi/megaraid/mbox_defs.h b/drivers/scsi/megaraid/mbox_defs.h index 01a1bfb8ea2a..f0ef8f7f82c1 100644 --- a/drivers/scsi/megaraid/mbox_defs.h +++ b/drivers/scsi/megaraid/mbox_defs.h @@ -781,5 +781,3 @@ typedef struct { } __attribute__ ((packed)) mbox_sgl32; #endif // _MRAID_MBOX_DEFS_H_ - -/* vim: set ts=8 sw=8 tw=78: */ diff --git a/drivers/scsi/megaraid/mega_common.h b/drivers/scsi/megaraid/mega_common.h index 3a7596e47a88..2ad0aa2f837d 100644 --- a/drivers/scsi/megaraid/mega_common.h +++ b/drivers/scsi/megaraid/mega_common.h @@ -282,5 +282,3 @@ struct mraid_pci_blk { }; #endif // _MEGA_COMMON_H_ - -// vim: set ts=8 sw=8 tw=78: diff --git a/drivers/scsi/megaraid/megaraid_mbox.c b/drivers/scsi/megaraid/megaraid_mbox.c index b1a2d3536add..145fde302d7d 100644 --- a/drivers/scsi/megaraid/megaraid_mbox.c +++ b/drivers/scsi/megaraid/megaraid_mbox.c @@ -4068,5 +4068,3 @@ megaraid_sysfs_show_ldnum(struct device *dev, struct device_attribute *attr, cha */ module_init(megaraid_init); module_exit(megaraid_exit); - -/* vim: set ts=8 sw=8 tw=78 ai si: */ diff --git a/drivers/scsi/megaraid/megaraid_mbox.h b/drivers/scsi/megaraid/megaraid_mbox.h index 3e4347c6dab1..d2fe7f69cd5d 100644 --- a/drivers/scsi/megaraid/megaraid_mbox.h +++ b/drivers/scsi/megaraid/megaraid_mbox.h @@ -230,5 +230,3 @@ typedef struct { #define WROUTDOOR(rdev, value) writel(value, (rdev)->baseaddr + 0x2C) #endif // _MEGARAID_H_ - -// vim: set ts=8 sw=8 tw=78: diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index 8f35174a1f9a..928da90b79be 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -4403,15 +4403,3 @@ MODULE_FIRMWARE("qlogic/1040.bin"); MODULE_FIRMWARE("qlogic/1280.bin"); MODULE_FIRMWARE("qlogic/12160.bin"); MODULE_VERSION(QLA1280_VERSION); - -/* - * Overrides for Emacs so that we almost follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/drivers/scsi/sni_53c710.c b/drivers/scsi/sni_53c710.c index 97c6f81b1d2a..678651b9b4dd 100644 --- a/drivers/scsi/sni_53c710.c +++ b/drivers/scsi/sni_53c710.c @@ -1,5 +1,4 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8 -*- */ /* SNI RM driver * diff --git a/drivers/video/fbdev/matrox/matroxfb_base.c b/drivers/video/fbdev/matrox/matroxfb_base.c index a3853421b263..4325bf7f388c 100644 --- a/drivers/video/fbdev/matrox/matroxfb_base.c +++ b/drivers/video/fbdev/matrox/matroxfb_base.c @@ -2608,12 +2608,3 @@ EXPORT_SYMBOL(matroxfb_register_driver); EXPORT_SYMBOL(matroxfb_unregister_driver); EXPORT_SYMBOL(matroxfb_wait_for_sync); EXPORT_SYMBOL(matroxfb_enable_irq); - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * --------------------------------------------------------------------------- - * Local variables: - * c-basic-offset: 8 - * End: - */ - diff --git a/drivers/video/fbdev/vga16fb.c b/drivers/video/fbdev/vga16fb.c index 1e8a38a7967d..e2757ff1c23d 100644 --- a/drivers/video/fbdev/vga16fb.c +++ b/drivers/video/fbdev/vga16fb.c @@ -1451,13 +1451,3 @@ MODULE_DESCRIPTION("Legacy VGA framebuffer device driver"); MODULE_LICENSE("GPL"); module_init(vga16fb_init); module_exit(vga16fb_exit); - - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * --------------------------------------------------------------------------- - * Local variables: - * c-basic-offset: 8 - * End: - */ - diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 9a3aed249692..c0395363eab9 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset:8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * configfs_internal.h - Internal stuff for configfs * * Based on sysfs: diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b6098e02e20b..ac5e0c0e9181 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dir.c - Operations for configfs directories. * * Based on sysfs: diff --git a/fs/configfs/file.c b/fs/configfs/file.c index da8351d1e455..e26060dae70a 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * file.c - operations for regular (text) files. * * Based on sysfs: diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 42c348bb2903..eb5ec3e46283 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * inode.c - basic inode and dentry operations. * * Based on sysfs: diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 704a4356f137..254170a82aa3 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * item.c - library routines for handling generic config items * * Based on kobject: diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 0c6e8cf61953..c2d820063ec4 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * mount.c - operations for initializing and mounting configfs. * * Based on sysfs: diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 77c854364e60..0623c3edcfb9 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * symlink.c - operations for configfs symlinks. * * Based on sysfs: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fc4f490f2d78..3d8e3698d3df 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -3004,10 +3004,3 @@ out_notsup: goto out; } EXPORT_SYMBOL_GPL(nfs_permission); - -/* - * Local variables: - * version-control: t - * kept-new-versions: 5 - * End: - */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c65c4b41e2c1..545010d6cbf3 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -10427,9 +10427,3 @@ const struct xattr_handler *nfs4_xattr_handlers[] = { #endif NULL }; - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index ff876dda7f06..db3811af0796 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -149,9 +149,3 @@ void nfs4_set_lease_period(struct nfs_client *clp, /* Cap maximum reconnect timeout at 1/2 lease period */ rpc_set_connect_timeout(clp->cl_rpcclient, lease, lease >> 1); } - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3a51351bdc6a..2eec5bbb55c8 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2695,9 +2695,3 @@ static int nfs4_run_state_manager(void *ptr) module_put_and_exit(0); return 0; } - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ac6b79ee9355..d4fd3be0e8ca 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7629,9 +7629,3 @@ const struct rpc_version nfs_version4 = { .procs = nfs4_procedures, .counts = nfs_version4_counts, }; - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index daf43b980d4b..f4ce93d7f26e 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -3317,9 +3317,3 @@ const struct svc_version nfsd_version4 = { .vs_rpcb_optnl = true, .vs_need_cong_ctrl = true, }; - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e0f06d3cbd44..7abeccb975b2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -5448,9 +5448,3 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p) nfsd4_sequence_done(resp); return 1; } - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index fe540a3415c6..a7c425254fee 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -866,9 +866,3 @@ struct nfsd4_operation { #endif - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 5259badabb56..5c72a7e6d6c5 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * acl.c * * Copyright (C) 2004, 2008 Oracle. All rights reserved. diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 4e86450917b2..f59d8d0a61fa 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * acl.h * * Copyright (C) 2004, 2008 Oracle. All rights reserved. diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 78710788c237..e032f2e2c2c5 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * alloc.c * * Extent allocs and frees diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 7f973dd76dbc..4af7abaa6e40 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * alloc.h * * Function prototypes diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ad20403b383f..1294925ac94a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2002, 2004 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 70ed4382750d..3a520117fa59 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index dabfef9c2bc0..863a5316030b 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * blockcheck.c * * Checksum and ECC codes for the OCFS2 userspace library. diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h index 8f17d2c85f40..d0578e98ee8d 100644 --- a/fs/ocfs2/blockcheck.h +++ b/fs/ocfs2/blockcheck.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * blockcheck.h * * Checksum and ECC codes for the OCFS2 userspace library. diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index f0b104e483d8..e7758778abef 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * io.c * * Buffer cache handling diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index 1c5e533fba04..2d51649fc090 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_buffer_head.h * * Buffer cache handling functions defined diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 12a7590601dd..e829c2595543 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index beed31ea86cf..1d4100abf6f8 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * heartbeat.h * * Function prototypes diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 1d696c96b8b2..810d32815593 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 446e452ac7a6..b73fc42e46ff 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 667a5c5e1f66..7524994e3199 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * netdebug.c * * debug functionality for o2net diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 7a7640c59f3c..bb82e6b1ff4e 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 3e0006631cc4..3490e77a952d 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * nodemanager.h * * Function prototypes diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h index 760d850be11e..6088c9f974dd 100644 --- a/fs/ocfs2/cluster/ocfs2_heartbeat.h +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_heartbeat.h * * On-disk structures for ocfs2_heartbeat diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h index 21ad307419a8..c9a0b77443e7 100644 --- a/fs/ocfs2/cluster/ocfs2_nodemanager.h +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_nodemanager.h * * Header describing the interface between userspace and the kernel diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index cea739be77c4..189c111bc371 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: +/* * * Copyright (C) 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h index 6d45ce8b18a1..d64bf4482a4a 100644 --- a/fs/ocfs2/cluster/quorum.h +++ b/fs/ocfs2/cluster/quorum.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index d6067c3d84c1..022f716c74ff 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * sys.c * * OCFS2 cluster sysfs interface diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h index ce380517cf17..70aaba65317e 100644 --- a/fs/ocfs2/cluster/sys.h +++ b/fs/ocfs2/cluster/sys.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * sys.h * * Function prototypes for o2cb sysfs interface diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 3bd8119bed5e..f660c0dbdb63 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: +/* * * Copyright (C) 2004 Oracle. All rights reserved. * diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 736338f45c59..a75b551d31c7 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * tcp.h * * Function prototypes diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index e6a2b9dfcd16..601c99bd2611 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * Copyright (C) 2005 Oracle. All rights reserved. */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 42a61eecdacd..04fc8344063a 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dcache.c * * dentry cache handling code diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index 3686a52ba143..7f246c5692d8 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dcache.h * * Function prototypes diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index bdfba9db558a..bd8d534f11cb 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dir.c * * Creates, reads, walks and deletes directory-nodes diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index e3e7d5dd29e8..4b9f5a12c7d2 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dir.h * * Function prototypes diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h index 6456c0fbcbb2..bae60ca2672a 100644 --- a/fs/ocfs2/dlm/dlmapi.h +++ b/fs/ocfs2/dlm/dlmapi.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmapi.h * * externally exported dlm interfaces diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 70a10764f249..c681ba957932 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmast.c * * AST and BAST functionality for local and remote nodes diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 58d57e25d384..fd2022712167 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmcommon.h * * Copyright (C) 2004 Oracle. All rights reserved. diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 6051edc33aef..450d46eefab3 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmconvert.c * * underlying calls for lock conversion diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h index 12d9c28bc52f..1f371716513b 100644 --- a/fs/ocfs2/dlm/dlmconvert.h +++ b/fs/ocfs2/dlm/dlmconvert.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmconvert.h * * Copyright (C) 2004 Oracle. All rights reserved. diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 4b8b41d23e91..d442cf5dda8a 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmdebug.c * * debug functionality for the dlm diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index f8fd8680a4b6..e08f7357e7ec 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmdebug.h * * Copyright (C) 2008 Oracle. All rights reserved. diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 357cfc702ce3..9f90fc9551e1 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmdomain.c * * defines domain join / leave apis diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h index 7c21664d23d0..815abe30ad09 100644 --- a/fs/ocfs2/dlm/dlmdomain.h +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmdomain.h * * Copyright (C) 2004 Oracle. All rights reserved. diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 83f0760e4fba..041fd1791ae7 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmlock.c * * underlying calls for lock creation diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index f105746063ed..4960a6de768d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmmod.c * * standalone DLM module diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index afc51736686c..0e7aad1b11cc 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmrecovery.c * * recovery stuff diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 5ccc4ff0b82a..c350bd4df770 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmthread.c * * standalone DLM module diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index dcb17ca8ae74..61103b2d69fb 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmunlock.c * * underlying calls for unlocking locks diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b2870f1a31df..fa0a14f199eb 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmfs.c * * Code which implements the kernel side of a minimal userspace diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c index 339f098d9592..29f183a15798 100644 --- a/fs/ocfs2/dlmfs/userdlm.c +++ b/fs/ocfs2/dlmfs/userdlm.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * userdlm.c * * Code which implements the kernel side of a minimal userspace diff --git a/fs/ocfs2/dlmfs/userdlm.h b/fs/ocfs2/dlmfs/userdlm.h index 0558ae768200..47ba18eac423 100644 --- a/fs/ocfs2/dlmfs/userdlm.h +++ b/fs/ocfs2/dlmfs/userdlm.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * userdlm.h * * Userspace dlm defines diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 0fbe8bf7190f..48fd369c29a4 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmglue.c * * Code which implements an OCFS2 specific interface to our DLM. diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index b8fbed25df89..e5da5809ed95 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dlmglue.h * * description here diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 69ed278dd84d..eaa8c80ace3c 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * export.c * * Functions to facilitate NFS exporting diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h index d485da0c3439..636357400505 100644 --- a/fs/ocfs2/export.h +++ b/fs/ocfs2/export.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * export.h * * Function prototypes diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 7b93e9c766f6..70a768b623cf 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * extent_map.c * * Block/Cluster mapping functions diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index e5464f6cee8a..bc4ed59fb925 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * extent_map.h * * In-memory file extent mappings for OCFS2. diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index db8a6265b749..f17c3d33fb18 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * file.c * * File open, close, extend, truncate diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 8536cec5f122..71db8f3aa027 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * file.h * * Function prototypes diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 50f11bfdc8c2..90b8d300c1ee 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * filecheck.c * * Code which implements online file check. diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h index 4d006777ac54..d3bcb8bcfeb0 100644 --- a/fs/ocfs2/filecheck.h +++ b/fs/ocfs2/filecheck.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * filecheck.h * * Online file check. diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 60c5f995d30c..9099d8fc7599 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * heartbeat.c * * Register ourselves with the heartbaet service, keep our node maps diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h index 5fedb2d35dc0..f1f8b1802fe4 100644 --- a/fs/ocfs2/heartbeat.h +++ b/fs/ocfs2/heartbeat.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * heartbeat.h * * Function prototypes diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7c9dfd50c1c1..bc8f32fab964 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * inode.c * * vfs' aops, fops, dops and iops diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 51a4f7197987..82b28fdacc7e 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * inode.h * * Function prototypes diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index db52e843002a..4e589ce2fce6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * journal.c * * Defines functions of journalling api diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index bfe611ed1b1d..d158acb8b38a 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * journal.h * * Defines journalling api and structures. diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index fc8252a28cb1..5f6bacbeef6b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * localalloc.c * * Node local data allocation diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index e8a5cea48639..08f925b7ec6d 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * localalloc.h * * Function prototypes diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 7edc4e5c7c2c..fab7c6a4a7d0 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * locks.c * * Userspace file locking support diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h index 389fe1fce3a5..b52de3947d5f 100644 --- a/fs/ocfs2/locks.h +++ b/fs/ocfs2/locks.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * locks.h * * Function prototypes for Userspace file locking support diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 25cabbfe87fc..1834f26522ed 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * mmap.c * * Code to deal with the mess that is clustered mmap. diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 758d9661ef1e..192cad0662d8 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * move_extents.c * * Copyright (C) 2011 Oracle. All rights reserved. diff --git a/fs/ocfs2/move_extents.h b/fs/ocfs2/move_extents.h index 28cac43892c5..987f9e559f30 100644 --- a/fs/ocfs2/move_extents.h +++ b/fs/ocfs2/move_extents.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * move_extents.h * * Copyright (C) 2011 Oracle. All rights reserved. diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 05ced86580d1..2c46ff6ba4ea 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * namei.c * * Create and rename file, directory, symlinks diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index cc091ed02b4a..9cc891eb874e 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * namei.h * * Function prototypes diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h index 01ae48c4834d..6dbcf3d467fb 100644 --- a/fs/ocfs2/ocfs1_fs_compat.h +++ b/fs/ocfs2/ocfs1_fs_compat.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs1_fs_compat.h * * OCFS1 volume header definitions. OCFS2 creates valid but unmountable diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7993d527edae..bb62cc2e0211 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2.h * * Defines macros and structures used in OCFS2 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 19137c6d087b..638d875eccc7 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_fs.h * * On-disk structures for OCFS2. diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 273616bd4f19..9680797bc531 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_ioctl.h * * Defines OCFS2 ioctls. diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index b4be84956bc1..8ac357ce6a30 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_lockid.h * * Defines OCFS2 lockid bits. diff --git a/fs/ocfs2/ocfs2_lockingver.h b/fs/ocfs2/ocfs2_lockingver.h index 5c9c105b33ee..31a5e1619e7f 100644 --- a/fs/ocfs2/ocfs2_lockingver.h +++ b/fs/ocfs2/ocfs2_lockingver.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * ocfs2_lockingver.h * * Defines OCFS2 Locking version values. diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c19a463fac55..7f6355cbb587 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * refcounttree.c * * Copyright (C) 2009 Oracle. All rights reserved. diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 0b9014495726..8197a94feec0 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * refcounttree.h * * Copyright (C) 2009 Oracle. All rights reserved. diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index bf3842e34fb9..769e466887b0 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * reservations.c * * Allocation reservations implementation diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h index 6ac88122896d..677c50663595 100644 --- a/fs/ocfs2/reservations.h +++ b/fs/ocfs2/reservations.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * reservations.h * * Allocation reservations function prototypes and structures. diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 24eb52f9059c..d65d43c61857 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * resize.c * * volume resize. diff --git a/fs/ocfs2/resize.h b/fs/ocfs2/resize.h index 0af0c023042c..4990637219ef 100644 --- a/fs/ocfs2/resize.h +++ b/fs/ocfs2/resize.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * resize.h * * Function prototypes diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 4da0e4b1e79b..0b0ae3ebb0cf 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * slot_map.c * * Copyright (C) 2002, 2004 Oracle. All rights reserved. diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index 93b53e73f0f7..a43644570b53 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * slotmap.h * * description here diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index f70012038383..88f75f7f02d7 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * stack_o2cb.c * * Code which interfaces ocfs2 with the o2cb stack. diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 7397064c3f35..85a47621e0c0 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * stack_user.c * * Code which interfaces ocfs2 with fs/dlm and a userspace stack. diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 8d33ebc6b6fc..d50e8b8dfea4 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * stackglue.c * * Code which implements an OCFS2 specific interface to underlying diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index e9d26cbeb3b8..3636847fae19 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * stackglue.h * * Glue to the underlying cluster stack. diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8c8cf7f4eb34..8521942f5af2 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * suballoc.c * * metadata alloc and free diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 50b36250beb6..5805a03d100b 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * suballoc.h * * Defines sub allocator api diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 079f8826993e..c86bd4e60e20 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * super.c * * load/unload driver, mount/dismount volumes diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 76facaf63336..8312651135b9 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * super.h * * Function prototypes diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 94cfacc9bad7..f755a4985821 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * linux/cluster/ssi/cfs/symlink.c * * This program is free software; you can redistribute it and/or diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h index 167094d1e5aa..ffcf0210545c 100644 --- a/fs/ocfs2/symlink.h +++ b/fs/ocfs2/symlink.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * symlink.h * * Function prototypes diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index bb701c4e449f..53a945da873b 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * sysfile.c * * Initialize, read, write, etc. system files. diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h index a83dd962fccb..2b38c75990fd 100644 --- a/fs/ocfs2/sysfile.h +++ b/fs/ocfs2/sysfile.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * sysfile.h * * Function prototypes diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 580852ba05c4..09854925fa5c 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * uptodate.c * * Tracking the up-to-date-ness of a local buffer_head with respect to diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 77a30cae4879..85d94134001b 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * uptodate.h * * Cluster uptodate tracking diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 36ae47a4aef6..dd784eb0cd7c 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * xattr.c * * Copyright (C) 2004, 2008 Oracle. All rights reserved. diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 9c80382da1f5..00308b57f64f 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * xattr.h * * Copyright (C) 2004, 2008 Oracle. All rights reserved. diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 155b82870333..4a7cb16e9345 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -488,13 +488,3 @@ int reiserfs_proc_info_global_done(void) * (available at http://www.namesys.com/legalese.html) * */ - -/* - * Make Linus happy. - * Local variables: - * c-indentation-style: "K&R" - * mode-name: "LC" - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 2e8c69b43c64..97cfd13bae51 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * configfs.h - definitions for the device driver filesystem * * Based on sysfs: diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 6cb82301d8e9..939b1a8f571b 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -404,4 +404,3 @@ s_fields \ /* }}}1 */ #endif /* GENL_MAGIC_FUNC_H */ -/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */ diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index 35d21fddaf2d..f81d48987528 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -283,4 +283,3 @@ enum { \ /* }}}1 */ #endif /* GENL_MAGIC_STRUCT_H */ -/* vim: set foldmethod=marker nofoldenable : */ diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index e8eb4ad03cf1..d174914a837d 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -153,14 +153,3 @@ enum { #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1) #endif /* _LINUX_IF_BONDING_H */ - -/* - * Local variables: - * version-control: t - * kept-new-versions: 5 - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ - diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index ed5415e0f1c1..800bb0ffa6e6 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -178,9 +178,3 @@ #define NFS4_MAX_BACK_CHANNEL_OPS 2 #endif /* _UAPI_LINUX_NFS4_H */ - -/* - * Local variables: - * c-basic-offset: 8 - * End: - */ diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index 9e9f9bf7c66d..449bd383cb76 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -208,13 +208,3 @@ #define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ - -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/include/xen/interface/hvm/hvm_vcpu.h b/include/xen/interface/hvm/hvm_vcpu.h index 32ca83edd44d..bfc2138e0bf5 100644 --- a/include/xen/interface/hvm/hvm_vcpu.h +++ b/include/xen/interface/hvm/hvm_vcpu.h @@ -131,13 +131,3 @@ struct vcpu_hvm_context { typedef struct vcpu_hvm_context vcpu_hvm_context_t; #endif /* __XEN_PUBLIC_HVM_HVM_VCPU_H__ */ - -/* - * Local variables: - * mode: C - * c-file-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ diff --git a/include/xen/interface/io/xenbus.h b/include/xen/interface/io/xenbus.h index aaf2951b1cce..fb8716112251 100644 --- a/include/xen/interface/io/xenbus.h +++ b/include/xen/interface/io/xenbus.h @@ -39,13 +39,3 @@ enum xenbus_state }; #endif /* _XEN_PUBLIC_IO_XENBUS_H */ - -/* - * Local variables: - * c-file-style: "linux" - * indent-tabs-mode: t - * c-indent-level: 8 - * c-basic-offset: 8 - * tab-width: 8 - * End: - */ diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c index f9008be7a8a1..37a657b25d58 100644 --- a/samples/configfs/configfs_sample.c +++ b/samples/configfs/configfs_sample.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * vim: noexpandtab ts=8 sts=0 sw=8: - * * configfs_example_macros.c - This file is a demonstration module * containing a number of configfs subsystems. It uses the helper * macros defined by configfs.h diff --git a/tools/usb/hcd-tests.sh b/tools/usb/hcd-tests.sh index e8cad6a4f9c9..73f914d13f5c 100644 --- a/tools/usb/hcd-tests.sh +++ b/tools/usb/hcd-tests.sh @@ -272,5 +272,3 @@ do echo '' done done - -# vim: sw=4 From f0953a1bbaca71e1ebbcb9864eb1b273156157ed Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 6 May 2021 18:06:47 -0700 Subject: [PATCH 1353/1379] mm: fix typos in comments Fix ~94 single-word typos in locking code comments, plus a few very obvious grammar mistakes. Link: https://lkml.kernel.org/r/20210322212624.GA1963421@gmail.com Link: https://lore.kernel.org/r/20210322205203.GB1959563@gmail.com Signed-off-by: Ingo Molnar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Randy Dunlap Cc: Bhaskar Chowdhury Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- include/linux/vmalloc.h | 4 ++-- mm/balloon_compaction.c | 4 ++-- mm/compaction.c | 4 ++-- mm/filemap.c | 2 +- mm/gup.c | 2 +- mm/highmem.c | 2 +- mm/huge_memory.c | 6 +++--- mm/hugetlb.c | 6 +++--- mm/internal.h | 2 +- mm/kasan/kasan.h | 8 ++++---- mm/kasan/quarantine.c | 4 ++-- mm/kasan/shadow.c | 4 ++-- mm/kfence/report.c | 2 +- mm/khugepaged.c | 2 +- mm/ksm.c | 4 ++-- mm/madvise.c | 4 ++-- mm/memcontrol.c | 18 +++++++++--------- mm/memory-failure.c | 2 +- mm/memory.c | 10 +++++----- mm/mempolicy.c | 4 ++-- mm/migrate.c | 8 ++++---- mm/mmap.c | 4 ++-- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- mm/oom_kill.c | 2 +- mm/page-writeback.c | 4 ++-- mm/page_alloc.c | 14 +++++++------- mm/page_owner.c | 2 +- mm/percpu-internal.h | 2 +- mm/percpu.c | 2 +- mm/pgalloc-track.h | 6 +++--- mm/slab.c | 6 +++--- mm/slub.c | 2 +- mm/swap_slots.c | 2 +- mm/vmalloc.c | 6 +++--- mm/vmstat.c | 2 +- mm/zpool.c | 2 +- mm/zsmalloc.c | 2 +- 39 files changed, 83 insertions(+), 83 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 76e27ebb28a3..322ec61d0da7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -106,7 +106,7 @@ extern int mmap_rnd_compat_bits __read_mostly; * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. - * It's defined as noop for arcitectures that don't support memory tagging. + * It's defined as noop for architectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b6ff16393bf6..4d668abb6391 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -33,7 +33,7 @@ struct notifier_block; /* in notifier.h */ * * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after * shadow memory has been mapped. It's used to handle allocation errors so that - * we don't try to poision shadow on free if it was never allocated. + * we don't try to poison shadow on free if it was never allocated. * * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to * determine which allocations need the module shadow freed. @@ -43,7 +43,7 @@ struct notifier_block; /* in notifier.h */ /* * Maximum alignment for ioremap() regions. - * Can be overriden by arch-specific value. + * Can be overridden by arch-specific value. */ #ifndef IOREMAP_MAX_ORDER #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 26de020aae7b..907fefde2572 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(balloon_page_list_enqueue); /** * balloon_page_list_dequeue() - removes pages from balloon's page list and * returns a list of the pages. - * @b_dev_info: balloon device decriptor where we will grab a page from. + * @b_dev_info: balloon device descriptor where we will grab a page from. * @pages: pointer to the list of pages that would be returned to the caller. * @n_req_pages: number of requested pages. * @@ -157,7 +157,7 @@ EXPORT_SYMBOL_GPL(balloon_page_enqueue); /* * balloon_page_dequeue - removes a page from balloon's page list and returns * its address to allow the driver to release the page. - * @b_dev_info: balloon device decriptor where we will grab a page from. + * @b_dev_info: balloon device descriptor where we will grab a page from. * * Driver must call this function to properly dequeue a previously enqueued page * before definitively releasing it back to the guest system. diff --git a/mm/compaction.c b/mm/compaction.c index 3a6c6b821f80..84fde270ae74 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2012,8 +2012,8 @@ static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low) unsigned int wmark_low; /* - * Cap the low watermak to avoid excessive compaction - * activity in case a user sets the proactivess tunable + * Cap the low watermark to avoid excessive compaction + * activity in case a user sets the proactiveness tunable * close to 100 (maximum). */ wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); diff --git a/mm/filemap.c b/mm/filemap.c index 7fadf211643c..66f7e9fdfbc4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2755,7 +2755,7 @@ unsigned int seek_page_size(struct xa_state *xas, struct page *page) * entirely memory-based such as tmpfs, and filesystems which support * unwritten extents. * - * Return: The requested offset on successs, or -ENXIO if @whence specifies + * Return: The requested offset on success, or -ENXIO if @whence specifies * SEEK_DATA and there is no data after @start. There is an implicit hole * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start * and @end contain data. diff --git a/mm/gup.c b/mm/gup.c index aa09535cf4d4..0697134b6a12 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1575,7 +1575,7 @@ finish_or_fault: * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. + * allowing a hole to be left in the corefile to save disk space. * * Called without mmap_lock (takes and releases the mmap_lock by itself). */ diff --git a/mm/highmem.c b/mm/highmem.c index e389337e00b4..4fb51d735aa6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -519,7 +519,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) /* * Disable migration so resulting virtual address is stable - * accross preemption. + * across preemption. */ migrate_disable(); preempt_disable(); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 98456017744d..63ed6b25deaa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1792,8 +1792,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, /* * Returns * - 0 if PMD could not be locked - * - 1 if PMD was locked but protections unchange and TLB flush unnecessary - * - HPAGE_PMD_NR is protections changed and TLB flush necessary + * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary + * - HPAGE_PMD_NR if protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags) @@ -2469,7 +2469,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_lock(&swap_cache->i_pages); } - /* lock lru list/PageCompound, ref freezed by page_ref_freeze */ + /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = lock_page_lruvec(head); for (i = nr - 1; i >= 1; i--) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 629aa4c2259c..3db405dea3dc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -466,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv, resv->region_cache_count; /* At this point, we should have enough entries in the cache - * for all the existings adds_in_progress. We should only be + * for all the existing adds_in_progress. We should only be * needing to allocate for regions_needed. */ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); @@ -5536,8 +5536,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); /* - * vma need span at least one aligned PUD size and the start,end range - * must at least partialy within it. + * vma needs to span at least one aligned PUD size, and the range + * must be at least partially within in. */ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || (*end <= v_start) || (*start >= v_end)) diff --git a/mm/internal.h b/mm/internal.h index feeaaf06705d..54bd0dc2c23c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -334,7 +334,7 @@ static inline bool is_exec_mapping(vm_flags_t flags) } /* - * Stack area - atomatically grows in one direction + * Stack area - automatically grows in one direction * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 3820ca54743b..8f450bc28045 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -55,9 +55,9 @@ extern bool kasan_flag_async __ro_after_init; #define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ #ifdef CONFIG_KASAN_HW_TAGS -#define KASAN_TAG_MIN 0xF0 /* mimimum value for random tags */ +#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */ #else -#define KASAN_TAG_MIN 0x00 /* mimimum value for random tags */ +#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */ #endif #ifdef CONFIG_KASAN_GENERIC @@ -403,7 +403,7 @@ static inline bool kasan_byte_accessible(const void *addr) #else /* CONFIG_KASAN_HW_TAGS */ /** - * kasan_poison - mark the memory range as unaccessible + * kasan_poison - mark the memory range as inaccessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE * @size - range size, must be aligned to KASAN_GRANULE_SIZE * @value - value that's written to metadata for the range @@ -434,7 +434,7 @@ bool kasan_byte_accessible(const void *addr); /** * kasan_poison_last_granule - mark the last granule of the memory range as - * unaccessible + * inaccessible * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE * @size - range size * diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 728fb24c5683..d8ccff4c1275 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -27,7 +27,7 @@ /* Data structure and operations for quarantine queues. */ /* - * Each queue is a signle-linked list, which also stores the total size of + * Each queue is a single-linked list, which also stores the total size of * objects inside of it. */ struct qlist_head { @@ -138,7 +138,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) local_irq_save(flags); /* - * As the object now gets freed from the quaratine, assume that its + * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE; diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 727ad4629173..082ee5b6d9a1 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -316,7 +316,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) * // rest of vmalloc process * STORE p, a LOAD shadow(x+99) * - * If there is no barrier between the end of unpoisioning the shadow + * If there is no barrier between the end of unpoisoning the shadow * and the store of the result to p, the stores could be committed * in a different order by CPU#0, and CPU#1 could erroneously observe * poison in the shadow. @@ -384,7 +384,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, * How does this work? * ------------------- * - * We have a region that is page aligned, labelled as A. + * We have a region that is page aligned, labeled as A. * That might not map onto the shadow in a way that is page-aligned: * * start end diff --git a/mm/kfence/report.c b/mm/kfence/report.c index e3f71451ad9e..2a319c21c939 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -263,6 +263,6 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r if (panic_on_warn) panic("panic_on_warn set ...\n"); - /* We encountered a memory unsafety error, taint the kernel! */ + /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ea74da3232ab..6c0185fdd815 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -667,7 +667,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * * The page table that maps the page has been already unlinked * from the page table tree and this process cannot get - * an additinal pin on the page. + * an additional pin on the page. * * New pins can come later if the page is shared across fork, * but not from this process. The other process cannot write to diff --git a/mm/ksm.c b/mm/ksm.c index b7cbcc7d4977..6bbe314c5260 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1065,7 +1065,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, /* * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make - * with the pagecount against the mapcount is racey and + * with the pagecount against the mapcount is racy and * O_DIRECT can happen right after the check. * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check @@ -1435,7 +1435,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, */ *_stable_node = found; /* - * Just for robustneess as stable_node is + * Just for robustness, as stable_node is * otherwise left as a stable pointer, the * compiler shall optimize it away at build * time. diff --git a/mm/madvise.c b/mm/madvise.c index 01fef79ac761..63e489e5bfdb 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -799,7 +799,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (end > vma->vm_end) { /* * Don't fail if end > vma->vm_end. If the old - * vma was splitted while the mmap_lock was + * vma was split while the mmap_lock was * released the effect of the concurrent * operation may not cause madvise() to * have an undefined result. There may be an @@ -1039,7 +1039,7 @@ process_madvise_behavior_valid(int behavior) * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * MADV_COLD - the application is not expected to use this memory soon, * deactivate pages in this range so that they can be reclaimed - * easily if memory pressure hanppens. + * easily if memory pressure happens. * MADV_PAGEOUT - the application is not expected to use this memory soon, * page out the pages in this range immediately. * diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3004afb6d090..64ada9e650a5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -215,7 +215,7 @@ enum res_type { #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -/* Used for OOM nofiier */ +/* Used for OOM notifier */ #define OOM_CONTROL (0) /* @@ -786,7 +786,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup * @idx: the event item - * @count: the number of events that occured + * @count: the number of events that occurred */ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) @@ -904,7 +904,7 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) rcu_read_lock(); do { /* - * Page cache insertions can happen withou an + * Page cache insertions can happen without an * actual mm context, e.g. during disk probing * on boot, loopback IO, acct() writes etc. */ @@ -1712,7 +1712,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) struct mem_cgroup *iter; /* - * Be careful about under_oom underflows becase a child memcg + * Be careful about under_oom underflows because a child memcg * could have been added after mem_cgroup_mark_under_oom. */ spin_lock(&memcg_oom_lock); @@ -1884,7 +1884,7 @@ bool mem_cgroup_oom_synchronize(bool handle) /* * There is no guarantee that an OOM-lock contender * sees the wakeups triggered by the OOM kill - * uncharges. Wake any sleepers explicitely. + * uncharges. Wake any sleepers explicitly. */ memcg_oom_recover(memcg); } @@ -4364,7 +4364,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, * Foreign dirty flushing * * There's an inherent mismatch between memcg and writeback. The former - * trackes ownership per-page while the latter per-inode. This was a + * tracks ownership per-page while the latter per-inode. This was a * deliberate design decision because honoring per-page ownership in the * writeback path is complicated, may lead to higher CPU and IO overheads * and deemed unnecessary given that write-sharing an inode across @@ -4379,9 +4379,9 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, * triggering background writeback. A will be slowed down without a way to * make writeback of the dirty pages happen. * - * Conditions like the above can lead to a cgroup getting repatedly and + * Conditions like the above can lead to a cgroup getting repeatedly and * severely throttled after making some progress after each - * dirty_expire_interval while the underyling IO device is almost + * dirty_expire_interval while the underlying IO device is almost * completely idle. * * Solving this problem completely requires matching the ownership tracking @@ -5774,7 +5774,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) return 0; /* - * We are now commited to this value whatever it is. Changes in this + * We are now committed to this value whatever it is. Changes in this * tunable will only affect upcoming migrations, not the current one. * So we need to save it, and keep it going. */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index bd3945446d47..85ad98c00fd9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -75,7 +75,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo if (dissolve_free_huge_page(page) || !take_page_off_buddy(page)) /* * We could fail to take off the target page from buddy - * for example due to racy page allocaiton, but that's + * for example due to racy page allocation, but that's * acceptable because soft-offlined page is not broken * and if someone really want to use it, they should * take it. diff --git a/mm/memory.c b/mm/memory.c index 8c491f813687..730daa00952b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3727,7 +3727,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) return ret; /* - * Archs like ppc64 need additonal space to store information + * Archs like ppc64 need additional space to store information * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { @@ -4503,7 +4503,7 @@ retry_pud: } /** - * mm_account_fault - Do page fault accountings + * mm_account_fault - Do page fault accounting * * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting * of perf event counters, but we'll still do the per-task accounting to @@ -4512,9 +4512,9 @@ retry_pud: * @flags: the fault flags. * @ret: the fault retcode. * - * This will take care of most of the page fault accountings. Meanwhile, it + * This will take care of most of the page fault accounting. Meanwhile, it * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter - * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should + * updates. However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should * still be in per-arch page fault handlers at the entry of page fault. */ static inline void mm_account_fault(struct pt_regs *regs, @@ -4848,7 +4848,7 @@ out: /** * generic_access_phys - generic implementation for iomem mmap access * @vma: the vma to access - * @addr: userspace addres, not relative offset within @vma + * @addr: userspace address, not relative offset within @vma * @buf: buffer to read/write * @len: length of transfer * @write: set to FOLL_WRITE when writing, otherwise reading diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3ebe2cfc64af..5690513c5668 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1867,7 +1867,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->v.nodes is intersect with node_states[N_MEMORY]. - * so if the following test faile, it implies + * so if the following test fails, it implies * policy->v.nodes has movable memory only. */ if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) @@ -2098,7 +2098,7 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) * * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default * policy. Otherwise, check for intersection between mask and the policy - * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' + * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local' * policy, always return true since it may allocate elsewhere on fallback. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. diff --git a/mm/migrate.c b/mm/migrate.c index 6b37d00890ca..b234c3f3acb7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2779,11 +2779,11 @@ restore: * * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus - * allowing the caller to allocate device memory for those unback virtual - * address. For this the caller simply has to allocate device memory and + * allowing the caller to allocate device memory for those unbacked virtual + * addresses. For this the caller simply has to allocate device memory and * properly set the destination entry like for regular migration. Note that - * this can still fails and thus inside the device driver must check if the - * migration was successful for those entries after calling migrate_vma_pages() + * this can still fail, and thus inside the device driver you must check if the + * migration was successful for those entries after calling migrate_vma_pages(), * just like for regular migration. * * After that, the callers must call migrate_vma_pages() to go over each entry diff --git a/mm/mmap.c b/mm/mmap.c index c1b848fa7da6..0584e540246e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -612,7 +612,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long nr_pages = 0; struct vm_area_struct *vma; - /* Find first overlaping mapping */ + /* Find first overlapping mapping */ vma = find_vma_intersection(mm, addr, end); if (!vma) return 0; @@ -2875,7 +2875,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas - * will remain splitted, but userland will get a + * will remain split, but userland will get a * highly unexpected error anyway. This is no * different than the case where the first of the two * __split_vma fails, but we don't undo the first diff --git a/mm/mprotect.c b/mm/mprotect.c index 94188df1ee55..e7a443157988 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -699,7 +699,7 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) mmap_write_unlock(current->mm); /* - * We could provie warnings or errors if any VMA still + * We could provide warnings or errors if any VMA still * has the pkey set here. */ return ret; diff --git a/mm/mremap.c b/mm/mremap.c index d22629ff8f3c..47c255b60150 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -730,7 +730,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, * So, to avoid such scenario we can pre-compute if the whole * operation has high chances to success map-wise. * Worst-scenario case is when both vma's (new_addr and old_addr) get - * split in 3 before unmaping it. + * split in 3 before unmapping it. * That means 2 more maps (1 for each) to the ones we already hold. * Check whether current map count plus 2 still leads us to 4 maps below * the threshold, otherwise return -ENOMEM here to be more safe. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3df2ac6b8686..eefd3f5fde46 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -74,7 +74,7 @@ static inline bool is_memcg_oom(struct oom_control *oc) #ifdef CONFIG_NUMA /** - * oom_cpuset_eligible() - check task eligiblity for kill + * oom_cpuset_eligible() - check task eligibility for kill * @start: task struct of which task to consider * @oc: pointer to struct oom_control * diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5e761fb62800..0062d5c57d41 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1806,7 +1806,7 @@ pause: break; /* - * In the case of an unresponding NFS server and the NFS dirty + * In the case of an unresponsive NFS server and the NFS dirty * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * @@ -2216,7 +2216,7 @@ int write_cache_pages(struct address_space *mapping, * Page truncated or invalidated. We can freely skip it * then, even for data integrity operations: the page * has disappeared concurrently, so there could be no - * real expectation of this data interity operation + * real expectation of this data integrity operation * even if there is now a new, dirty page at the same * pagecache address. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bcdc0c6f21f1..0582c85da08c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -893,7 +893,7 @@ compaction_capture(struct capture_control *capc, struct page *page, return false; /* - * Do not let lower order allocations polluate a movable pageblock. + * Do not let lower order allocations pollute a movable pageblock. * This might let an unmovable request use a reclaimable pageblock * and vice-versa but no more than normal fallback logic which can * have trouble finding a high-order free page. @@ -2776,7 +2776,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, /* * In page freeing path, migratetype change is racy so * we can counter several free pages in a pageblock - * in this loop althoug we changed the pageblock type + * in this loop although we changed the pageblock type * from highatomic to ac->migratetype. So we should * adjust the count once. */ @@ -3080,7 +3080,7 @@ static void drain_local_pages_wq(struct work_struct *work) * drain_all_pages doesn't use proper cpu hotplug protection so * we can race with cpu offline when the WQ can move this from * a cpu pinned worker to an unbound one. We can operate on a different - * cpu which is allright but we also have to make sure to not move to + * cpu which is alright but we also have to make sure to not move to * a different one. */ preempt_disable(); @@ -5929,7 +5929,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) static int __parse_numa_zonelist_order(char *s) { /* - * We used to support different zonlists modes but they turned + * We used to support different zonelists modes but they turned * out to be just not useful. Let's keep the warning in place * if somebody still use the cmd line parameter so that we do * not fail it silently @@ -7670,7 +7670,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) } /* - * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For + * Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For * such cases we allow max_zone_pfn sorted in the descending order */ bool __weak arch_has_descending_max_zone_pfns(void) @@ -8728,7 +8728,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate - * @migratetype: migratetype of the underlaying pageblocks (either + * @migratetype: migratetype of the underlying pageblocks (either * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. @@ -8988,7 +8988,7 @@ EXPORT_SYMBOL(free_contig_range); /* * The zone indicated has a new number of managed_pages; batch sizes and percpu - * page high values need to be recalulated. + * page high values need to be recalculated. */ void __meminit zone_pcp_update(struct zone *zone) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 9661d5320a07..adfabb560eb9 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -233,7 +233,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) /* * We don't clear the bit on the oldpage as it's going to be freed * after migration. Until then, the info can be useful in case of - * a bug, and the overal stats will be off a bit only temporarily. + * a bug, and the overall stats will be off a bit only temporarily. * Also, migrate_misplaced_transhuge_page() can still fail the * migration and then we want the oldpage to retain the info. But * in that case we also don't need to explicitly clear the info from diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 095d7eaa0db4..ae26b118e246 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -170,7 +170,7 @@ struct percpu_stats { u64 nr_max_alloc; /* max # of live allocations */ u32 nr_chunks; /* current # of live chunks */ u32 nr_max_chunks; /* max # of live chunks */ - size_t min_alloc_size; /* min allocaiton size */ + size_t min_alloc_size; /* min allocation size */ size_t max_alloc_size; /* max allocation size */ }; diff --git a/mm/percpu.c b/mm/percpu.c index 23308113a5ff..f99e9306b939 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1862,7 +1862,7 @@ fail: pr_info("limit reached, disable warning\n"); } if (is_atomic) { - /* see the flag handling in pcpu_blance_workfn() */ + /* see the flag handling in pcpu_balance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); } else { diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h index 1dcc865029a2..e9e879de8649 100644 --- a/mm/pgalloc-track.h +++ b/mm/pgalloc-track.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_PGALLLC_TRACK_H -#define _LINUX_PGALLLC_TRACK_H +#ifndef _LINUX_PGALLOC_TRACK_H +#define _LINUX_PGALLOC_TRACK_H #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, @@ -48,4 +48,4 @@ static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ NULL: pte_offset_kernel(pmd, address)) -#endif /* _LINUX_PGALLLC_TRACK_H */ +#endif /* _LINUX_PGALLOC_TRACK_H */ diff --git a/mm/slab.c b/mm/slab.c index d56607a80fa6..d0f725637663 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -259,7 +259,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define BATCHREFILL_LIMIT 16 /* - * Optimization question: fewer reaps means less probability for unnessary + * Optimization question: fewer reaps means less probability for unnecessary * cpucache drain/refill cycles. * * OTOH the cpuarrays can contain lots of objects, @@ -2381,8 +2381,8 @@ union freelist_init_state { }; /* - * Initialize the state based on the randomization methode available. - * return true if the pre-computed list is available, false otherwize. + * Initialize the state based on the randomization method available. + * return true if the pre-computed list is available, false otherwise. */ static bool freelist_state_initialize(union freelist_init_state *state, struct kmem_cache *cachep, diff --git a/mm/slub.c b/mm/slub.c index 68123b21e65f..feda53ae62ba 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3391,7 +3391,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); */ /* - * Mininum / Maximum order of slab pages. This influences locking overhead + * Minimum / Maximum order of slab pages. This influences locking overhead * and slab fragmentation. A higher order reduces the number of partial slabs * and increases the number of allocations possible without having to * take the list_lock. diff --git a/mm/swap_slots.c b/mm/swap_slots.c index be9de6d5b516..6248d1030a9b 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -16,7 +16,7 @@ * to local caches without needing to acquire swap_info * lock. We do not reuse the returned slots directly but * move them back to the global pool in a batch. This - * allows the slots to coaellesce and reduce fragmentation. + * allows the slots to coalesce and reduce fragmentation. * * The swap entry allocated is marked with SWAP_HAS_CACHE * flag in map_count that prevents it from being allocated diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a7f318c9e426..a13ac524f6ff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1583,7 +1583,7 @@ static unsigned long lazy_max_pages(void) static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* - * Serialize vmap purging. There is no actual criticial section protected + * Serialize vmap purging. There is no actual critical section protected * by this look, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. */ @@ -2628,7 +2628,7 @@ static void __vfree(const void *addr) * May sleep if called *not* from interrupt context. * Must not be called in NMI context (strictly speaking, it could be * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling - * conventions for vfree() arch-depenedent would be a really bad idea). + * conventions for vfree() arch-dependent would be a really bad idea). */ void vfree(const void *addr) { @@ -3141,7 +3141,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) /* * To do safe access to this _mapped_ area, we need * lock. But adding lock here means that we need to add - * overhead of vmalloc()/vfree() calles for this _debug_ + * overhead of vmalloc()/vfree() calls for this _debug_ * interface, rarely used. Instead of that, we'll use * kmap() and get small overhead in this access function. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 5ba118521ded..cccee36b289c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -934,7 +934,7 @@ void cpu_vm_stats_fold(int cpu) /* * this is only called if !populated_zone(zone), which implies no other users of - * pset->vm_stat_diff[] exsist. + * pset->vm_stat_diff[] exist. */ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { diff --git a/mm/zpool.c b/mm/zpool.c index 5ed71207ced7..6d9ed48141e5 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -336,7 +336,7 @@ int zpool_shrink(struct zpool *zpool, unsigned int pages, * This may hold locks, disable interrupts, and/or preemption, * and the zpool_unmap_handle() must be called to undo those * actions. The code that uses the mapped handle should complete - * its operatons on the mapped handle memory quickly and unmap + * its operations on the mapped handle memory quickly and unmap * as soon as possible. As the implementation may use per-cpu * data, multiple handles should not be mapped concurrently on * any cpu. diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5004c176b045..19b563bc6c48 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1227,7 +1227,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages); * zs_map_object - get address of allocated object from handle. * @pool: pool from which the object was allocated * @handle: handle returned from zs_malloc - * @mm: maping mode to use + * @mm: mapping mode to use * * Before using an object allocated from zs_malloc, it must be mapped using * this function. When done with the object, it must be unmapped using From baf2f90ba416cd887d7f54cc877d8764f6775de2 Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Thu, 6 May 2021 18:06:50 -0700 Subject: [PATCH 1354/1379] mm: fix typos in comments succed -> succeed in mm/hugetlb.c wil -> will in mm/mempolicy.c wit -> with in mm/page_alloc.c Retruns -> Returns in mm/page_vma_mapped.c confict -> conflict in mm/secretmem.c No functionality changed. Link: https://lkml.kernel.org/r/20210408140027.60623-1-lujialin4@huawei.com Signed-off-by: Lu Jialin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- mm/page_alloc.c | 2 +- mm/page_vma_mapped.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5690513c5668..d79fa299b70c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -994,7 +994,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, lookup_node() - * wil drop the mmap_lock, so after calling + * will drop the mmap_lock, so after calling * lookup_node() only "pol" remains valid, "vma" * is stale. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0582c85da08c..aaa1655cf682 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4173,7 +4173,7 @@ out: } /* - * Maximum number of compaction retries wit a progress before OOM + * Maximum number of compaction retries with a progress before OOM * killer is consider as the only way to move forward. */ #define MAX_COMPACT_RETRIES 16 diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 86e3a3688d59..2cf01d933f13 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -134,7 +134,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) * regardless of which page table level the page is mapped at. @pvmw->pmd is * NULL. * - * Retruns false if there are no more page table entries for the page in + * Returns false if there are no more page table entries for the page in * the vma. @pvmw->ptl is unlocked and @pvmw->pte is unmapped. * * If you need to stop the walk before page_vma_mapped_walk() returned false, From a54754ec9891830ba548e2010c889e3c8146e449 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 May 2021 05:53:23 -0700 Subject: [PATCH 1355/1379] netfilter: nftables: avoid overflows in nft_hash_buckets() Number of buckets being stored in 32bit variables, we have to ensure that no overflows occur in nft_hash_buckets() syzbot injected a size == 0x40000000 and reported: UBSAN: shift-out-of-bounds in ./include/linux/log2.h:57:13 shift exponent 64 is too large for 64-bit type 'long unsigned int' CPU: 1 PID: 29539 Comm: syz-executor.4 Not tainted 5.12.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x141/0x1d7 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:327 __roundup_pow_of_two include/linux/log2.h:57 [inline] nft_hash_buckets net/netfilter/nft_set_hash.c:411 [inline] nft_hash_estimate.cold+0x19/0x1e net/netfilter/nft_set_hash.c:652 nft_select_set_ops net/netfilter/nf_tables_api.c:3586 [inline] nf_tables_newset+0xe62/0x3110 net/netfilter/nf_tables_api.c:4322 nfnetlink_rcv_batch+0xa09/0x24b0 net/netfilter/nfnetlink.c:488 nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:612 [inline] nfnetlink_rcv+0x3af/0x420 net/netfilter/nfnetlink.c:630 netlink_unicast_kernel net/netlink/af_netlink.c:1312 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1338 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1927 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 Fixes: 0ed6389c483d ("netfilter: nf_tables: rename set implementations") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_hash.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 58f576abcd4a..328f2ce32e4c 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -412,9 +412,17 @@ static void nft_rhash_destroy(const struct nft_set *set) (void *)set); } +/* Number of buckets is stored in u32, so cap our result to 1U<<31 */ +#define NFT_MAX_BUCKETS (1U << 31) + static u32 nft_hash_buckets(u32 size) { - return roundup_pow_of_two(size * 4 / 3); + u64 val = div_u64((u64)size * 4, 3); + + if (val >= NFT_MAX_BUCKETS) + return NFT_MAX_BUCKETS; + + return roundup_pow_of_two(val); } static bool nft_rhash_estimate(const struct nft_set_desc *desc, u32 features, From 6c8774a94e6ad26f29ef103c8671f55c255c6201 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 May 2021 05:53:50 -0700 Subject: [PATCH 1356/1379] netfilter: nftables: avoid potential overflows on 32bit arches User space could ask for very large hash tables, we need to make sure our size computations wont overflow. nf_tables_newset() needs to double check the u64 size will fit into size_t field. Fixes: 0ed6389c483d ("netfilter: nf_tables: rename set implementations") Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 7 +++++-- net/netfilter/nft_set_hash.c | 10 +++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 926da6ed8d51..d63d2d8f769c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4184,6 +4184,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, unsigned char *udata; struct nft_set *set; struct nft_ctx ctx; + size_t alloc_size; u64 timeout; char *name; int err, i; @@ -4329,8 +4330,10 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, size = 0; if (ops->privsize != NULL) size = ops->privsize(nla, &desc); - - set = kvzalloc(sizeof(*set) + size + udlen, GFP_KERNEL); + alloc_size = sizeof(*set) + size + udlen; + if (alloc_size < size) + return -ENOMEM; + set = kvzalloc(alloc_size, GFP_KERNEL); if (!set) return -ENOMEM; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 328f2ce32e4c..7b3d0a78c569 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -623,7 +623,7 @@ static u64 nft_hash_privsize(const struct nlattr * const nla[], const struct nft_set_desc *desc) { return sizeof(struct nft_hash) + - nft_hash_buckets(desc->size) * sizeof(struct hlist_head); + (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head); } static int nft_hash_init(const struct nft_set *set, @@ -663,8 +663,8 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features, return false; est->size = sizeof(struct nft_hash) + - nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + - desc->size * sizeof(struct nft_hash_elem); + (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + + (u64)desc->size * sizeof(struct nft_hash_elem); est->lookup = NFT_SET_CLASS_O_1; est->space = NFT_SET_CLASS_O_N; @@ -681,8 +681,8 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features return false; est->size = sizeof(struct nft_hash) + - nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + - desc->size * sizeof(struct nft_hash_elem); + (u64)nft_hash_buckets(desc->size) * sizeof(struct hlist_head) + + (u64)desc->size * sizeof(struct nft_hash_elem); est->lookup = NFT_SET_CLASS_O_1; est->space = NFT_SET_CLASS_O_N; From ae4393dfd472b194c90d75d2123105fb5ed59b04 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 26 Apr 2021 13:14:01 +0200 Subject: [PATCH 1357/1379] i40e: fix broken XDP support Commit 12738ac4754e ("i40e: Fix sparse errors in i40e_txrx.c") broke XDP support in the i40e driver. That commit was fixing a sparse error in the code by introducing a new variable xdp_res instead of overloading this into the skb pointer. The problem is that the code later uses the skb pointer in if statements and these where not extended to also test for the new xdp_res variable. Fix this by adding the correct tests for xdp_res in these places. The skb pointer was used to store the result of the XDP program by overloading the results in the error pointer ERR_PTR(-result). Therefore, the allocation failure test that used to only test for !skb now need to be extended to also consider !xdp_res. i40e_cleanup_headers() had a check that based on the skb value being an error pointer, i.e. a result from the XDP program != XDP_PASS, and if so start to process a new packet immediately, instead of populating skb fields and sending the skb to the stack. This check is not needed anymore, since we have added an explicit test for xdp_res being set and if so just do continue to pick the next packet from the NIC. Fixes: 12738ac4754e ("i40e: Fix sparse errors in i40e_txrx.c") Acked-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Reported-by: Jesper Dangaard Brouer Reviewed-by: Maciej Fijalkowski Signed-off-by: Magnus Karlsson Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 121cd99fdeff..de70c16ef619 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1961,10 +1961,6 @@ static bool i40e_cleanup_headers(struct i40e_ring *rx_ring, struct sk_buff *skb, union i40e_rx_desc *rx_desc) { - /* XDP packets use error pointer so abort at this point */ - if (IS_ERR(skb)) - return true; - /* ERR_MASK will only have valid bits if EOP set, and * what we are doing here is actually checking * I40E_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in @@ -2534,7 +2530,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) } /* exit if we failed to retrieve a buffer */ - if (!skb) { + if (!xdp_res && !skb) { rx_ring->rx_stats.alloc_buff_failed++; rx_buffer->pagecnt_bias++; break; @@ -2547,7 +2543,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) if (i40e_is_non_eop(rx_ring, rx_desc)) continue; - if (i40e_cleanup_headers(rx_ring, skb, rx_desc)) { + if (xdp_res || i40e_cleanup_headers(rx_ring, skb, rx_desc)) { skb = NULL; continue; } From 38318f23a7ef86a8b1862e5e8078c4de121960c3 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Mon, 12 Apr 2021 22:41:18 +0800 Subject: [PATCH 1358/1379] i40e: Fix use-after-free in i40e_client_subtask() Currently the call to i40e_client_del_instance frees the object pf->cinst, however pf->cinst->lan_info is being accessed after the free. Fix this by adding the missing return. Addresses-Coverity: ("Read from pointer after free") Fixes: 7b0b1a6d0ac9 ("i40e: Disable iWARP VSI PETCP_ENA flag on netdev down events") Signed-off-by: Yunjian Wang Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_client.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c b/drivers/net/ethernet/intel/i40e/i40e_client.c index a2dba32383f6..32f3facbed1a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_client.c +++ b/drivers/net/ethernet/intel/i40e/i40e_client.c @@ -375,6 +375,7 @@ void i40e_client_subtask(struct i40e_pf *pf) clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state); i40e_client_del_instance(pf); + return; } } } From 61343e6da7810de81d6b826698946ae4f9070819 Mon Sep 17 00:00:00 2001 From: Jaroslaw Gawin Date: Tue, 13 Apr 2021 14:19:40 +0000 Subject: [PATCH 1359/1379] i40e: fix the restart auto-negotiation after FEC modified When FEC mode was changed the link didn't know it because the link was not reset and new parameters were not negotiated. Set a flag 'I40E_AQ_PHY_ENABLE_ATOMIC_LINK' in 'abilities' to restart the link and make it run with the new settings. Fixes: 1d96340196f1 ("i40e: Add support FEC configuration for Fortville 25G") Signed-off-by: Jaroslaw Gawin Signed-off-by: Mateusz Palczewski Tested-by: Dave Switzer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 040a01400b85..53416787fb7b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1409,7 +1409,8 @@ static int i40e_set_fec_cfg(struct net_device *netdev, u8 fec_cfg) memset(&config, 0, sizeof(config)); config.phy_type = abilities.phy_type; - config.abilities = abilities.abilities; + config.abilities = abilities.abilities | + I40E_AQ_PHY_ENABLE_ATOMIC_LINK; config.phy_type_ext = abilities.phy_type_ext; config.link_speed = abilities.link_speed; config.eee_capability = abilities.eee_capability; From 15395ec4685bd45a43d1b54b8fd9846b87e2c621 Mon Sep 17 00:00:00 2001 From: Mateusz Palczewski Date: Tue, 13 Apr 2021 14:43:07 +0000 Subject: [PATCH 1360/1379] i40e: Fix PHY type identifiers for 2.5G and 5G adapters Unlike other supported adapters, 2.5G and 5G use different PHY type identifiers for reading/writing PHY settings and for reading link status. This commit introduces separate PHY identifiers for these two operation types. Fixes: 2e45d3f4677a ("i40e: Add support for X710 B/P & SFP+ cards") Signed-off-by: Dawid Lukwinski Signed-off-by: Mateusz Palczewski Reviewed-by: Aleksandr Loktionov Tested-by: Dave Switzer Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h | 6 ++++-- drivers/net/ethernet/intel/i40e/i40e_common.c | 4 ++-- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 4 ++-- drivers/net/ethernet/intel/i40e/i40e_type.h | 7 ++----- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index ce626eace692..140b677f114d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -1566,8 +1566,10 @@ enum i40e_aq_phy_type { I40E_PHY_TYPE_25GBASE_LR = 0x22, I40E_PHY_TYPE_25GBASE_AOC = 0x23, I40E_PHY_TYPE_25GBASE_ACC = 0x24, - I40E_PHY_TYPE_2_5GBASE_T = 0x30, - I40E_PHY_TYPE_5GBASE_T = 0x31, + I40E_PHY_TYPE_2_5GBASE_T = 0x26, + I40E_PHY_TYPE_5GBASE_T = 0x27, + I40E_PHY_TYPE_2_5GBASE_T_LINK_STATUS = 0x30, + I40E_PHY_TYPE_5GBASE_T_LINK_STATUS = 0x31, I40E_PHY_TYPE_MAX, I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP = 0xFD, I40E_PHY_TYPE_EMPTY = 0xFE, diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 41b813fe07a5..67cb0b47416a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1154,8 +1154,8 @@ static enum i40e_media_type i40e_get_media_type(struct i40e_hw *hw) break; case I40E_PHY_TYPE_100BASE_TX: case I40E_PHY_TYPE_1000BASE_T: - case I40E_PHY_TYPE_2_5GBASE_T: - case I40E_PHY_TYPE_5GBASE_T: + case I40E_PHY_TYPE_2_5GBASE_T_LINK_STATUS: + case I40E_PHY_TYPE_5GBASE_T_LINK_STATUS: case I40E_PHY_TYPE_10GBASE_T: media = I40E_MEDIA_TYPE_BASET; break; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 53416787fb7b..bd527eab002b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -841,8 +841,8 @@ static void i40e_get_settings_link_up(struct i40e_hw *hw, 10000baseT_Full); break; case I40E_PHY_TYPE_10GBASE_T: - case I40E_PHY_TYPE_5GBASE_T: - case I40E_PHY_TYPE_2_5GBASE_T: + case I40E_PHY_TYPE_5GBASE_T_LINK_STATUS: + case I40E_PHY_TYPE_2_5GBASE_T_LINK_STATUS: case I40E_PHY_TYPE_1000BASE_T: case I40E_PHY_TYPE_100BASE_TX: ethtool_link_ksettings_add_link_mode(ks, supported, Autoneg); diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index 5c10faaca790..c81109a63e90 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -239,11 +239,8 @@ struct i40e_phy_info { #define I40E_CAP_PHY_TYPE_25GBASE_ACC BIT_ULL(I40E_PHY_TYPE_25GBASE_ACC + \ I40E_PHY_TYPE_OFFSET) /* Offset for 2.5G/5G PHY Types value to bit number conversion */ -#define I40E_PHY_TYPE_OFFSET2 (-10) -#define I40E_CAP_PHY_TYPE_2_5GBASE_T BIT_ULL(I40E_PHY_TYPE_2_5GBASE_T + \ - I40E_PHY_TYPE_OFFSET2) -#define I40E_CAP_PHY_TYPE_5GBASE_T BIT_ULL(I40E_PHY_TYPE_5GBASE_T + \ - I40E_PHY_TYPE_OFFSET2) +#define I40E_CAP_PHY_TYPE_2_5GBASE_T BIT_ULL(I40E_PHY_TYPE_2_5GBASE_T) +#define I40E_CAP_PHY_TYPE_5GBASE_T BIT_ULL(I40E_PHY_TYPE_5GBASE_T) #define I40E_HW_CAP_MAX_GPIO 30 /* Capabilities of a PF or a VF or the whole device */ struct i40e_hw_capabilities { From 8085a36db71f54d2592426eb76bdf71b82479140 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Fri, 16 Apr 2021 23:43:57 +0200 Subject: [PATCH 1361/1379] i40e: Remove LLDP frame filters Remove filters from being setup in case of software DCB and allow the LLDP frames to be properly transmitted to the wire. It is not possible to transmit the LLDP frame out of the port, if they are filtered by control VSI. This prohibits software LLDP agent properly communicate its DCB capabilities to the neighbors. Fixes: 4b208eaa8078 ("i40e: Add init and default config of software based DCB") Signed-off-by: Arkadiusz Kubalewski Tested-by: Imam Hassan Reza Biswas Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 - .../net/ethernet/intel/i40e/i40e_ethtool.c | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c | 42 ------------------- 3 files changed, 44 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 9067cd3ce243..85d3dd3a3339 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -1144,7 +1144,6 @@ static inline bool i40e_is_sw_dcb(struct i40e_pf *pf) return !!(pf->flags & I40E_FLAG_DISABLE_FW_LLDP); } -void i40e_set_lldp_forwarding(struct i40e_pf *pf, bool enable); #ifdef CONFIG_I40E_DCB void i40e_dcbnl_flush_apps(struct i40e_pf *pf, struct i40e_dcbx_config *old_cfg, diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index bd527eab002b..ccd5b9486ea9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -5282,7 +5282,6 @@ flags_complete: i40e_aq_cfg_lldp_mib_change_event(&pf->hw, false, NULL); i40e_aq_stop_lldp(&pf->hw, true, false, NULL); } else { - i40e_set_lldp_forwarding(pf, false); status = i40e_aq_start_lldp(&pf->hw, false, NULL); if (status) { adq_err = pf->hw.aq.asq_last_status; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index c2d145a56b5e..704e474879c5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -6879,40 +6879,6 @@ out: } #endif /* CONFIG_I40E_DCB */ -/** - * i40e_set_lldp_forwarding - set forwarding of lldp frames - * @pf: PF being configured - * @enable: if forwarding to OS shall be enabled - * - * Toggle forwarding of lldp frames behavior, - * When passing DCB control from firmware to software - * lldp frames must be forwarded to the software based - * lldp agent. - */ -void i40e_set_lldp_forwarding(struct i40e_pf *pf, bool enable) -{ - if (pf->lan_vsi == I40E_NO_VSI) - return; - - if (!pf->vsi[pf->lan_vsi]) - return; - - /* No need to check the outcome, commands may fail - * if desired value is already set - */ - i40e_aq_add_rem_control_packet_filter(&pf->hw, NULL, ETH_P_LLDP, - I40E_AQC_ADD_CONTROL_PACKET_FLAGS_TX | - I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC, - pf->vsi[pf->lan_vsi]->seid, 0, - enable, NULL, NULL); - - i40e_aq_add_rem_control_packet_filter(&pf->hw, NULL, ETH_P_LLDP, - I40E_AQC_ADD_CONTROL_PACKET_FLAGS_RX | - I40E_AQC_ADD_CONTROL_PACKET_FLAGS_IGNORE_MAC, - pf->vsi[pf->lan_vsi]->seid, 0, - enable, NULL, NULL); -} - /** * i40e_print_link_message - print link up or down * @vsi: the VSI for which link needs a message @@ -10736,10 +10702,6 @@ static void i40e_rebuild(struct i40e_pf *pf, bool reinit, bool lock_acquired) */ i40e_add_filter_to_drop_tx_flow_control_frames(&pf->hw, pf->main_vsi_seid); -#ifdef CONFIG_I40E_DCB - if (pf->flags & I40E_FLAG_DISABLE_FW_LLDP) - i40e_set_lldp_forwarding(pf, true); -#endif /* CONFIG_I40E_DCB */ /* restart the VSIs that were rebuilt and running before the reset */ i40e_pf_unquiesce_all_vsi(pf); @@ -15772,10 +15734,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) */ i40e_add_filter_to_drop_tx_flow_control_frames(&pf->hw, pf->main_vsi_seid); -#ifdef CONFIG_I40E_DCB - if (pf->flags & I40E_FLAG_DISABLE_FW_LLDP) - i40e_set_lldp_forwarding(pf, true); -#endif /* CONFIG_I40E_DCB */ if ((pf->hw.device_id == I40E_DEV_ID_10G_BASE_T) || (pf->hw.device_id == I40E_DEV_ID_10G_BASE_T4)) From 578c18eff1627d6a911f08f4cf351eca41fdcc7d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 6 May 2021 17:16:38 -0700 Subject: [PATCH 1362/1379] mptcp: fix splat when closing unaccepted socket If userspace exits before calling accept() on a listener that had at least one new connection ready, we get: Attempt to release TCP socket in state 8 This happens because the mptcp socket gets cloned when the TCP connection is ready, but the socket is never exposed to userspace. The client additionally sends a DATA_FIN, which brings connection into CLOSE_WAIT state. This in turn prevents the orphan+state reset fixup in mptcp_sock_destruct() from doing its job. Fixes: 3721b9b64676b ("mptcp: Track received DATA_FIN sequence number and add related helpers") Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/185 Tested-by: Florian Westphal Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20210507001638.225468-1-mathew.j.martineau@linux.intel.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 82e91b00ad39..a5ede357cfbc 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -546,8 +546,7 @@ static void mptcp_sock_destruct(struct sock *sk) * ESTABLISHED state and will not have the SOCK_DEAD flag. * Both result in warnings from inet_sock_destruct. */ - - if (sk->sk_state == TCP_ESTABLISHED) { + if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { sk->sk_state = TCP_CLOSE; WARN_ON_ONCE(sk->sk_socket); sock_orphan(sk); From 8a7cb245cf28cb3e541e0d6c8624b95d079e155b Mon Sep 17 00:00:00 2001 From: Yannick Vignon Date: Thu, 6 May 2021 16:33:12 +0200 Subject: [PATCH 1363/1379] net: stmmac: Do not enable RX FIFO overflow interrupts The RX FIFO overflows when the system is not able to process all received packets and they start accumulating (first in the DMA queue in memory, then in the FIFO). An interrupt is then raised for each overflowing packet and handled in stmmac_interrupt(). This is counter-productive, since it brings the system (or more likely, one CPU core) to its knees to process the FIFO overflow interrupts. stmmac_interrupt() handles overflow interrupts by writing the rx tail ptr into the corresponding hardware register (according to the MAC spec, this has the effect of restarting the MAC DMA). However, without freeing any rx descriptors, the DMA stops right away, and another overflow interrupt is raised as the FIFO overflows again. Since the DMA is already restarted at the end of stmmac_rx_refill() after freeing descriptors, disabling FIFO overflow interrupts and the corresponding handling code has no side effect, and eliminates the interrupt storm when the RX FIFO overflows. Signed-off-by: Yannick Vignon Link: https://lore.kernel.org/r/20210506143312.20784-1-yannick.vignon@oss.nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c | 7 +------ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 14 ++------------ 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index a602d16b9e53..5be8e6a631d9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -232,7 +232,7 @@ static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode, u32 channel, int fifosz, u8 qmode) { unsigned int rqs = fifosz / 256 - 1; - u32 mtl_rx_op, mtl_rx_int; + u32 mtl_rx_op; mtl_rx_op = readl(ioaddr + MTL_CHAN_RX_OP_MODE(channel)); @@ -293,11 +293,6 @@ static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode, } writel(mtl_rx_op, ioaddr + MTL_CHAN_RX_OP_MODE(channel)); - - /* Enable MTL RX overflow */ - mtl_rx_int = readl(ioaddr + MTL_CHAN_INT_CTRL(channel)); - writel(mtl_rx_int | MTL_RX_OVERFLOW_INT_EN, - ioaddr + MTL_CHAN_INT_CTRL(channel)); } static void dwmac4_dma_tx_chan_op_mode(void __iomem *ioaddr, int mode, diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index e0b7eebcb512..345b4c6d1fd4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5587,7 +5587,6 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv) /* To handle GMAC own interrupts */ if ((priv->plat->has_gmac) || xmac) { int status = stmmac_host_irq_status(priv, priv->hw, &priv->xstats); - int mtl_status; if (unlikely(status)) { /* For LPI we need to save the tx status */ @@ -5598,17 +5597,8 @@ static void stmmac_common_interrupt(struct stmmac_priv *priv) } for (queue = 0; queue < queues_count; queue++) { - struct stmmac_rx_queue *rx_q = &priv->rx_queue[queue]; - - mtl_status = stmmac_host_mtl_irq_status(priv, priv->hw, - queue); - if (mtl_status != -EINVAL) - status |= mtl_status; - - if (status & CORE_IRQ_MTL_RX_OVERFLOW) - stmmac_set_rx_tail_ptr(priv, priv->ioaddr, - rx_q->rx_tail_addr, - queue); + status = stmmac_host_mtl_irq_status(priv, priv->hw, + queue); } /* PCS link status */ From 7d18dbddb727f8268140ab76d3954b974a21657c Mon Sep 17 00:00:00 2001 From: Wei Ming Chen Date: Fri, 7 May 2021 20:38:43 +0800 Subject: [PATCH 1364/1379] atm: firestream: Use fallthrough pseudo-keyword Add pseudo-keyword macro fallthrough[1] [1] https://www.kernel.org/doc/html/latest/process/deprecated.html?highlight=fallthrough#implicit-switch-case-fall-through Signed-off-by: Wei Ming Chen Link: https://lore.kernel.org/r/20210507123843.10602-1-jj251510319013@gmail.com Signed-off-by: Jakub Kicinski --- drivers/atm/firestream.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 0ddd611b4277..3bc3c314a467 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -795,6 +795,7 @@ static void process_incoming (struct fs_dev *dev, struct queue *q) switch (STATUS_CODE (qe)) { case 0x1: /* Fall through for streaming mode */ + fallthrough; case 0x2:/* Packet received OK.... */ if (atm_vcc) { skb = pe->skb; From 0ab1438bad43d95877f848b7df551bd431680270 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 6 May 2021 02:45:15 +0900 Subject: [PATCH 1365/1379] linux/kconfig.h: replace IF_ENABLED() with PTR_IF() in is included from all the kernel-space source files, including C, assembly, linker scripts. It is intended to contain a minimal set of macros to evaluate CONFIG options. IF_ENABLED() is an intruder here because (x ? y : z) is C code, which should not be included from assembly files or linker scripts. Also, is no longer self-contained because NULL is defined in . Move IF_ENABLED() out to as PTR_IF(). PTF_IF() takes the general boolean expression instead of a CONFIG option so that it fits better in . Signed-off-by: Masahiro Yamada Reviewed-by: Kees Cook --- drivers/pinctrl/pinctrl-ingenic.c | 3 +++ include/linux/kconfig.h | 6 ------ include/linux/kernel.h | 2 ++ 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/pinctrl-ingenic.c b/drivers/pinctrl/pinctrl-ingenic.c index 651a36b9dcc0..983ba9865f77 100644 --- a/drivers/pinctrl/pinctrl-ingenic.c +++ b/drivers/pinctrl/pinctrl-ingenic.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -3854,6 +3855,8 @@ static int __init ingenic_pinctrl_probe(struct platform_device *pdev) return 0; } +#define IF_ENABLED(cfg, ptr) PTR_IF(IS_ENABLED(cfg), (ptr)) + static const struct of_device_id ingenic_pinctrl_of_match[] = { { .compatible = "ingenic,jz4730-pinctrl", diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h index 24a59cb06963..cc8fa109cfa3 100644 --- a/include/linux/kconfig.h +++ b/include/linux/kconfig.h @@ -70,10 +70,4 @@ */ #define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option)) -/* - * IF_ENABLED(CONFIG_FOO, ptr) evaluates to (ptr) if CONFIG_FOO is set to 'y' - * or 'm', NULL otherwise. - */ -#define IF_ENABLED(option, ptr) (IS_ENABLED(option) ? (ptr) : NULL) - #endif /* __LINUX_KCONFIG_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5b7ed6dc99ac..2f9d15410c93 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -48,6 +48,8 @@ */ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) +#define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) + #define u64_to_user_ptr(x) ( \ { \ typecheck(u64, (x)); \ From 679971e7213174efb56abc8fab1299d0a88db0e8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 7 May 2021 18:24:11 -0500 Subject: [PATCH 1366/1379] smb3: when mounting with multichannel include it in requested capabilities In the SMB3/SMB3.1.1 negotiate protocol request, we are supposed to advertise CAP_MULTICHANNEL capability when establishing multiple channels has been requested by the user doing the mount. See MS-SMB2 sections 2.2.3 and 3.2.5.2 Without setting it there is some risk that multichannel could fail if the server interpreted the field strictly. Reviewed-By: Tom Talpey Reviewed-by: Shyam Prasad N Cc: # v5.8+ Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index e36c2a867783..a8bf43184773 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -841,6 +841,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->SecurityMode = 0; req->Capabilities = cpu_to_le32(server->vals->req_capabilities); + if (ses->chan_max > 1) + req->Capabilities |= cpu_to_le32(SMB2_GLOBAL_CAP_MULTI_CHANNEL); /* ClientGUID must be zero for SMB2.02 dialect */ if (server->vals->protocol_id == SMB20_PROT_ID) @@ -1032,6 +1034,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Capabilities = cpu_to_le32(server->vals->req_capabilities); + if (tcon->ses->chan_max > 1) + pneg_inbuf->Capabilities |= cpu_to_le32(SMB2_GLOBAL_CAP_MULTI_CHANNEL); + memcpy(pneg_inbuf->Guid, server->client_guid, SMB2_CLIENT_GUID_SIZE); From 9c2dc11df50d1c8537075ff6b98472198e24438e Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 7 May 2021 20:00:41 -0500 Subject: [PATCH 1367/1379] smb3: do not attempt multichannel to server which does not support it We were ignoring CAP_MULTI_CHANNEL in the server response - if the server doesn't support multichannel we should not be attempting it. See MS-SMB2 section 3.2.5.2 Reviewed-by: Shyam Prasad N Reviewed-By: Tom Talpey Cc: # v5.8+ Signed-off-by: Steve French --- fs/cifs/sess.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 63d517b9f2ff..a92a1fb7cb52 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -97,6 +97,12 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) return 0; } + if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); + ses->chan_max = 1; + return 0; + } + /* * Make a copy of the iface list at the time and use that * instead so as to not hold the iface spinlock for opening From c1f8a398b6d661b594556a91224b096d92293061 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 7 May 2021 19:33:51 -0500 Subject: [PATCH 1368/1379] smb3: if max_channels set to more than one channel request multichannel Mounting with "multichannel" is obviously implied if user requested more than one channel on mount (ie mount parm max_channels>1). Currently both have to be specified. Fix that so that if max_channels is greater than 1 on mount, enable multichannel rather than silently falling back to non-multichannel. Signed-off-by: Steve French Reviewed-By: Tom Talpey Cc: # v5.11+ Reviewed-by: Shyam Prasad N --- fs/cifs/fs_context.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 3bcf881c3ae9..5d21cd905315 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -1021,6 +1021,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } ctx->max_channels = result.uint_32; + /* If more than one channel requested ... they want multichan */ + if (result.uint_32 > 1) + ctx->multichannel = true; break; case Opt_handletimeout: ctx->handle_timeout = result.uint_32; From fec4d42724a1bf3dcba52307e55375fdb967b852 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 8 May 2021 11:30:22 -0700 Subject: [PATCH 1369/1379] drm/i915/display: fix compiler warning about array overrun MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit intel_dp_check_mst_status() uses a 14-byte array to read the DPRX Event Status Indicator data, but then passes that buffer at offset 10 off as an argument to drm_dp_channel_eq_ok(). End result: there are only 4 bytes remaining of the buffer, yet drm_dp_channel_eq_ok() wants a 6-byte buffer. gcc-11 correctly warns about this case: drivers/gpu/drm/i915/display/intel_dp.c: In function ‘intel_dp_check_mst_status’: drivers/gpu/drm/i915/display/intel_dp.c:3491:22: warning: ‘drm_dp_channel_eq_ok’ reading 6 bytes from a region of size 4 [-Wstringop-overread] 3491 | !drm_dp_channel_eq_ok(&esi[10], intel_dp->lane_count)) { | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/gpu/drm/i915/display/intel_dp.c:3491:22: note: referencing argument 1 of type ‘const u8 *’ {aka ‘const unsigned char *’} In file included from drivers/gpu/drm/i915/display/intel_dp.c:38: include/drm/drm_dp_helper.h:1466:6: note: in a call to function ‘drm_dp_channel_eq_ok’ 1466 | bool drm_dp_channel_eq_ok(const u8 link_status[DP_LINK_STATUS_SIZE], | ^~~~~~~~~~~~~~~~~~~~ 6:14 elapsed This commit just extends the original array by 2 zero-initialized bytes, avoiding the warning. There may be some underlying bug in here that caused this confusion, but this is at least no worse than the existing situation that could use random data off the stack. Cc: Jani Nikula Cc: Ville Syrjälä Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Daniel Vetter Cc: Dave Airlie Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/display/intel_dp.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index a560468765c0..6a2dee8cef1f 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -3474,7 +3474,18 @@ intel_dp_check_mst_status(struct intel_dp *intel_dp) drm_WARN_ON_ONCE(&i915->drm, intel_dp->active_mst_links < 0); for (;;) { - u8 esi[DP_DPRX_ESI_LEN] = {}; + /* + * The +2 is because DP_DPRX_ESI_LEN is 14, but we then + * pass in "esi+10" to drm_dp_channel_eq_ok(), which + * takes a 6-byte array. So we actually need 16 bytes + * here. + * + * Somebody who knows what the limits actually are + * should check this, but for now this is at least + * harmless and avoids a valid compiler warning about + * using more of the array than we have allocated. + */ + u8 esi[DP_DPRX_ESI_LEN+2] = {}; bool handled; int retry; From 35c820e71565d1fa835b82499359218b219828ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 8 May 2021 21:49:48 -0600 Subject: [PATCH 1370/1379] Revert "bio: limit bio max size" This reverts commit cd2c7545ae1beac3b6aae033c7f31193b3255946. Alex reports that the commit causes corruption with LUKS on ext4. Revert it for now so that this can be investigated properly. Link: https://lore.kernel.org/linux-block/1620493841.bxdq8r5haw.none@localhost/ Reported-by: Alex Xu (Hello71) Signed-off-by: Jens Axboe --- block/bio.c | 13 ++----------- block/blk-settings.c | 5 ----- include/linux/bio.h | 4 +--- include/linux/blkdev.h | 2 -- 4 files changed, 3 insertions(+), 21 deletions(-) diff --git a/block/bio.c b/block/bio.c index 221dc56ba22f..44205dfb6b60 100644 --- a/block/bio.c +++ b/block/bio.c @@ -255,13 +255,6 @@ void bio_init(struct bio *bio, struct bio_vec *table, } EXPORT_SYMBOL(bio_init); -unsigned int bio_max_size(struct bio *bio) -{ - struct block_device *bdev = bio->bi_bdev; - - return bdev ? bdev->bd_disk->queue->limits.bio_max_bytes : UINT_MAX; -} - /** * bio_reset - reinitialize a bio * @bio: bio to reset @@ -873,7 +866,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page, struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > bio_max_size(bio) - len) { + if (bio->bi_iter.bi_size > UINT_MAX - len) { *same_page = false; return false; } @@ -1002,7 +995,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - unsigned int bytes_left = bio_max_size(bio) - bio->bi_iter.bi_size; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; bool same_page = false; @@ -1018,8 +1010,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - size = iov_iter_get_pages(iter, pages, bytes_left, nr_pages, - &offset); + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; diff --git a/block/blk-settings.c b/block/blk-settings.c index c646503e55d2..9c009090c4b5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -31,7 +31,6 @@ EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); */ void blk_set_default_limits(struct queue_limits *lim) { - lim->bio_max_bytes = UINT_MAX; lim->max_segments = BLK_MAX_SEGMENTS; lim->max_discard_segments = 1; lim->max_integrity_segments = 0; @@ -140,10 +139,6 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - if (check_shl_overflow(max_sectors, SECTOR_SHIFT, - &limits->bio_max_bytes)) - limits->bio_max_bytes = UINT_MAX; - q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/include/linux/bio.h b/include/linux/bio.h index f1a99f0a240c..a0b4cfdf62a4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -106,8 +106,6 @@ static inline void *bio_data(struct bio *bio) return NULL; } -extern unsigned int bio_max_size(struct bio *bio); - /** * bio_full - check if the bio is full * @bio: bio to check @@ -121,7 +119,7 @@ static inline bool bio_full(struct bio *bio, unsigned len) if (bio->bi_vcnt >= bio->bi_max_vecs) return true; - if (bio->bi_iter.bi_size > bio_max_size(bio) - len) + if (bio->bi_iter.bi_size > UINT_MAX - len) return true; return false; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 40c7c4d87aa1..b91ba6207365 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -327,8 +327,6 @@ enum blk_bounce { }; struct queue_limits { - unsigned int bio_max_bytes; - enum blk_bounce bounce; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; From 6dae40aed484ef2f1a3934dcdcd17b7055173e56 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 May 2021 14:03:33 -0700 Subject: [PATCH 1371/1379] fbmem: fix horribly incorrect placement of __maybe_unused Commit b9d79e4ca4ff ("fbmem: Mark proc_fb_seq_ops as __maybe_unused") places the '__maybe_unused' in an entirely incorrect location between the "struct" keyword and the structure name. It's a wonder that gcc accepts that silently, but clang quite reasonably warns about it: drivers/video/fbdev/core/fbmem.c:736:21: warning: attribute declaration must precede definition [-Wignored-attributes] static const struct __maybe_unused seq_operations proc_fb_seq_ops = { ^ Fix it. Cc: Guenter Roeck Cc: Daniel Vetter Signed-off-by: Linus Torvalds --- drivers/video/fbdev/core/fbmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 52c606c0f8a2..072780b0e570 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -733,7 +733,7 @@ static int fb_seq_show(struct seq_file *m, void *v) return 0; } -static const struct __maybe_unused seq_operations proc_fb_seq_ops = { +static const struct seq_operations __maybe_unused proc_fb_seq_ops = { .start = fb_seq_start, .next = fb_seq_next, .stop = fb_seq_stop, From 6efb943b8616ec53a5e444193dccf1af9ad627b5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 May 2021 14:17:44 -0700 Subject: [PATCH 1372/1379] Linux 5.13-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 15b6476d0f89..53d09c414635 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 -PATCHLEVEL = 12 +PATCHLEVEL = 13 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Frozen Wasteland # *DOCUMENTATION* From e44b49f623c77bee7451f1a82ccfb969c1028ae2 Mon Sep 17 00:00:00 2001 From: Zhu Lingshan Date: Sat, 8 May 2021 15:11:52 +0800 Subject: [PATCH 1373/1379] Revert "irqbypass: do not start cons/prod when failed connect" This reverts commit a979a6aa009f3c99689432e0cdb5402a4463fb88. The reverted commit may cause VM freeze on arm64 with GICv4, where stopping a consumer is implemented by suspending the VM. Should the connect fail, the VM will not be resumed, which is a bit of a problem. It also erroneously calls the producer destructor unconditionally, which is unexpected. Reported-by: Shaokun Zhang Suggested-by: Marc Zyngier Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Reviewed-by: Eric Auger Tested-by: Shaokun Zhang Signed-off-by: Zhu Lingshan [maz: tags and cc-stable, commit message update] Signed-off-by: Marc Zyngier Fixes: a979a6aa009f ("irqbypass: do not start cons/prod when failed connect") Link: https://lore.kernel.org/r/3a2c66d6-6ca0-8478-d24b-61e8e3241b20@hisilicon.com Link: https://lore.kernel.org/r/20210508071152.722425-1-lingshan.zhu@intel.com Cc: stable@vger.kernel.org --- virt/lib/irqbypass.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/virt/lib/irqbypass.c b/virt/lib/irqbypass.c index c9bb3957f58a..28fda42e471b 100644 --- a/virt/lib/irqbypass.c +++ b/virt/lib/irqbypass.c @@ -40,21 +40,17 @@ static int __connect(struct irq_bypass_producer *prod, if (prod->add_consumer) ret = prod->add_consumer(prod, cons); - if (ret) - goto err_add_consumer; - - ret = cons->add_producer(cons, prod); - if (ret) - goto err_add_producer; + if (!ret) { + ret = cons->add_producer(cons, prod); + if (ret && prod->del_consumer) + prod->del_consumer(prod, cons); + } if (cons->start) cons->start(cons); if (prod->start) prod->start(prod); -err_add_producer: - if (prod->del_consumer) - prod->del_consumer(prod, cons); -err_add_consumer: + return ret; } From fcb8283920b135bca2916133e2383a501ad57eaa Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Tue, 27 Apr 2021 06:33:57 +0800 Subject: [PATCH 1374/1379] KVM: arm64: Fix boolreturn.cocci warnings arch/arm64/kvm/mmu.c:1114:9-10: WARNING: return of 0/1 in function 'kvm_age_gfn' with return type bool arch/arm64/kvm/mmu.c:1084:9-10: WARNING: return of 0/1 in function 'kvm_set_spte_gfn' with return type bool arch/arm64/kvm/mmu.c:1127:9-10: WARNING: return of 0/1 in function 'kvm_test_age_gfn' with return type bool arch/arm64/kvm/mmu.c:1070:9-10: WARNING: return of 0/1 in function 'kvm_unmap_gfn_range' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Generated by: scripts/coccinelle/misc/boolreturn.cocci Fixes: cd4c71835228 ("KVM: arm64: Convert to the gfn-based MMU notifier callbacks") Reported-by: kernel test robot Signed-off-by: kernel test robot Reviewed-by: Sean Christopherson Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210426223357.GA45871@cd4295a34ed8 --- arch/arm64/kvm/mmu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c5d1f3c87dbd..c10207fed2f3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1156,13 +1156,13 @@ out_unlock: bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) - return 0; + return false; __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, range->may_block); - return 0; + return false; } bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1170,7 +1170,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) kvm_pfn_t pfn = pte_pfn(range->pte); if (!kvm->arch.mmu.pgt) - return 0; + return false; WARN_ON(range->end - range->start != 1); @@ -1190,7 +1190,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) PAGE_SIZE, __pfn_to_phys(pfn), KVM_PGTABLE_PROT_R, NULL); - return 0; + return false; } bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -1200,7 +1200,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) pte_t pte; if (!kvm->arch.mmu.pgt) - return 0; + return false; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); @@ -1213,7 +1213,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { if (!kvm->arch.mmu.pgt) - return 0; + return false; return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT); From eaa9b88dae64254a87d3d83b77afa71ee992f502 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 14 May 2021 08:56:39 +0000 Subject: [PATCH 1375/1379] KVM: arm64: Mark pkvm_pgtable_mm_ops static It is not used outside of setup.c, mark it static. Fixes:f320bc742bc2 ("KVM: arm64: Prepare the creation of s1 mappings at EL2") Reported-by: kernel test robot Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210514085640.3917886-2-qperret@google.com --- arch/arm64/kvm/hyp/nvhe/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 7488f53b0aa2..a3d3a275344e 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -17,7 +17,6 @@ #include struct hyp_pool hpool; -struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; unsigned long hyp_nr_cpus; #define hyp_percpu_size ((unsigned long)__per_cpu_end - \ @@ -27,6 +26,7 @@ static void *vmemmap_base; static void *hyp_pgt_base; static void *host_s2_mem_pgt_base; static void *host_s2_dev_pgt_base; +static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static int divide_memory_pool(void *virt, unsigned long size) { From 3fdc15fe8c6445175d61f0fac111d2ee9354e385 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 14 May 2021 08:56:40 +0000 Subject: [PATCH 1376/1379] KVM: arm64: Mark the host stage-2 memory pools static The host stage-2 memory pools are not used outside of mem_protect.c, mark them static. Fixes: 1025c8c0c6ac ("KVM: arm64: Wrap the host with a stage 2") Reported-by: kernel test robot Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210514085640.3917886-3-qperret@google.com --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index e342f7f4f4fb..4b60c0056c04 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -23,8 +23,8 @@ extern unsigned long hyp_nr_cpus; struct host_kvm host_kvm; -struct hyp_pool host_s2_mem; -struct hyp_pool host_s2_dev; +static struct hyp_pool host_s2_mem; +static struct hyp_pool host_s2_dev; /* * Copies of the host's CPU features registers holding sanitized values. From f5e30680616ab09e690b153b7a68ff7dd13e6579 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 6 May 2021 14:31:42 +0100 Subject: [PATCH 1377/1379] KVM: arm64: Move __adjust_pc out of line In order to make it easy to call __adjust_pc() from the EL1 code (in the case of nVHE), rename it to __kvm_adjust_pc() and move it out of line. No expected functional change. Reviewed-by: Alexandru Elisei Reviewed-by: Zenghui Yu Tested-by: Zenghui Yu Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org # 5.11 --- arch/arm64/include/asm/kvm_asm.h | 2 ++ arch/arm64/kvm/hyp/exception.c | 18 +++++++++++++++++- arch/arm64/kvm/hyp/include/hyp/adjust_pc.h | 18 ------------------ arch/arm64/kvm/hyp/nvhe/switch.c | 3 +-- arch/arm64/kvm/hyp/vhe/switch.c | 3 +-- 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index cf8df032b9c3..d5b11037401d 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -201,6 +201,8 @@ extern void __kvm_timer_set_cntvoff(u64 cntvoff); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); +extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu); + extern u64 __vgic_v3_get_gic_config(void); extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 73629094f903..0812a496725f 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -296,7 +296,7 @@ static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) *vcpu_pc(vcpu) = vect_offset; } -void kvm_inject_exception(struct kvm_vcpu *vcpu) +static void kvm_inject_exception(struct kvm_vcpu *vcpu) { if (vcpu_el1_is_32bit(vcpu)) { switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { @@ -329,3 +329,19 @@ void kvm_inject_exception(struct kvm_vcpu *vcpu) } } } + +/* + * Adjust the guest PC on entry, depending on flags provided by EL1 + * for the purpose of emulation (MMIO, sysreg) or exception injection. + */ +void __kvm_adjust_pc(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { + kvm_inject_exception(vcpu); + vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | + KVM_ARM64_EXCEPT_MASK); + } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { + kvm_skip_instr(vcpu); + vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; + } +} diff --git a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h index 61716359035d..4fdfeabefeb4 100644 --- a/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h +++ b/arch/arm64/kvm/hyp/include/hyp/adjust_pc.h @@ -13,8 +13,6 @@ #include #include -void kvm_inject_exception(struct kvm_vcpu *vcpu); - static inline void kvm_skip_instr(struct kvm_vcpu *vcpu) { if (vcpu_mode_is_32bit(vcpu)) { @@ -43,22 +41,6 @@ static inline void __kvm_skip_instr(struct kvm_vcpu *vcpu) write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR); } -/* - * Adjust the guest PC on entry, depending on flags provided by EL1 - * for the purpose of emulation (MMIO, sysreg) or exception injection. - */ -static inline void __adjust_pc(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { - kvm_inject_exception(vcpu); - vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | - KVM_ARM64_EXCEPT_MASK); - } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { - kvm_skip_instr(vcpu); - vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; - } -} - /* * Skip an instruction while host sysregs are live. * Assumes host is always 64-bit. diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index e9f6ea704d07..f7af9688c1f7 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -4,7 +4,6 @@ * Author: Marc Zyngier */ -#include #include #include @@ -201,7 +200,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ __debug_save_host_buffers_nvhe(vcpu); - __adjust_pc(vcpu); + __kvm_adjust_pc(vcpu); /* * We must restore the 32-bit state before the sysregs, thanks diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 7b8f7db5c1ed..b3229924d243 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -4,7 +4,6 @@ * Author: Marc Zyngier */ -#include #include #include @@ -132,7 +131,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __load_guest_stage2(vcpu->arch.hw_mmu); __activate_traps(vcpu); - __adjust_pc(vcpu); + __kvm_adjust_pc(vcpu); sysreg_restore_guest_state_vhe(guest_ctxt); __debug_switch_to_guest(vcpu); From 26778aaa134a9aefdf5dbaad904054d7be9d656d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 6 May 2021 15:20:12 +0100 Subject: [PATCH 1378/1379] KVM: arm64: Commit pending PC adjustemnts before returning to userspace KVM currently updates PC (and the corresponding exception state) using a two phase approach: first by setting a set of flags, then by converting these flags into a state update when the vcpu is about to enter the guest. However, this creates a disconnect with userspace if the vcpu thread returns there with any exception/PC flag set. In this case, the exposed context is wrong, as userspace doesn't have access to these flags (they aren't architectural). It also means that these flags are preserved across a reset, which isn't expected. To solve this problem, force an explicit synchronisation of the exception state on vcpu exit to userspace. As an optimisation for nVHE systems, only perform this when there is something pending. Reported-by: Zenghui Yu Reviewed-by: Alexandru Elisei Reviewed-by: Zenghui Yu Tested-by: Zenghui Yu Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org # 5.11 --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kvm/arm.c | 11 +++++++++++ arch/arm64/kvm/hyp/exception.c | 4 ++-- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 ++++++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index d5b11037401d..5e9b33cbac51 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -63,6 +63,7 @@ #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 #define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20 +#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 1cb39c0803a4..1126eae27400 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -897,6 +897,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_sigset_deactivate(vcpu); + /* + * In the unlikely event that we are returning to userspace + * with pending exceptions or PC adjustment, commit these + * adjustments in order to give userspace a consistent view of + * the vcpu state. Note that this relies on __kvm_adjust_pc() + * being preempt-safe on VHE. + */ + if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION | + KVM_ARM64_INCREMENT_PC))) + kvm_call_hyp(__kvm_adjust_pc, vcpu); + vcpu_put(vcpu); return ret; } diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 0812a496725f..11541b94b328 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -331,8 +331,8 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) } /* - * Adjust the guest PC on entry, depending on flags provided by EL1 - * for the purpose of emulation (MMIO, sysreg) or exception injection. + * Adjust the guest PC (and potentially exception state) depending on + * flags provided by the emulation code. */ void __kvm_adjust_pc(struct kvm_vcpu *vcpu) { diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index f36420a80474..1632f001f4ed 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -28,6 +28,13 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __kvm_vcpu_run(kern_hyp_va(vcpu)); } +static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); + + __kvm_adjust_pc(kern_hyp_va(vcpu)); +} + static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt) { __kvm_flush_vm_context(); @@ -170,6 +177,7 @@ typedef void (*hcall_t)(struct kvm_cpu_context *); static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_vcpu_run), + HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), HANDLE_FUNC(__kvm_tlb_flush_vmid), From cb853ded1d25e5b026ce115dbcde69e3d7e2e831 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 14 May 2021 09:05:41 +0100 Subject: [PATCH 1379/1379] KVM: arm64: Fix debug register indexing Commit 03fdfb2690099 ("KVM: arm64: Don't write junk to sysregs on reset") flipped the register number to 0 for all the debug registers in the sysreg table, hereby indicating that these registers live in a separate shadow structure. However, the author of this patch failed to realise that all the accessors are using that particular index instead of the register encoding, resulting in all the registers hitting index 0. Not quite a valid implementation of the architecture... Address the issue by fixing all the accessors to use the CRm field of the encoding, which contains the debug register index. Fixes: 03fdfb2690099 ("KVM: arm64: Don't write junk to sysregs on reset") Reported-by: Ricardo Koller Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org --- arch/arm64/kvm/sys_regs.c | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 76ea2800c33e..1a7968ad078c 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -399,14 +399,14 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); return true; } @@ -414,7 +414,7 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -424,7 +424,7 @@ static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -434,21 +434,21 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; } static bool trap_bcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); return true; } @@ -456,7 +456,7 @@ static bool trap_bcr(struct kvm_vcpu *vcpu, static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -467,7 +467,7 @@ static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -477,22 +477,22 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; } static bool trap_wvr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]); + trace_trap_reg(__func__, rd->CRm, p->is_write, + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]); return true; } @@ -500,7 +500,7 @@ static bool trap_wvr(struct kvm_vcpu *vcpu, static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -510,7 +510,7 @@ static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -520,21 +520,21 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; } static bool trap_wcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; if (p->is_write) reg_to_dbg(vcpu, p, rd, dbg_reg); else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg); + trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); return true; } @@ -542,7 +542,7 @@ static bool trap_wcr(struct kvm_vcpu *vcpu, static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -552,7 +552,7 @@ static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, const struct kvm_one_reg *reg, void __user *uaddr) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg]; + __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) return -EFAULT; @@ -562,7 +562,7 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static void reset_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val; + vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; } static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)