From 77965c98cffe41994dce3389c4aae80e2072f098 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 1 Jul 2021 09:29:25 +0200 Subject: pwm: Move legacy driver handling into a dedicated function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no change in behaviour, only some code is moved from pwm_apply_state to a separate function. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 130 ++++++++++++++++++++++++++++------------------------- 1 file changed, 70 insertions(+), 60 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index fb04a439462c..c4bbe12cd850 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -522,6 +522,64 @@ static void pwm_apply_state_debug(struct pwm_device *pwm, } } +static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + /* + * FIXME: restore the initial state in case of error. + */ + if (state->polarity != pwm->state.polarity) { + if (!chip->ops->set_polarity) + return -EINVAL; + + /* + * Changing the polarity of a running PWM is only allowed when + * the PWM driver implements ->apply(). + */ + if (pwm->state.enabled) { + chip->ops->disable(chip, pwm); + + /* + * Update pwm->state already here in case + * .set_polarity() or another callback depend on that. + */ + pwm->state.enabled = false; + } + + err = chip->ops->set_polarity(chip, pwm, state->polarity); + if (err) + return err; + + pwm->state.polarity = state->polarity; + } + + if (state->period != pwm->state.period || + state->duty_cycle != pwm->state.duty_cycle) { + err = chip->ops->config(pwm->chip, pwm, + state->duty_cycle, + state->period); + if (err) + return err; + + pwm->state.period = state->period; + pwm->state.duty_cycle = state->duty_cycle; + } + + if (state->enabled != pwm->state.enabled) { + if (!pwm->state.enabled) { + err = chip->ops->enable(chip, pwm); + if (err) + return err; + } else { + chip->ops->disable(chip, pwm); + } + } + + return 0; +} + /** * pwm_apply_state() - atomically apply a new state to a PWM device * @pwm: PWM device @@ -554,70 +612,22 @@ int pwm_apply_state(struct pwm_device *pwm, const struct pwm_state *state) state->usage_power == pwm->state.usage_power) return 0; - if (chip->ops->apply) { + if (chip->ops->apply) err = chip->ops->apply(chip, pwm, state); - if (err) - return err; - - trace_pwm_apply(pwm, state); - - pwm->state = *state; - - /* - * only do this after pwm->state was applied as some - * implementations of .get_state depend on this - */ - pwm_apply_state_debug(pwm, state); - } else { - /* - * FIXME: restore the initial state in case of error. - */ - if (state->polarity != pwm->state.polarity) { - if (!chip->ops->set_polarity) - return -EINVAL; - - /* - * Changing the polarity of a running PWM is - * only allowed when the PWM driver implements - * ->apply(). - */ - if (pwm->state.enabled) { - chip->ops->disable(chip, pwm); - pwm->state.enabled = false; - } - - err = chip->ops->set_polarity(chip, pwm, - state->polarity); - if (err) - return err; - - pwm->state.polarity = state->polarity; - } - - if (state->period != pwm->state.period || - state->duty_cycle != pwm->state.duty_cycle) { - err = chip->ops->config(pwm->chip, pwm, - state->duty_cycle, - state->period); - if (err) - return err; + else + err = pwm_apply_legacy(chip, pwm, state); + if (err) + return err; - pwm->state.duty_cycle = state->duty_cycle; - pwm->state.period = state->period; - } + trace_pwm_apply(pwm, state); - if (state->enabled != pwm->state.enabled) { - if (state->enabled) { - err = chip->ops->enable(chip, pwm); - if (err) - return err; - } else { - chip->ops->disable(chip, pwm); - } + pwm->state = *state; - pwm->state.enabled = state->enabled; - } - } + /* + * only do this after pwm->state was applied as some + * implementations of .get_state depend on this + */ + pwm_apply_state_debug(pwm, state); return 0; } -- cgit v1.2.3-58-ga151 From 92f69e582e15bf281ff1ab3ccc7abdd8392550a3 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 1 Jul 2021 09:29:26 +0200 Subject: pwm: Prevent a glitch for legacy drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a running PWM is reconfigured to disabled calling the ->config() callback before disabling the hardware might result in a glitch where the (maybe) new period and duty_cycle are visible on the output before disabling the hardware. So handle disabling before calling ->config(). Also exit early in this case which is possible because period and duty_cycle don't matter for disabled PWMs. In return however ->config has to be called even if state->period == pwm->state.period && state->duty_cycle != pwm->state.duty_cycle because setting these might have been skipped in the previous call. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index c4bbe12cd850..dedf38a81bf9 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -555,26 +555,33 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, pwm->state.polarity = state->polarity; } - if (state->period != pwm->state.period || - state->duty_cycle != pwm->state.duty_cycle) { - err = chip->ops->config(pwm->chip, pwm, - state->duty_cycle, - state->period); - if (err) - return err; + if (!state->enabled) { + if (pwm->state.enabled) + chip->ops->disable(chip, pwm); - pwm->state.period = state->period; - pwm->state.duty_cycle = state->duty_cycle; + return 0; } - if (state->enabled != pwm->state.enabled) { - if (!pwm->state.enabled) { - err = chip->ops->enable(chip, pwm); - if (err) - return err; - } else { - chip->ops->disable(chip, pwm); - } + /* + * We cannot skip calling ->config even if state->period == + * pwm->state.period && state->duty_cycle == pwm->state.duty_cycle + * because we might have exited early in the last call to + * pwm_apply_state because of !state->enabled and so the two values in + * pwm->state might not be configured in hardware. + */ + err = chip->ops->config(pwm->chip, pwm, + state->duty_cycle, + state->period); + if (err) + return err; + + pwm->state.period = state->period; + pwm->state.duty_cycle = state->duty_cycle; + + if (!pwm->state.enabled) { + err = chip->ops->enable(chip, pwm); + if (err) + return err; } return 0; -- cgit v1.2.3-58-ga151 From e45a178e9e285c86265611a56705a1e6444037e3 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 1 Jul 2021 09:29:27 +0200 Subject: pwm: Restore initial state if a legacy callback fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is not entirely accurate to go back to the initial state after e.g. .enable() failed, as .config() still modified the hardware, but this same inconsistency exists for drivers that implement .apply(). Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index dedf38a81bf9..57b8cedfec3f 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -526,10 +526,8 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, const struct pwm_state *state) { int err; + struct pwm_state initial_state = pwm->state; - /* - * FIXME: restore the initial state in case of error. - */ if (state->polarity != pwm->state.polarity) { if (!chip->ops->set_polarity) return -EINVAL; @@ -550,7 +548,7 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, err = chip->ops->set_polarity(chip, pwm, state->polarity); if (err) - return err; + goto rollback; pwm->state.polarity = state->polarity; } @@ -573,7 +571,7 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, state->duty_cycle, state->period); if (err) - return err; + goto rollback; pwm->state.period = state->period; pwm->state.duty_cycle = state->duty_cycle; @@ -581,10 +579,14 @@ static int pwm_apply_legacy(struct pwm_chip *chip, struct pwm_device *pwm, if (!pwm->state.enabled) { err = chip->ops->enable(chip, pwm); if (err) - return err; + goto rollback; } return 0; + +rollback: + pwm->state = initial_state; + return err; } /** -- cgit v1.2.3-58-ga151 From 5e93d7782f7fda242e0e696da918f720660854bf Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 28 Oct 2021 12:09:42 +0200 Subject: pwm: twl: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushes down a slightly optimized variant of how legacy drivers are handled in the core. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-twl.c | 62 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c index 203194f2c92e..86567add79db 100644 --- a/drivers/pwm/pwm-twl.c +++ b/drivers/pwm/pwm-twl.c @@ -58,9 +58,9 @@ static inline struct twl_pwm_chip *to_twl(struct pwm_chip *chip) } static int twl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) + u64 duty_ns, u64 period_ns) { - int duty_cycle = DIV_ROUND_UP(duty_ns * TWL_PWM_MAX, period_ns) + 1; + int duty_cycle = DIV64_U64_ROUND_UP(duty_ns * TWL_PWM_MAX, period_ns) + 1; u8 pwm_config[2] = { 1, 0 }; int base, ret; @@ -279,19 +279,65 @@ out: mutex_unlock(&twl->mutex); } +static int twl4030_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + twl4030_pwm_disable(chip, pwm); + + return 0; + } + + err = twl_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = twl4030_pwm_enable(chip, pwm); + + return err; +} + +static int twl6030_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + twl6030_pwm_disable(chip, pwm); + + return 0; + } + + err = twl_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = twl6030_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops twl4030_pwm_ops = { - .config = twl_pwm_config, - .enable = twl4030_pwm_enable, - .disable = twl4030_pwm_disable, + .apply = twl4030_pwm_apply, .request = twl4030_pwm_request, .free = twl4030_pwm_free, .owner = THIS_MODULE, }; static const struct pwm_ops twl6030_pwm_ops = { - .config = twl_pwm_config, - .enable = twl6030_pwm_enable, - .disable = twl6030_pwm_disable, + .apply = twl6030_pwm_apply, .owner = THIS_MODULE, }; -- cgit v1.2.3-58-ga151 From 0ee11b87c38b64c693881a53969056e16a03fcd1 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 29 Oct 2021 12:56:17 +0200 Subject: pwm: img: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushes down a slightly optimized variant of how legacy drivers are handled in the core. Signed-off-by: Uwe Kleine-König Tested-by: Hauke Mehrtens Signed-off-by: Thierry Reding --- drivers/pwm/pwm-img.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/pwm/pwm-img.c b/drivers/pwm/pwm-img.c index f97f82548293..1f3d6346ab86 100644 --- a/drivers/pwm/pwm-img.c +++ b/drivers/pwm/pwm-img.c @@ -184,10 +184,33 @@ static void img_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) pm_runtime_put_autosuspend(chip->dev); } +static int img_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + + if (state->polarity != PWM_POLARITY_NORMAL) + return -EINVAL; + + if (!state->enabled) { + if (pwm->state.enabled) + img_pwm_disable(chip, pwm); + + return 0; + } + + err = img_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!pwm->state.enabled) + err = img_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops img_pwm_ops = { - .config = img_pwm_config, - .enable = img_pwm_enable, - .disable = img_pwm_disable, + .apply = img_pwm_apply, .owner = THIS_MODULE, }; -- cgit v1.2.3-58-ga151 From 14d8956548ad48574138d8fd377d434083c3c3cd Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Tue, 2 Nov 2021 10:28:03 +0100 Subject: pwm: vt8500: Implement .apply() callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To eventually get rid of all legacy drivers convert this driver to the modern world implementing .apply(). This just pushes down a slightly optimized variant of how legacy drivers are handled in the core. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-vt8500.c | 57 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c index 480bfc29782f..7170a315535b 100644 --- a/drivers/pwm/pwm-vt8500.c +++ b/drivers/pwm/pwm-vt8500.c @@ -70,7 +70,7 @@ static inline void vt8500_pwm_busy_wait(struct vt8500_chip *vt8500, int nr, u8 b } static int vt8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) + u64 duty_ns, u64 period_ns) { struct vt8500_chip *vt8500 = to_vt8500_chip(chip); unsigned long long c; @@ -102,8 +102,8 @@ static int vt8500_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, } c = (unsigned long long)pv * duty_ns; - do_div(c, period_ns); - dc = c; + + dc = div64_u64(c, period_ns); writel(prescale, vt8500->base + REG_SCALAR(pwm->hwpwm)); vt8500_pwm_busy_wait(vt8500, pwm->hwpwm, STATUS_SCALAR_UPDATE); @@ -176,11 +176,54 @@ static int vt8500_pwm_set_polarity(struct pwm_chip *chip, return 0; } +static int vt8500_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + const struct pwm_state *state) +{ + int err; + bool enabled = pwm->state.enabled; + + if (state->polarity != pwm->state.polarity) { + /* + * Changing the polarity of a running PWM is only allowed when + * the PWM driver implements ->apply(). + */ + if (enabled) { + vt8500_pwm_disable(chip, pwm); + + enabled = false; + } + + err = vt8500_pwm_set_polarity(chip, pwm, state->polarity); + if (err) + return err; + } + + if (!state->enabled) { + if (enabled) + vt8500_pwm_disable(chip, pwm); + + return 0; + } + + /* + * We cannot skip calling ->config even if state->period == + * pwm->state.period && state->duty_cycle == pwm->state.duty_cycle + * because we might have exited early in the last call to + * pwm_apply_state because of !state->enabled and so the two values in + * pwm->state might not be configured in hardware. + */ + err = vt8500_pwm_config(pwm->chip, pwm, state->duty_cycle, state->period); + if (err) + return err; + + if (!enabled) + err = vt8500_pwm_enable(chip, pwm); + + return err; +} + static const struct pwm_ops vt8500_pwm_ops = { - .enable = vt8500_pwm_enable, - .disable = vt8500_pwm_disable, - .config = vt8500_pwm_config, - .set_polarity = vt8500_pwm_set_polarity, + .apply = vt8500_pwm_apply, .owner = THIS_MODULE, }; -- cgit v1.2.3-58-ga151 From b6ce2af8766c39a5b09afa466ed4d0ef2d8b5a65 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 12 Nov 2021 20:00:15 +0100 Subject: pwm: img: Use only a single idiom to get a runtime PM reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently there are two very similar approaches in use by this driver: img_pwm_config() uses pm_runtime_get_sync() and calls pm_runtime_put_autosuspend() in the error path; img_pwm_enable() calls pm_runtime_resume_and_get() which already puts the reference in its own error path. Align pm_runtime usage and use the same idiom in both locations. Signed-off-by: Uwe Kleine-König Signed-off-by: Thierry Reding --- drivers/pwm/pwm-img.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pwm/pwm-img.c b/drivers/pwm/pwm-img.c index 1f3d6346ab86..5996049f66ec 100644 --- a/drivers/pwm/pwm-img.c +++ b/drivers/pwm/pwm-img.c @@ -128,11 +128,9 @@ static int img_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, duty = DIV_ROUND_UP(timebase * duty_ns, period_ns); - ret = pm_runtime_get_sync(chip->dev); - if (ret < 0) { - pm_runtime_put_autosuspend(chip->dev); + ret = pm_runtime_resume_and_get(chip->dev); + if (ret < 0) return ret; - } val = img_pwm_readl(pwm_chip, PWM_CTRL_CFG); val &= ~(PWM_CTRL_CFG_DIV_MASK << PWM_CTRL_CFG_DIV_SHIFT(pwm->hwpwm)); -- cgit v1.2.3-58-ga151 From f601aa7930669439623dd266fc9e90b0218b42c1 Mon Sep 17 00:00:00 2001 From: Camel Guo Date: Thu, 11 Nov 2021 09:36:25 +0100 Subject: rtc: rs5c372: Add RTC_VL_READ, RTC_VL_CLR ioctls In order to make it possible to get battery voltage status, this commit adds RTC_VL_READ, RTC_VL_CLR ioctl commands to rtc-rs5c372. Signed-off-by: Camel Guo Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211111083625.10216-1-camel.guo@axis.com --- drivers/rtc/rtc-rs5c372.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index 80980414890c..955513514179 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c @@ -485,6 +485,60 @@ static int rs5c372_rtc_proc(struct device *dev, struct seq_file *seq) #define rs5c372_rtc_proc NULL #endif +#ifdef CONFIG_RTC_INTF_DEV +static int rs5c372_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) +{ + struct rs5c372 *rs5c = i2c_get_clientdata(to_i2c_client(dev)); + unsigned char ctrl2; + int addr; + unsigned int flags; + + dev_dbg(dev, "%s: cmd=%x\n", __func__, cmd); + + addr = RS5C_ADDR(RS5C_REG_CTRL2); + ctrl2 = i2c_smbus_read_byte_data(rs5c->client, addr); + + switch (cmd) { + case RTC_VL_READ: + flags = 0; + + switch (rs5c->type) { + case rtc_r2025sd: + case rtc_r2221tl: + if ((rs5c->type == rtc_r2025sd && !(ctrl2 & R2x2x_CTRL2_XSTP)) || + (rs5c->type == rtc_r2221tl && (ctrl2 & R2x2x_CTRL2_XSTP))) { + flags |= RTC_VL_DATA_INVALID; + } + if (ctrl2 & R2x2x_CTRL2_VDET) + flags |= RTC_VL_BACKUP_LOW; + break; + default: + if (ctrl2 & RS5C_CTRL2_XSTP) + flags |= RTC_VL_DATA_INVALID; + break; + } + + return put_user(flags, (unsigned int __user *)arg); + case RTC_VL_CLR: + /* clear VDET bit */ + if (rs5c->type == rtc_r2025sd || rs5c->type == rtc_r2221tl) { + ctrl2 &= ~R2x2x_CTRL2_VDET; + if (i2c_smbus_write_byte_data(rs5c->client, addr, ctrl2) < 0) { + dev_dbg(&rs5c->client->dev, "%s: write error in line %i\n", + __func__, __LINE__); + return -EIO; + } + } + return 0; + default: + return -ENOIOCTLCMD; + } + return 0; +} +#else +#define rs5c372_ioctl NULL +#endif + static const struct rtc_class_ops rs5c372_rtc_ops = { .proc = rs5c372_rtc_proc, .read_time = rs5c372_rtc_read_time, @@ -492,6 +546,7 @@ static const struct rtc_class_ops rs5c372_rtc_ops = { .read_alarm = rs5c_read_alarm, .set_alarm = rs5c_set_alarm, .alarm_irq_enable = rs5c_rtc_alarm_irq_enable, + .ioctl = rs5c372_ioctl, }; #if IS_ENABLED(CONFIG_RTC_INTF_SYSFS) -- cgit v1.2.3-58-ga151 From 1c1b3098ae1e0d9725d0d4d49986e0edebba443a Mon Sep 17 00:00:00 2001 From: Marc Ferland Date: Tue, 16 Nov 2021 11:47:33 -0500 Subject: rtc: pcf85063: add i2c_device_id name matching support The pcf85063 driver regsitration currently supports the "compatible" property type of matching (for DT). This patch adds "matching by name" support to the driver by defining an i2c_device_id table and setting the id_table parameter in the i2c_driver struct. This will, for example, make the driver easier to instantiate on systems where CONFIG_OF is not enabled (x86 in my case). Signed-off-by: Marc Ferland Reported-by: kernel test robot Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211116164733.17149-1-ferlandm@amotus.ca --- drivers/rtc/rtc-pcf85063.c | 97 +++++++++++++++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 31 deletions(-) diff --git a/drivers/rtc/rtc-pcf85063.c b/drivers/rtc/rtc-pcf85063.c index 15e50bb10cf0..df2b072c394d 100644 --- a/drivers/rtc/rtc-pcf85063.c +++ b/drivers/rtc/rtc-pcf85063.c @@ -514,21 +514,56 @@ static struct clk *pcf85063_clkout_register_clk(struct pcf85063 *pcf85063) } #endif -static const struct pcf85063_config pcf85063tp_config = { - .regmap = { - .reg_bits = 8, - .val_bits = 8, - .max_register = 0x0a, +enum pcf85063_type { + PCF85063, + PCF85063TP, + PCF85063A, + RV8263, + PCF85063_LAST_ID +}; + +static struct pcf85063_config pcf85063_cfg[] = { + [PCF85063] = { + .regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x0a, + }, + }, + [PCF85063TP] = { + .regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x0a, + }, + }, + [PCF85063A] = { + .regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x11, + }, + .has_alarms = 1, + }, + [RV8263] = { + .regmap = { + .reg_bits = 8, + .val_bits = 8, + .max_register = 0x11, + }, + .has_alarms = 1, + .force_cap_7000 = 1, }, }; +static const struct i2c_device_id pcf85063_ids[]; + static int pcf85063_probe(struct i2c_client *client) { struct pcf85063 *pcf85063; unsigned int tmp; int err; - const struct pcf85063_config *config = &pcf85063tp_config; - const void *data = of_device_get_match_data(&client->dev); + const struct pcf85063_config *config; struct nvmem_config nvmem_cfg = { .name = "pcf85063_nvram", .reg_read = pcf85063_nvmem_read, @@ -544,8 +579,17 @@ static int pcf85063_probe(struct i2c_client *client) if (!pcf85063) return -ENOMEM; - if (data) - config = data; + if (client->dev.of_node) { + config = of_device_get_match_data(&client->dev); + if (!config) + return -ENODEV; + } else { + enum pcf85063_type type = + i2c_match_id(pcf85063_ids, client)->driver_data; + if (type >= PCF85063_LAST_ID) + return -ENODEV; + config = &pcf85063_cfg[type]; + } pcf85063->regmap = devm_regmap_init_i2c(client, &config->regmap); if (IS_ERR(pcf85063->regmap)) @@ -604,31 +648,21 @@ static int pcf85063_probe(struct i2c_client *client) return devm_rtc_register_device(pcf85063->rtc); } -#ifdef CONFIG_OF -static const struct pcf85063_config pcf85063a_config = { - .regmap = { - .reg_bits = 8, - .val_bits = 8, - .max_register = 0x11, - }, - .has_alarms = 1, -}; - -static const struct pcf85063_config rv8263_config = { - .regmap = { - .reg_bits = 8, - .val_bits = 8, - .max_register = 0x11, - }, - .has_alarms = 1, - .force_cap_7000 = 1, +static const struct i2c_device_id pcf85063_ids[] = { + { "pcf85063", PCF85063 }, + { "pcf85063tp", PCF85063TP }, + { "pcf85063a", PCF85063A }, + { "rv8263", RV8263 }, + {} }; +MODULE_DEVICE_TABLE(i2c, pcf85063_ids); +#ifdef CONFIG_OF static const struct of_device_id pcf85063_of_match[] = { - { .compatible = "nxp,pcf85063", .data = &pcf85063tp_config }, - { .compatible = "nxp,pcf85063tp", .data = &pcf85063tp_config }, - { .compatible = "nxp,pcf85063a", .data = &pcf85063a_config }, - { .compatible = "microcrystal,rv8263", .data = &rv8263_config }, + { .compatible = "nxp,pcf85063", .data = &pcf85063_cfg[PCF85063] }, + { .compatible = "nxp,pcf85063tp", .data = &pcf85063_cfg[PCF85063TP] }, + { .compatible = "nxp,pcf85063a", .data = &pcf85063_cfg[PCF85063A] }, + { .compatible = "microcrystal,rv8263", .data = &pcf85063_cfg[RV8263] }, {} }; MODULE_DEVICE_TABLE(of, pcf85063_of_match); @@ -640,6 +674,7 @@ static struct i2c_driver pcf85063_driver = { .of_match_table = of_match_ptr(pcf85063_of_match), }, .probe_new = pcf85063_probe, + .id_table = pcf85063_ids, }; module_i2c_driver(pcf85063_driver); -- cgit v1.2.3-58-ga151 From a478c433d72bf006f36bef68c239c8a68b062e5b Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 10 Nov 2021 00:47:50 +0100 Subject: rtc: da9063: switch to RTC_FEATURE_UPDATE_INTERRUPT Stop using uie_unsupported and clear RTC_FEATURE_UPDATE_INTERRUPT instead. Also, let the core know that the alarm will truncate seconds as it only has a minute resolution. Signed-off-by: Alexandre Belloni Reviewed-by: Adam Thomson Link: https://lore.kernel.org/r/20211109234750.107115-1-alexandre.belloni@bootlin.com --- drivers/rtc/rtc-da9063.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/rtc/rtc-da9063.c b/drivers/rtc/rtc-da9063.c index d4b72a9fa2ba..54a5e244946b 100644 --- a/drivers/rtc/rtc-da9063.c +++ b/drivers/rtc/rtc-da9063.c @@ -475,12 +475,14 @@ static int da9063_rtc_probe(struct platform_device *pdev) da9063_data_to_tm(data, &rtc->alarm_time, rtc); rtc->rtc_sync = false; - /* - * TODO: some models have alarms on a minute boundary but still support - * real hardware interrupts. Add this once the core supports it. - */ - if (config->rtc_data_start != RTC_SEC) - rtc->rtc_dev->uie_unsupported = 1; + if (config->rtc_data_start != RTC_SEC) { + set_bit(RTC_FEATURE_ALARM_RES_MINUTE, rtc->rtc_dev->features); + /* + * TODO: some models have alarms on a minute boundary but still + * support real hardware interrupts. + */ + clear_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc->rtc_dev->features); + } irq_alarm = platform_get_irq_byname(pdev, "ALARM"); if (irq_alarm < 0) -- cgit v1.2.3-58-ga151 From 029d3a6f2f3c73ac29a7460d8007798e940488fd Mon Sep 17 00:00:00 2001 From: Nikita Shubin Date: Mon, 29 Nov 2021 10:26:49 +0300 Subject: rtc: da9063: add as wakeup source As da9063 RTC is not a real I2C client, but relies on da9063 MFD driver, we need to explicitly mark da9063 RTC as a wakeup source to be able to access class/rtc/rtcN/wakealarm sysfs entry to set alarms, so we can wakeup from SHUTDOWN/RTC/DELIVERY mode. As da9063 driver refuses to load without irq, we simply add it as a wakeup source before registering rtc device. Signed-off-by: Nikita Shubin Reviewed-by: Adam Thomson Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211129072650.22686-1-nikita.shubin@maquefel.me --- drivers/rtc/rtc-da9063.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-da9063.c b/drivers/rtc/rtc-da9063.c index 54a5e244946b..ee2efb496174 100644 --- a/drivers/rtc/rtc-da9063.c +++ b/drivers/rtc/rtc-da9063.c @@ -496,6 +496,8 @@ static int da9063_rtc_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Failed to request ALARM IRQ %d: %d\n", irq_alarm, ret); + device_init_wakeup(&pdev->dev, true); + return devm_rtc_register_device(rtc->rtc_dev); } -- cgit v1.2.3-58-ga151 From 10d96b44a94e5cfd23739d2dcb950a7bdc109736 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 30 Nov 2021 09:58:29 -0300 Subject: dt/bindings: rtc: rx8900: Add an entry for RX8804 The Epson RX8804 RTC has the same programming model as RV8803 and RX8900. Add an entry for it in the binding document. Signed-off-by: Fabio Estevam Reviewed-by: Otavio Salvador Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211130125830.1166194-1-festevam@gmail.com --- Documentation/devicetree/bindings/rtc/epson,rx8900.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/rtc/epson,rx8900.yaml b/Documentation/devicetree/bindings/rtc/epson,rx8900.yaml index 29fe39bb08ad..d12855e7ffd7 100644 --- a/Documentation/devicetree/bindings/rtc/epson,rx8900.yaml +++ b/Documentation/devicetree/bindings/rtc/epson,rx8900.yaml @@ -15,6 +15,7 @@ allOf: properties: compatible: enum: + - epson,rx8804 - epson,rx8900 - microcrystal,rv8803 -- cgit v1.2.3-58-ga151 From 5c0189a8b52f76d8a061d2ec80adb11559742d78 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 30 Nov 2021 09:58:30 -0300 Subject: rtc: rv8803: Add support for the Epson RX8804 RTC The Epson RX8804 RTC has the same programming model as RV8803. Add support for it in the driver. Signed-off-by: Fabio Estevam Reviewed-by: Otavio Salvador Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211130125830.1166194-2-festevam@gmail.com --- drivers/rtc/rtc-rv8803.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/rtc/rtc-rv8803.c b/drivers/rtc/rtc-rv8803.c index 0d5ed38bf60c..f69e0b1137cd 100644 --- a/drivers/rtc/rtc-rv8803.c +++ b/drivers/rtc/rtc-rv8803.c @@ -55,6 +55,7 @@ enum rv8803_type { rv_8803, + rx_8804, rx_8900 }; @@ -601,6 +602,7 @@ static int rv8803_probe(struct i2c_client *client, static const struct i2c_device_id rv8803_id[] = { { "rv8803", rv_8803 }, + { "rv8804", rx_8804 }, { "rx8803", rv_8803 }, { "rx8900", rx_8900 }, { } @@ -616,6 +618,10 @@ static const __maybe_unused struct of_device_id rv8803_of_match[] = { .compatible = "epson,rx8803", .data = (void *)rv_8803 }, + { + .compatible = "epson,rx8804", + .data = (void *)rx_8804 + }, { .compatible = "epson,rx8900", .data = (void *)rx_8900 -- cgit v1.2.3-58-ga151 From 3f0565451cc0c5158513af0bc4e91aa8fb0b5e75 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 7 Dec 2021 13:48:55 +0100 Subject: dt-bindings: pwm: Avoid selecting schema on node name match Currently any node whose name starts with the "pwm-" prefix will match this schema and in turn required the "#pwm-cells" property. Avoid this by marking the schema with select: false, therefore only activating the schema when directly included from a PWM controller schema file. Signed-off-by: Thierry Reding Acked-by: Rob Herring Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/pwm.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/pwm/pwm.yaml b/Documentation/devicetree/bindings/pwm/pwm.yaml index 2effe6c0de6b..3c01f85029e5 100644 --- a/Documentation/devicetree/bindings/pwm/pwm.yaml +++ b/Documentation/devicetree/bindings/pwm/pwm.yaml @@ -9,6 +9,8 @@ title: PWM controllers (providers) maintainers: - Thierry Reding +select: false + properties: $nodename: pattern: "^pwm(@.*|-[0-9a-f])*$" -- cgit v1.2.3-58-ga151 From 86559400b3ef9de93ba50523cffe767c35cd531a Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Wed, 15 Dec 2021 18:54:57 +0100 Subject: rtc: gamecube: Add a RTC driver for the GameCube, Wii and Wii U MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These three consoles share a device, the MX23L4005, which contains a clock and 64 bytes of SRAM storage, and is exposed on the EXI bus (similar to SPI) on channel 0, device 1. This driver allows it to be used as a Linux RTC device, where time can be read and set. The hardware also exposes two timers, one which shuts down the console and one which powers it on, but these aren’t supported currently. On the Wii U, the counter bias is stored in a XML file, /config/rtc.xml, encrypted in the SLC (eMMC storage), using a proprietary filesystem. In order to avoid having to implement all that, this driver assumes a bootloader will parse this XML file and write the bias into the SRAM, at the same location the other two consoles have it. Signed-off-by: Emmanuel Gil Peyrot Acked-by: Michael Ellerman (powerpc) Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211215175501.6761-2-linkmauve@linkmauve.fr --- drivers/rtc/Kconfig | 11 ++ drivers/rtc/Makefile | 1 + drivers/rtc/rtc-gamecube.c | 347 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 359 insertions(+) create mode 100644 drivers/rtc/rtc-gamecube.c diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 058e56a10ab8..6d019c5ed374 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1216,6 +1216,17 @@ config RTC_DRV_V3020 This driver can also be built as a module. If so, the module will be called rtc-v3020. +config RTC_DRV_GAMECUBE + tristate "Nintendo GameCube, Wii and Wii U RTC" + depends on GAMECUBE || WII || COMPILE_TEST + select REGMAP + help + If you say yes here you will get support for the RTC subsystem + of the Nintendo GameCube, Wii and Wii U. + + This driver can also be built as a module. If so, the module + will be called "rtc-gamecube". + config RTC_DRV_WM831X tristate "Wolfson Microelectronics WM831x RTC" depends on MFD_WM831X diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 678a8ef4abae..80b8f8f4b635 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -111,6 +111,7 @@ obj-$(CONFIG_RTC_DRV_MT7622) += rtc-mt7622.o obj-$(CONFIG_RTC_DRV_MV) += rtc-mv.o obj-$(CONFIG_RTC_DRV_MXC) += rtc-mxc.o obj-$(CONFIG_RTC_DRV_MXC_V2) += rtc-mxc_v2.o +obj-$(CONFIG_RTC_DRV_GAMECUBE) += rtc-gamecube.o obj-$(CONFIG_RTC_DRV_NTXEC) += rtc-ntxec.o obj-$(CONFIG_RTC_DRV_OMAP) += rtc-omap.o obj-$(CONFIG_RTC_DRV_OPAL) += rtc-opal.o diff --git a/drivers/rtc/rtc-gamecube.c b/drivers/rtc/rtc-gamecube.c new file mode 100644 index 000000000000..e8260c82c07d --- /dev/null +++ b/drivers/rtc/rtc-gamecube.c @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Nintendo GameCube, Wii and Wii U RTC driver + * + * This driver is for the MX23L4005, more specifically its real-time clock and + * SRAM storage. The value returned by the RTC counter must be added with the + * offset stored in a bias register in SRAM (on the GameCube and Wii) or in + * /config/rtc.xml (on the Wii U). The latter being very impractical to access + * from Linux, this driver assumes the bootloader has read it and stored it in + * SRAM like for the other two consoles. + * + * This device sits on a bus named EXI (which is similar to SPI), channel 0, + * device 1. This driver assumes no other user of the EXI bus, which is + * currently the case but would have to be reworked to add support for other + * GameCube hardware exposed on this bus. + * + * References: + * - https://wiiubrew.org/wiki/Hardware/RTC + * - https://wiibrew.org/wiki/MX23L4005 + * + * Copyright (C) 2018 rw-r-r-0644 + * Copyright (C) 2021 Emmanuel Gil Peyrot + * + * Based on rtc-gcn.c + * Copyright (C) 2004-2009 The GameCube Linux Team + * Copyright (C) 2005,2008,2009 Albert Herranz + * Based on gamecube_time.c from Torben Nielsen. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* EXI registers */ +#define EXICSR 0 +#define EXICR 12 +#define EXIDATA 16 + +/* EXI register values */ +#define EXICSR_DEV 0x380 + #define EXICSR_DEV1 0x100 +#define EXICSR_CLK 0x070 + #define EXICSR_CLK_1MHZ 0x000 + #define EXICSR_CLK_2MHZ 0x010 + #define EXICSR_CLK_4MHZ 0x020 + #define EXICSR_CLK_8MHZ 0x030 + #define EXICSR_CLK_16MHZ 0x040 + #define EXICSR_CLK_32MHZ 0x050 +#define EXICSR_INT 0x008 + #define EXICSR_INTSET 0x008 + +#define EXICR_TSTART 0x001 +#define EXICR_TRSMODE 0x002 + #define EXICR_TRSMODE_IMM 0x000 +#define EXICR_TRSTYPE 0x00C + #define EXICR_TRSTYPE_R 0x000 + #define EXICR_TRSTYPE_W 0x004 +#define EXICR_TLEN 0x030 + #define EXICR_TLEN32 0x030 + +/* EXI registers values to access the RTC */ +#define RTC_EXICSR (EXICSR_DEV1 | EXICSR_CLK_8MHZ | EXICSR_INTSET) +#define RTC_EXICR_W (EXICR_TSTART | EXICR_TRSMODE_IMM | EXICR_TRSTYPE_W | EXICR_TLEN32) +#define RTC_EXICR_R (EXICR_TSTART | EXICR_TRSMODE_IMM | EXICR_TRSTYPE_R | EXICR_TLEN32) +#define RTC_EXIDATA_W 0x80000000 + +/* RTC registers */ +#define RTC_COUNTER 0x200000 +#define RTC_SRAM 0x200001 +#define RTC_SRAM_BIAS 0x200004 +#define RTC_SNAPSHOT 0x204000 +#define RTC_ONTMR 0x210000 +#define RTC_OFFTMR 0x210001 +#define RTC_TEST0 0x210004 +#define RTC_TEST1 0x210005 +#define RTC_TEST2 0x210006 +#define RTC_TEST3 0x210007 +#define RTC_CONTROL0 0x21000c +#define RTC_CONTROL1 0x21000d + +struct priv { + struct regmap *regmap; + void __iomem *iob; + u32 rtc_bias; +}; + +static int exi_read(void *context, u32 reg, u32 *data) +{ + struct priv *d = (struct priv *)context; + void __iomem *iob = d->iob; + + /* The spin loops here loop about 15~16 times each, so there is no need + * to use a more expensive sleep method. + */ + + /* Write register offset */ + iowrite32be(RTC_EXICSR, iob + EXICSR); + iowrite32be(reg << 8, iob + EXIDATA); + iowrite32be(RTC_EXICR_W, iob + EXICR); + while (!(ioread32be(iob + EXICSR) & EXICSR_INTSET)) + cpu_relax(); + + /* Read data */ + iowrite32be(RTC_EXICSR, iob + EXICSR); + iowrite32be(RTC_EXICR_R, iob + EXICR); + while (!(ioread32be(iob + EXICSR) & EXICSR_INTSET)) + cpu_relax(); + *data = ioread32be(iob + EXIDATA); + + /* Clear channel parameters */ + iowrite32be(0, iob + EXICSR); + + return 0; +} + +static int exi_write(void *context, u32 reg, u32 data) +{ + struct priv *d = (struct priv *)context; + void __iomem *iob = d->iob; + + /* The spin loops here loop about 15~16 times each, so there is no need + * to use a more expensive sleep method. + */ + + /* Write register offset */ + iowrite32be(RTC_EXICSR, iob + EXICSR); + iowrite32be(RTC_EXIDATA_W | (reg << 8), iob + EXIDATA); + iowrite32be(RTC_EXICR_W, iob + EXICR); + while (!(ioread32be(iob + EXICSR) & EXICSR_INTSET)) + cpu_relax(); + + /* Write data */ + iowrite32be(RTC_EXICSR, iob + EXICSR); + iowrite32be(data, iob + EXIDATA); + iowrite32be(RTC_EXICR_W, iob + EXICR); + while (!(ioread32be(iob + EXICSR) & EXICSR_INTSET)) + cpu_relax(); + + /* Clear channel parameters */ + iowrite32be(0, iob + EXICSR); + + return 0; +} + +static const struct regmap_bus exi_bus = { + /* TODO: is that true? Not that it matters here, but still. */ + .fast_io = true, + .reg_read = exi_read, + .reg_write = exi_write, +}; + +static int gamecube_rtc_read_time(struct device *dev, struct rtc_time *t) +{ + struct priv *d = dev_get_drvdata(dev); + int ret; + u32 counter; + time64_t timestamp; + + ret = regmap_read(d->regmap, RTC_COUNTER, &counter); + if (ret) + return ret; + + /* Add the counter and the bias to obtain the timestamp */ + timestamp = (time64_t)d->rtc_bias + counter; + rtc_time64_to_tm(timestamp, t); + + return 0; +} + +static int gamecube_rtc_set_time(struct device *dev, struct rtc_time *t) +{ + struct priv *d = dev_get_drvdata(dev); + time64_t timestamp; + + /* Subtract the timestamp and the bias to obtain the counter value */ + timestamp = rtc_tm_to_time64(t); + return regmap_write(d->regmap, RTC_COUNTER, timestamp - d->rtc_bias); +} + +static const struct rtc_class_ops gamecube_rtc_ops = { + .read_time = gamecube_rtc_read_time, + .set_time = gamecube_rtc_set_time, +}; + +static int gamecube_rtc_read_offset_from_sram(struct priv *d) +{ + struct device_node *np; + int ret; + struct resource res; + void __iomem *hw_srnprot; + u32 old; + + np = of_find_compatible_node(NULL, NULL, "nintendo,latte-srnprot"); + if (!np) + np = of_find_compatible_node(NULL, NULL, + "nintendo,hollywood-srnprot"); + if (!np) { + pr_info("HW_SRNPROT not found, assuming a GameCube\n"); + return regmap_read(d->regmap, RTC_SRAM_BIAS, &d->rtc_bias); + } + + ret = of_address_to_resource(np, 0, &res); + if (ret) { + pr_err("no io memory range found\n"); + return -1; + } + + hw_srnprot = ioremap(res.start, resource_size(&res)); + old = ioread32be(hw_srnprot); + + /* TODO: figure out why we use this magic constant. I obtained it by + * reading the leftover value after boot, after IOSU already ran. + * + * On my Wii U, setting this register to 1 prevents the console from + * rebooting properly, so wiiubrew.org must be missing something. + * + * See https://wiiubrew.org/wiki/Hardware/Latte_registers + */ + if (old != 0x7bf) + iowrite32be(0x7bf, hw_srnprot); + + /* Get the offset from RTC SRAM. + * + * Its default location on the GameCube and on the Wii is in the SRAM, + * while on the Wii U the bootloader needs to fill it with the contents + * of /config/rtc.xml on the SLC (the eMMC). We don’t do that from + * Linux since it requires implementing a proprietary filesystem and do + * file decryption, instead we require the bootloader to fill the same + * SRAM address as on previous consoles. + */ + ret = regmap_read(d->regmap, RTC_SRAM_BIAS, &d->rtc_bias); + if (ret) { + pr_err("failed to get the RTC bias\n"); + return -1; + } + + /* Reset SRAM access to how it was before, our job here is done. */ + if (old != 0x7bf) + iowrite32be(old, hw_srnprot); + iounmap(hw_srnprot); + + return 0; +} + +static const struct regmap_range rtc_rd_ranges[] = { + regmap_reg_range(0x200000, 0x200010), + regmap_reg_range(0x204000, 0x204000), + regmap_reg_range(0x210000, 0x210001), + regmap_reg_range(0x210004, 0x210007), + regmap_reg_range(0x21000c, 0x21000d), +}; + +static const struct regmap_access_table rtc_rd_regs = { + .yes_ranges = rtc_rd_ranges, + .n_yes_ranges = ARRAY_SIZE(rtc_rd_ranges), +}; + +static const struct regmap_range rtc_wr_ranges[] = { + regmap_reg_range(0x200000, 0x200010), + regmap_reg_range(0x204000, 0x204000), + regmap_reg_range(0x210000, 0x210001), + regmap_reg_range(0x21000d, 0x21000d), +}; + +static const struct regmap_access_table rtc_wr_regs = { + .yes_ranges = rtc_wr_ranges, + .n_yes_ranges = ARRAY_SIZE(rtc_wr_ranges), +}; + +static const struct regmap_config gamecube_rtc_regmap_config = { + .reg_bits = 24, + .val_bits = 32, + .rd_table = &rtc_rd_regs, + .wr_table = &rtc_wr_regs, + .max_register = 0x21000d, + .name = "gamecube-rtc", +}; + +static int gamecube_rtc_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rtc_device *rtc; + struct priv *d; + int ret; + + d = devm_kzalloc(dev, sizeof(struct priv), GFP_KERNEL); + if (IS_ERR(d)) + return PTR_ERR(d); + + d->iob = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(d->iob)) + return PTR_ERR(d->iob); + + d->regmap = devm_regmap_init(dev, &exi_bus, d, + &gamecube_rtc_regmap_config); + if (IS_ERR(d->regmap)) + return PTR_ERR(d->regmap); + + ret = gamecube_rtc_read_offset_from_sram(d); + if (ret) + return ret; + dev_dbg(dev, "SRAM bias: 0x%x", d->rtc_bias); + + dev_set_drvdata(dev, d); + + rtc = devm_rtc_allocate_device(dev); + if (IS_ERR(rtc)) + return PTR_ERR(rtc); + + /* We can represent further than that, but it depends on the stored + * bias and we can’t modify it persistently on all supported consoles, + * so here we pretend to be limited to 2106. + */ + rtc->range_min = 0; + rtc->range_max = U32_MAX; + rtc->ops = &gamecube_rtc_ops; + + devm_rtc_register_device(rtc); + + return 0; +} + +static const struct of_device_id gamecube_rtc_of_match[] = { + {.compatible = "nintendo,latte-exi" }, + {.compatible = "nintendo,hollywood-exi" }, + {.compatible = "nintendo,flipper-exi" }, + { } +}; +MODULE_DEVICE_TABLE(of, gamecube_rtc_of_match); + +static struct platform_driver gamecube_rtc_driver = { + .probe = gamecube_rtc_probe, + .driver = { + .name = "rtc-gamecube", + .of_match_table = gamecube_rtc_of_match, + }, +}; +module_platform_driver(gamecube_rtc_driver); + +MODULE_AUTHOR("Emmanuel Gil Peyrot "); +MODULE_DESCRIPTION("Nintendo GameCube, Wii and Wii U RTC driver"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-58-ga151 From 322539a014bcd24cbb9281832c09b24e07912237 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Wed, 15 Dec 2021 18:54:58 +0100 Subject: rtc: gamecube: Report low battery as invalid data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I haven’t been able to test this patch as all of my consoles have a working RTC battery, but according to the documentation it should work like that. Signed-off-by: Emmanuel Gil Peyrot Acked-by: Michael Ellerman (powerpc) Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211215175501.6761-3-linkmauve@linkmauve.fr --- drivers/rtc/rtc-gamecube.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/rtc/rtc-gamecube.c b/drivers/rtc/rtc-gamecube.c index e8260c82c07d..98128746171e 100644 --- a/drivers/rtc/rtc-gamecube.c +++ b/drivers/rtc/rtc-gamecube.c @@ -83,6 +83,10 @@ #define RTC_CONTROL0 0x21000c #define RTC_CONTROL1 0x21000d +/* RTC flags */ +#define RTC_CONTROL0_UNSTABLE_POWER 0x00000800 +#define RTC_CONTROL0_LOW_BATTERY 0x00000200 + struct priv { struct regmap *regmap; void __iomem *iob; @@ -182,9 +186,35 @@ static int gamecube_rtc_set_time(struct device *dev, struct rtc_time *t) return regmap_write(d->regmap, RTC_COUNTER, timestamp - d->rtc_bias); } +static int gamecube_rtc_ioctl(struct device *dev, unsigned int cmd, unsigned long arg) +{ + struct priv *d = dev_get_drvdata(dev); + int value; + int control0; + int ret; + + switch (cmd) { + case RTC_VL_READ: + ret = regmap_read(d->regmap, RTC_CONTROL0, &control0); + if (ret) + return ret; + + value = 0; + if (control0 & RTC_CONTROL0_UNSTABLE_POWER) + value |= RTC_VL_DATA_INVALID; + if (control0 & RTC_CONTROL0_LOW_BATTERY) + value |= RTC_VL_BACKUP_LOW; + return put_user(value, (unsigned int __user *)arg); + + default: + return -ENOIOCTLCMD; + } +} + static const struct rtc_class_ops gamecube_rtc_ops = { .read_time = gamecube_rtc_read_time, .set_time = gamecube_rtc_set_time, + .ioctl = gamecube_rtc_ioctl, }; static int gamecube_rtc_read_offset_from_sram(struct priv *d) -- cgit v1.2.3-58-ga151 From 5479618e1e2641dd57352a73b7b7b2f6908fbeee Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Wed, 15 Dec 2021 18:54:59 +0100 Subject: powerpc: wii.dts: Expose HW_SRNPROT on this platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This Hollywood register isn’t properly understood, but can allow or reject access to the SRAM, which we need to set for RTC usage if it isn’t previously set correctly beforehand. See https://wiibrew.org/wiki/Hardware/Hollywood_Registers#HW_SRNPROT Signed-off-by: Emmanuel Gil Peyrot Acked-by: Michael Ellerman (powerpc) Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211215175501.6761-4-linkmauve@linkmauve.fr --- arch/powerpc/boot/dts/wii.dts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index e9c945b123c6..e46143c32308 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -168,6 +168,11 @@ interrupts = <14>; }; + srnprot@d800060 { + compatible = "nintendo,hollywood-srnprot"; + reg = <0x0d800060 0x4>; + }; + GPIO: gpio@d8000c0 { #gpio-cells = <2>; compatible = "nintendo,hollywood-gpio"; -- cgit v1.2.3-58-ga151 From 57bd7d356506b713d0df8d8e42da7810a18864df Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Wed, 15 Dec 2021 18:55:00 +0100 Subject: powerpc: gamecube_defconfig: Enable the RTC driver This selects the rtc-gamecube driver, which provides a real-time clock on this platform. Signed-off-by: Emmanuel Gil Peyrot Acked-by: Michael Ellerman (powerpc) Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211215175501.6761-5-linkmauve@linkmauve.fr --- arch/powerpc/configs/gamecube_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/configs/gamecube_defconfig b/arch/powerpc/configs/gamecube_defconfig index 24c0e0ea5aeb..91a1b99f4e8f 100644 --- a/arch/powerpc/configs/gamecube_defconfig +++ b/arch/powerpc/configs/gamecube_defconfig @@ -68,7 +68,7 @@ CONFIG_SND_SEQUENCER=y CONFIG_SND_SEQUENCER_OSS=y # CONFIG_USB_SUPPORT is not set CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_GENERIC=y +CONFIG_RTC_DRV_GAMECUBE=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_ISO9660_FS=y -- cgit v1.2.3-58-ga151 From c636783d594f6cfc95db51c796761719317ce5eb Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Wed, 15 Dec 2021 18:55:01 +0100 Subject: powerpc: wii_defconfig: Enable the RTC driver This selects the rtc-gamecube driver, which provides a real-time clock on this platform. Signed-off-by: Emmanuel Gil Peyrot Acked-by: Michael Ellerman (powerpc) Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211215175501.6761-6-linkmauve@linkmauve.fr --- arch/powerpc/configs/wii_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index a0c45bf2bfb1..0ab78c51455d 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -98,7 +98,7 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_LEDS_TRIGGER_PANIC=y CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_GENERIC=y +CONFIG_RTC_DRV_GAMECUBE=y CONFIG_NVMEM_NINTENDO_OTP=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y -- cgit v1.2.3-58-ga151 From 454f47ff464325223129b9b5b8d0b61946ec704d Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:23 +0100 Subject: rtc: cmos: take rtc_lock while reading from CMOS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reading from the CMOS involves writing to the index register and then reading from the data register. Therefore access to the CMOS has to be serialized with rtc_lock. This invocation of CMOS_READ was not serialized, which could cause trouble when other code is accessing CMOS at the same time. Use spin_lock_irq() like the rest of the function. Nothing in kernel modifies the RTC_DM_BINARY bit, so there could be a separate pair of spin_lock_irq() / spin_unlock_irq() before doing the math. Signed-off-by: Mateusz Jończyk Reviewed-by: Nobuhiro Iwamatsu Cc: Alessandro Zummo Cc: Alexandre Belloni Cc: stable@vger.kernel.org Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-2-mat.jonczyk@o2.pl --- drivers/rtc/rtc-cmos.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 4eb53412b808..dc3f8b0dde98 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -457,7 +457,10 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) min = t->time.tm_min; sec = t->time.tm_sec; + spin_lock_irq(&rtc_lock); rtc_control = CMOS_READ(RTC_CONTROL); + spin_unlock_irq(&rtc_lock); + if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { /* Writing 0xff means "don't care" or "match all". */ mon = (mon <= 12) ? bin2bcd(mon) : 0xff; -- cgit v1.2.3-58-ga151 From d35786b3a28dee20b12962ae2dd365892a99ed1a Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:24 +0100 Subject: rtc: mc146818-lib: change return values of mc146818_get_time() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No function is checking mc146818_get_time() return values yet, so correct them to make them more customary. Signed-off-by: Mateusz Jończyk Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-3-mat.jonczyk@o2.pl --- drivers/rtc/rtc-mc146818-lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index dcfaf09946ee..c186c8c4982b 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -25,7 +25,7 @@ again: if (WARN_ON_ONCE((CMOS_READ(RTC_VALID) & 0x40) != 0)) { spin_unlock_irqrestore(&rtc_lock, flags); memset(time, 0xff, sizeof(*time)); - return 0; + return -EIO; } /* @@ -116,7 +116,7 @@ again: time->tm_mon--; - return RTC_24H; + return 0; } EXPORT_SYMBOL_GPL(mc146818_get_time); -- cgit v1.2.3-58-ga151 From 0dd8d6cb9eddfe637bcd821bbfd40ebd5a0737b9 Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:25 +0100 Subject: rtc: Check return value from mc146818_get_time() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are 4 users of mc146818_get_time() and none of them was checking the return value from this function. Change this. Print the appropriate warnings in callers of mc146818_get_time() instead of in the function mc146818_get_time() itself, in order not to add strings to rtc-mc146818-lib.c, which is kind of a library. The callers of alpha_rtc_read_time() and cmos_read_time() may use the contents of (struct rtc_time *) even when the functions return a failure code. Therefore, set the contents of (struct rtc_time *) to 0x00, which looks more sensible then 0xff and aligns with the (possibly stale?) comment in cmos_read_time: /* * If pm_trace abused the RTC for storage, set the timespec to 0, * which tells the caller that this RTC value is unusable. */ For consistency, do this in mc146818_get_time(). Note: hpet_rtc_interrupt() may call mc146818_get_time() many times a second. It is very unlikely, though, that the RTC suddenly stops working and mc146818_get_time() would consistently fail. Only compile-tested on alpha. Signed-off-by: Mateusz Jończyk Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Alessandro Zummo Cc: Alexandre Belloni Cc: linux-alpha@vger.kernel.org Cc: x86@kernel.org Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-4-mat.jonczyk@o2.pl --- arch/alpha/kernel/rtc.c | 7 ++++++- arch/x86/kernel/hpet.c | 8 ++++++-- drivers/base/power/trace.c | 6 +++++- drivers/rtc/rtc-cmos.c | 9 ++++++++- drivers/rtc/rtc-mc146818-lib.c | 2 +- 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c index ce3077946e1d..fb3025396ac9 100644 --- a/arch/alpha/kernel/rtc.c +++ b/arch/alpha/kernel/rtc.c @@ -80,7 +80,12 @@ init_rtc_epoch(void) static int alpha_rtc_read_time(struct device *dev, struct rtc_time *tm) { - mc146818_get_time(tm); + int ret = mc146818_get_time(tm); + + if (ret < 0) { + dev_err_ratelimited(dev, "unable to read current time\n"); + return ret; + } /* Adjust for non-default epochs. It's easier to depend on the generic __get_rtc_time and adjust the epoch here than create diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 882213df3713..71f336425e58 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1435,8 +1435,12 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) hpet_rtc_timer_reinit(); memset(&curr_time, 0, sizeof(struct rtc_time)); - if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - mc146818_get_time(&curr_time); + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) { + if (unlikely(mc146818_get_time(&curr_time) < 0)) { + pr_err_ratelimited("unable to read current time from RTC\n"); + return IRQ_HANDLED; + } + } if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 94665037f4a3..72b7a92337b1 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -120,7 +120,11 @@ static unsigned int read_magic_time(void) struct rtc_time time; unsigned int val; - mc146818_get_time(&time); + if (mc146818_get_time(&time) < 0) { + pr_err("Unable to read current time from RTC\n"); + return 0; + } + pr_info("RTC time: %ptRt, date: %ptRd\n", &time, &time); val = time.tm_year; /* 100 years */ if (val > 100) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index dc3f8b0dde98..d0f58cca5c20 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -222,6 +222,8 @@ static inline void cmos_write_bank2(unsigned char val, unsigned char addr) static int cmos_read_time(struct device *dev, struct rtc_time *t) { + int ret; + /* * If pm_trace abused the RTC for storage, set the timespec to 0, * which tells the caller that this RTC value is unusable. @@ -229,7 +231,12 @@ static int cmos_read_time(struct device *dev, struct rtc_time *t) if (!pm_trace_rtc_valid()) return -EIO; - mc146818_get_time(t); + ret = mc146818_get_time(t); + if (ret < 0) { + dev_err_ratelimited(dev, "unable to read current time\n"); + return ret; + } + return 0; } diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index c186c8c4982b..ccd974b8a75a 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -24,7 +24,7 @@ again: /* Ensure that the RTC is accessible. Bit 6 must be 0! */ if (WARN_ON_ONCE((CMOS_READ(RTC_VALID) & 0x40) != 0)) { spin_unlock_irqrestore(&rtc_lock, flags); - memset(time, 0xff, sizeof(*time)); + memset(time, 0, sizeof(*time)); return -EIO; } -- cgit v1.2.3-58-ga151 From ea6fa4961aab8f90a8aa03575a98b4bda368d4b6 Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:26 +0100 Subject: rtc: mc146818-lib: fix RTC presence check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prevent an infinite loop in mc146818_get_time(), commit 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs") added a check for RTC availability. Together with a later fix, it checked if bit 6 in register 0x0d is cleared. This, however, caused a false negative on a motherboard with an AMD SB710 southbridge; according to the specification [1], bit 6 of register 0x0d of this chipset is a scratchbit. This caused a regression in Linux 5.11 - the RTC was determined broken by the kernel and not used by rtc-cmos.c [3]. This problem was also reported in Fedora [4]. As a better alternative, check whether the UIP ("Update-in-progress") bit is set for longer then 10ms. If that is the case, then apparently the RTC is either absent (and all register reads return 0xff) or broken. Also limit the number of loop iterations in mc146818_get_time() to 10 to prevent an infinite loop there. The functions mc146818_get_time() and mc146818_does_rtc_work() will be refactored later in this patch series, in order to fix a separate problem with reading / setting the RTC alarm time. This is done so to avoid a confusion about what is being fixed when. In a previous approach to this problem, I implemented a check whether the RTC_HOURS register contains a value <= 24. This, however, sometimes did not work correctly on my Intel Kaby Lake laptop. According to Intel's documentation [2], "the time and date RAM locations (0-9) are disconnected from the external bus" during the update cycle so reading this register without checking the UIP bit is incorrect. [1] AMD SB700/710/750 Register Reference Guide, page 308, https://developer.amd.com/wordpress/media/2012/10/43009_sb7xx_rrg_pub_1.00.pdf [2] 7th Generation Intel ® Processor Family I/O for U/Y Platforms [...] Datasheet Volume 1 of 2, page 209 Intel's Document Number: 334658-006, https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/7th-and-8th-gen-core-family-mobile-u-y-processor-lines-i-o-datasheet-vol-1.pdf [3] Functions in arch/x86/kernel/rtc.c apparently were using it. [4] https://bugzilla.redhat.com/show_bug.cgi?id=1936688 Fixes: 211e5db19d15 ("rtc: mc146818: Detect and handle broken RTCs") Fixes: ebb22a059436 ("rtc: mc146818: Dont test for bit 0-5 in Register D") Signed-off-by: Mateusz Jończyk Cc: Thomas Gleixner Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-5-mat.jonczyk@o2.pl --- drivers/rtc/rtc-cmos.c | 10 ++++------ drivers/rtc/rtc-mc146818-lib.c | 34 ++++++++++++++++++++++++++++++---- include/linux/mc146818rtc.h | 1 + 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index d0f58cca5c20..b90a603d6b12 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -800,16 +800,14 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) rename_region(ports, dev_name(&cmos_rtc.rtc->dev)); - spin_lock_irq(&rtc_lock); - - /* Ensure that the RTC is accessible. Bit 6 must be 0! */ - if ((CMOS_READ(RTC_VALID) & 0x40) != 0) { - spin_unlock_irq(&rtc_lock); - dev_warn(dev, "not accessible\n"); + if (!mc146818_does_rtc_work()) { + dev_warn(dev, "broken or not accessible\n"); retval = -ENXIO; goto cleanup1; } + spin_lock_irq(&rtc_lock); + if (!(flags & CMOS_RTC_FLAGS_NOFREQ)) { /* force periodic irq to CMOS reset default of 1024Hz; * diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index ccd974b8a75a..d8e67a01220e 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -8,10 +8,36 @@ #include #endif +/* + * If the UIP (Update-in-progress) bit of the RTC is set for more then + * 10ms, the RTC is apparently broken or not present. + */ +bool mc146818_does_rtc_work(void) +{ + int i; + unsigned char val; + unsigned long flags; + + for (i = 0; i < 10; i++) { + spin_lock_irqsave(&rtc_lock, flags); + val = CMOS_READ(RTC_FREQ_SELECT); + spin_unlock_irqrestore(&rtc_lock, flags); + + if ((val & RTC_UIP) == 0) + return true; + + mdelay(1); + } + + return false; +} +EXPORT_SYMBOL_GPL(mc146818_does_rtc_work); + unsigned int mc146818_get_time(struct rtc_time *time) { unsigned char ctrl; unsigned long flags; + unsigned int iter_count = 0; unsigned char century = 0; bool retry; @@ -20,13 +46,13 @@ unsigned int mc146818_get_time(struct rtc_time *time) #endif again: - spin_lock_irqsave(&rtc_lock, flags); - /* Ensure that the RTC is accessible. Bit 6 must be 0! */ - if (WARN_ON_ONCE((CMOS_READ(RTC_VALID) & 0x40) != 0)) { - spin_unlock_irqrestore(&rtc_lock, flags); + if (iter_count > 10) { memset(time, 0, sizeof(*time)); return -EIO; } + iter_count++; + + spin_lock_irqsave(&rtc_lock, flags); /* * Check whether there is an update in progress during which the diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 0661af17a758..69c80c4325bf 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -123,6 +123,7 @@ struct cmos_rtc_board_info { #define RTC_IO_EXTENT_USED RTC_IO_EXTENT #endif /* ARCH_RTC_LOCATION */ +bool mc146818_does_rtc_work(void); unsigned int mc146818_get_time(struct rtc_time *time); int mc146818_set_time(struct rtc_time *time); -- cgit v1.2.3-58-ga151 From ec5895c0f2d87b9bf4185db1915e40fa6fcfc0ac Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:27 +0100 Subject: rtc: mc146818-lib: extract mc146818_avoid_UIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function mc146818_get_time() contains an elaborate mechanism of reading the RTC time while no RTC update is in progress. It turns out that reading the RTC alarm clock also requires avoiding the RTC update. Therefore, the mechanism in mc146818_get_time() should be reused - so extract it into a separate function. The logic in mc146818_avoid_UIP() is same as in mc146818_get_time() except that after every if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) { there is now "mdelay(1)". To avoid producing a very unreadable patch, mc146818_get_time() will be refactored to use mc146818_avoid_UIP() in the next patch. Signed-off-by: Mateusz Jończyk Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-6-mat.jonczyk@o2.pl --- drivers/rtc/rtc-mc146818-lib.c | 70 ++++++++++++++++++++++++++++++++++++++++++ include/linux/mc146818rtc.h | 3 ++ 2 files changed, 73 insertions(+) diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index d8e67a01220e..b20f4ebb2f3a 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -8,6 +8,76 @@ #include #endif +/* + * Execute a function while the UIP (Update-in-progress) bit of the RTC is + * unset. + * + * Warning: callback may be executed more then once. + */ +bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), + void *param) +{ + int i; + unsigned long flags; + unsigned char seconds; + + for (i = 0; i < 10; i++) { + spin_lock_irqsave(&rtc_lock, flags); + + /* + * Check whether there is an update in progress during which the + * readout is unspecified. The maximum update time is ~2ms. Poll + * every msec for completion. + * + * Store the second value before checking UIP so a long lasting + * NMI which happens to hit after the UIP check cannot make + * an update cycle invisible. + */ + seconds = CMOS_READ(RTC_SECONDS); + + if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) { + spin_unlock_irqrestore(&rtc_lock, flags); + mdelay(1); + continue; + } + + /* Revalidate the above readout */ + if (seconds != CMOS_READ(RTC_SECONDS)) { + spin_unlock_irqrestore(&rtc_lock, flags); + continue; + } + + if (callback) + callback(seconds, param); + + /* + * Check for the UIP bit again. If it is set now then + * the above values may contain garbage. + */ + if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) { + spin_unlock_irqrestore(&rtc_lock, flags); + mdelay(1); + continue; + } + + /* + * A NMI might have interrupted the above sequence so check + * whether the seconds value has changed which indicates that + * the NMI took longer than the UIP bit was set. Unlikely, but + * possible and there is also virt... + */ + if (seconds != CMOS_READ(RTC_SECONDS)) { + spin_unlock_irqrestore(&rtc_lock, flags); + continue; + } + spin_unlock_irqrestore(&rtc_lock, flags); + + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(mc146818_avoid_UIP); + /* * If the UIP (Update-in-progress) bit of the RTC is set for more then * 10ms, the RTC is apparently broken or not present. diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 69c80c4325bf..67fb0a12becc 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -127,4 +127,7 @@ bool mc146818_does_rtc_work(void); unsigned int mc146818_get_time(struct rtc_time *time); int mc146818_set_time(struct rtc_time *time); +bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), + void *param); + #endif /* _MC146818RTC_H */ -- cgit v1.2.3-58-ga151 From 2a61b0ac5493363149f68a2fb80287f314626987 Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:28 +0100 Subject: rtc: mc146818-lib: refactor mc146818_get_time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor mc146818_get_time() so that it uses mc146818_avoid_UIP(). Signed-off-by: Mateusz Jończyk Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-7-mat.jonczyk@o2.pl --- drivers/rtc/rtc-mc146818-lib.c | 109 ++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 67 deletions(-) diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index b20f4ebb2f3a..15604b7f164d 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -103,49 +103,20 @@ bool mc146818_does_rtc_work(void) } EXPORT_SYMBOL_GPL(mc146818_does_rtc_work); -unsigned int mc146818_get_time(struct rtc_time *time) -{ +struct mc146818_get_time_callback_param { + struct rtc_time *time; unsigned char ctrl; - unsigned long flags; - unsigned int iter_count = 0; - unsigned char century = 0; - bool retry; - +#ifdef CONFIG_ACPI + unsigned char century; +#endif #ifdef CONFIG_MACH_DECSTATION unsigned int real_year; #endif +}; -again: - if (iter_count > 10) { - memset(time, 0, sizeof(*time)); - return -EIO; - } - iter_count++; - - spin_lock_irqsave(&rtc_lock, flags); - - /* - * Check whether there is an update in progress during which the - * readout is unspecified. The maximum update time is ~2ms. Poll - * every msec for completion. - * - * Store the second value before checking UIP so a long lasting NMI - * which happens to hit after the UIP check cannot make an update - * cycle invisible. - */ - time->tm_sec = CMOS_READ(RTC_SECONDS); - - if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) { - spin_unlock_irqrestore(&rtc_lock, flags); - mdelay(1); - goto again; - } - - /* Revalidate the above readout */ - if (time->tm_sec != CMOS_READ(RTC_SECONDS)) { - spin_unlock_irqrestore(&rtc_lock, flags); - goto again; - } +static void mc146818_get_time_callback(unsigned char seconds, void *param_in) +{ + struct mc146818_get_time_callback_param *p = param_in; /* * Only the values that we read from the RTC are set. We leave @@ -153,39 +124,39 @@ again: * RTC has RTC_DAY_OF_WEEK, we ignore it, as it is only updated * by the RTC when initially set to a non-zero value. */ - time->tm_min = CMOS_READ(RTC_MINUTES); - time->tm_hour = CMOS_READ(RTC_HOURS); - time->tm_mday = CMOS_READ(RTC_DAY_OF_MONTH); - time->tm_mon = CMOS_READ(RTC_MONTH); - time->tm_year = CMOS_READ(RTC_YEAR); + p->time->tm_sec = seconds; + p->time->tm_min = CMOS_READ(RTC_MINUTES); + p->time->tm_hour = CMOS_READ(RTC_HOURS); + p->time->tm_mday = CMOS_READ(RTC_DAY_OF_MONTH); + p->time->tm_mon = CMOS_READ(RTC_MONTH); + p->time->tm_year = CMOS_READ(RTC_YEAR); #ifdef CONFIG_MACH_DECSTATION - real_year = CMOS_READ(RTC_DEC_YEAR); + p->real_year = CMOS_READ(RTC_DEC_YEAR); #endif #ifdef CONFIG_ACPI if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - acpi_gbl_FADT.century) - century = CMOS_READ(acpi_gbl_FADT.century); + acpi_gbl_FADT.century) { + p->century = CMOS_READ(acpi_gbl_FADT.century); + } else { + p->century = 0; + } #endif - ctrl = CMOS_READ(RTC_CONTROL); - /* - * Check for the UIP bit again. If it is set now then - * the above values may contain garbage. - */ - retry = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; - /* - * A NMI might have interrupted the above sequence so check whether - * the seconds value has changed which indicates that the NMI took - * longer than the UIP bit was set. Unlikely, but possible and - * there is also virt... - */ - retry |= time->tm_sec != CMOS_READ(RTC_SECONDS); - spin_unlock_irqrestore(&rtc_lock, flags); + p->ctrl = CMOS_READ(RTC_CONTROL); +} - if (retry) - goto again; +unsigned int mc146818_get_time(struct rtc_time *time) +{ + struct mc146818_get_time_callback_param p = { + .time = time + }; + + if (!mc146818_avoid_UIP(mc146818_get_time_callback, &p)) { + memset(time, 0, sizeof(*time)); + return -EIO; + } - if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + if (!(p.ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { time->tm_sec = bcd2bin(time->tm_sec); time->tm_min = bcd2bin(time->tm_min); @@ -193,15 +164,19 @@ again: time->tm_mday = bcd2bin(time->tm_mday); time->tm_mon = bcd2bin(time->tm_mon); time->tm_year = bcd2bin(time->tm_year); - century = bcd2bin(century); +#ifdef CONFIG_ACPI + p.century = bcd2bin(p.century); +#endif } #ifdef CONFIG_MACH_DECSTATION - time->tm_year += real_year - 72; + time->tm_year += p.real_year - 72; #endif - if (century > 20) - time->tm_year += (century - 19) * 100; +#ifdef CONFIG_ACPI + if (p.century > 20) + time->tm_year += (p.century - 19) * 100; +#endif /* * Account for differences between how the RTC uses the values -- cgit v1.2.3-58-ga151 From 2c7d47a45b06be3d0a7aa21c3dea2215685a928f Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:29 +0100 Subject: rtc: mc146818-lib: refactor mc146818_does_rtc_work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor mc146818_does_rtc_work() so that it uses mc146818_avoid_UIP(). It is enough to call mc146818_avoid_UIP() with no callback. Signed-off-by: Mateusz Jończyk Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-8-mat.jonczyk@o2.pl --- drivers/rtc/rtc-mc146818-lib.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index 15604b7f164d..f62e658cbe23 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -84,22 +84,7 @@ EXPORT_SYMBOL_GPL(mc146818_avoid_UIP); */ bool mc146818_does_rtc_work(void) { - int i; - unsigned char val; - unsigned long flags; - - for (i = 0; i < 10; i++) { - spin_lock_irqsave(&rtc_lock, flags); - val = CMOS_READ(RTC_FREQ_SELECT); - spin_unlock_irqrestore(&rtc_lock, flags); - - if ((val & RTC_UIP) == 0) - return true; - - mdelay(1); - } - - return false; + return mc146818_avoid_UIP(NULL, NULL); } EXPORT_SYMBOL_GPL(mc146818_does_rtc_work); -- cgit v1.2.3-58-ga151 From cdedc45c579faf8cc6608d3ef81576ee0d512aa4 Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:30 +0100 Subject: rtc: cmos: avoid UIP when reading alarm time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Intel chipsets disconnect the time and date RTC registers when the clock update is in progress: during this time reads may return bogus values and writes fail silently. This includes the RTC alarm registers. [1] cmos_read_alarm() did not take account for that, which caused alarm time reads to sometimes return bogus values. This can be shown with a test patch that I am attaching to this patch series. Fix this, by using mc146818_avoid_UIP(). [1] 7th Generation Intel ® Processor Family I/O for U/Y Platforms [...] Datasheet, Volume 1 of 2 (Intel's Document Number: 334658-006) Page 208 https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/7th-and-8th-gen-core-family-mobile-u-y-processor-lines-i-o-datasheet-vol-1.pdf "If a RAM read from the ten time and date bytes is attempted during an update cycle, the value read do not necessarily represent the true contents of those locations. Any RAM writes under the same conditions are ignored." Signed-off-by: Mateusz Jończyk Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-9-mat.jonczyk@o2.pl --- drivers/rtc/rtc-cmos.c | 72 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index b90a603d6b12..6f47d68d2c86 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -249,10 +249,46 @@ static int cmos_set_time(struct device *dev, struct rtc_time *t) return mc146818_set_time(t); } +struct cmos_read_alarm_callback_param { + struct cmos_rtc *cmos; + struct rtc_time *time; + unsigned char rtc_control; +}; + +static void cmos_read_alarm_callback(unsigned char __always_unused seconds, + void *param_in) +{ + struct cmos_read_alarm_callback_param *p = + (struct cmos_read_alarm_callback_param *)param_in; + struct rtc_time *time = p->time; + + time->tm_sec = CMOS_READ(RTC_SECONDS_ALARM); + time->tm_min = CMOS_READ(RTC_MINUTES_ALARM); + time->tm_hour = CMOS_READ(RTC_HOURS_ALARM); + + if (p->cmos->day_alrm) { + /* ignore upper bits on readback per ACPI spec */ + time->tm_mday = CMOS_READ(p->cmos->day_alrm) & 0x3f; + if (!time->tm_mday) + time->tm_mday = -1; + + if (p->cmos->mon_alrm) { + time->tm_mon = CMOS_READ(p->cmos->mon_alrm); + if (!time->tm_mon) + time->tm_mon = -1; + } + } + + p->rtc_control = CMOS_READ(RTC_CONTROL); +} + static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) { struct cmos_rtc *cmos = dev_get_drvdata(dev); - unsigned char rtc_control; + struct cmos_read_alarm_callback_param p = { + .cmos = cmos, + .time = &t->time, + }; /* This not only a rtc_op, but also called directly */ if (!is_valid_irq(cmos->irq)) @@ -263,28 +299,18 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) * the future. */ - spin_lock_irq(&rtc_lock); - t->time.tm_sec = CMOS_READ(RTC_SECONDS_ALARM); - t->time.tm_min = CMOS_READ(RTC_MINUTES_ALARM); - t->time.tm_hour = CMOS_READ(RTC_HOURS_ALARM); - - if (cmos->day_alrm) { - /* ignore upper bits on readback per ACPI spec */ - t->time.tm_mday = CMOS_READ(cmos->day_alrm) & 0x3f; - if (!t->time.tm_mday) - t->time.tm_mday = -1; - - if (cmos->mon_alrm) { - t->time.tm_mon = CMOS_READ(cmos->mon_alrm); - if (!t->time.tm_mon) - t->time.tm_mon = -1; - } - } - - rtc_control = CMOS_READ(RTC_CONTROL); - spin_unlock_irq(&rtc_lock); + /* Some Intel chipsets disconnect the alarm registers when the clock + * update is in progress - during this time reads return bogus values + * and writes may fail silently. See for example "7th Generation Intel® + * Processor Family I/O for U/Y Platforms [...] Datasheet", section + * 27.7.1 + * + * Use the mc146818_avoid_UIP() function to avoid this. + */ + if (!mc146818_avoid_UIP(cmos_read_alarm_callback, &p)) + return -EIO; - if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { + if (!(p.rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { if (((unsigned)t->time.tm_sec) < 0x60) t->time.tm_sec = bcd2bin(t->time.tm_sec); else @@ -313,7 +339,7 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t) } } - t->enabled = !!(rtc_control & RTC_AIE); + t->enabled = !!(p.rtc_control & RTC_AIE); t->pending = 0; return 0; -- cgit v1.2.3-58-ga151 From cd17420ebea580c22dd3a93f7237de3d2cfafc37 Mon Sep 17 00:00:00 2001 From: Mateusz Jończyk Date: Fri, 10 Dec 2021 21:01:31 +0100 Subject: rtc: cmos: avoid UIP when writing alarm time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Intel chipsets disconnect the time and date RTC registers when the clock update is in progress: during this time reads may return bogus values and writes fail silently. This includes the RTC alarm registers. [1] cmos_set_alarm() did not take account for that, fix it. [1] 7th Generation Intel ® Processor Family I/O for U/Y Platforms [...] Datasheet, Volume 1 of 2 (Intel's Document Number: 334658-006) Page 208 https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/7th-and-8th-gen-core-family-mobile-u-y-processor-lines-i-o-datasheet-vol-1.pdf "If a RAM read from the ten time and date bytes is attempted during an update cycle, the value read do not necessarily represent the true contents of those locations. Any RAM writes under the same conditions are ignored." Signed-off-by: Mateusz Jończyk Cc: Alessandro Zummo Cc: Alexandre Belloni Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211210200131.153887-10-mat.jonczyk@o2.pl --- drivers/rtc/rtc-cmos.c | 107 ++++++++++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 41 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 6f47d68d2c86..7c006c2b125f 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -470,10 +470,57 @@ static int cmos_validate_alarm(struct device *dev, struct rtc_wkalrm *t) return 0; } +struct cmos_set_alarm_callback_param { + struct cmos_rtc *cmos; + unsigned char mon, mday, hrs, min, sec; + struct rtc_wkalrm *t; +}; + +/* Note: this function may be executed by mc146818_avoid_UIP() more then + * once + */ +static void cmos_set_alarm_callback(unsigned char __always_unused seconds, + void *param_in) +{ + struct cmos_set_alarm_callback_param *p = + (struct cmos_set_alarm_callback_param *)param_in; + + /* next rtc irq must not be from previous alarm setting */ + cmos_irq_disable(p->cmos, RTC_AIE); + + /* update alarm */ + CMOS_WRITE(p->hrs, RTC_HOURS_ALARM); + CMOS_WRITE(p->min, RTC_MINUTES_ALARM); + CMOS_WRITE(p->sec, RTC_SECONDS_ALARM); + + /* the system may support an "enhanced" alarm */ + if (p->cmos->day_alrm) { + CMOS_WRITE(p->mday, p->cmos->day_alrm); + if (p->cmos->mon_alrm) + CMOS_WRITE(p->mon, p->cmos->mon_alrm); + } + + if (use_hpet_alarm()) { + /* + * FIXME the HPET alarm glue currently ignores day_alrm + * and mon_alrm ... + */ + hpet_set_alarm_time(p->t->time.tm_hour, p->t->time.tm_min, + p->t->time.tm_sec); + } + + if (p->t->enabled) + cmos_irq_enable(p->cmos, RTC_AIE); +} + static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) { struct cmos_rtc *cmos = dev_get_drvdata(dev); - unsigned char mon, mday, hrs, min, sec, rtc_control; + struct cmos_set_alarm_callback_param p = { + .cmos = cmos, + .t = t + }; + unsigned char rtc_control; int ret; /* This not only a rtc_op, but also called directly */ @@ -484,11 +531,11 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) if (ret < 0) return ret; - mon = t->time.tm_mon + 1; - mday = t->time.tm_mday; - hrs = t->time.tm_hour; - min = t->time.tm_min; - sec = t->time.tm_sec; + p.mon = t->time.tm_mon + 1; + p.mday = t->time.tm_mday; + p.hrs = t->time.tm_hour; + p.min = t->time.tm_min; + p.sec = t->time.tm_sec; spin_lock_irq(&rtc_lock); rtc_control = CMOS_READ(RTC_CONTROL); @@ -496,43 +543,21 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t) if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { /* Writing 0xff means "don't care" or "match all". */ - mon = (mon <= 12) ? bin2bcd(mon) : 0xff; - mday = (mday >= 1 && mday <= 31) ? bin2bcd(mday) : 0xff; - hrs = (hrs < 24) ? bin2bcd(hrs) : 0xff; - min = (min < 60) ? bin2bcd(min) : 0xff; - sec = (sec < 60) ? bin2bcd(sec) : 0xff; - } - - spin_lock_irq(&rtc_lock); - - /* next rtc irq must not be from previous alarm setting */ - cmos_irq_disable(cmos, RTC_AIE); - - /* update alarm */ - CMOS_WRITE(hrs, RTC_HOURS_ALARM); - CMOS_WRITE(min, RTC_MINUTES_ALARM); - CMOS_WRITE(sec, RTC_SECONDS_ALARM); - - /* the system may support an "enhanced" alarm */ - if (cmos->day_alrm) { - CMOS_WRITE(mday, cmos->day_alrm); - if (cmos->mon_alrm) - CMOS_WRITE(mon, cmos->mon_alrm); - } - - if (use_hpet_alarm()) { - /* - * FIXME the HPET alarm glue currently ignores day_alrm - * and mon_alrm ... - */ - hpet_set_alarm_time(t->time.tm_hour, t->time.tm_min, - t->time.tm_sec); + p.mon = (p.mon <= 12) ? bin2bcd(p.mon) : 0xff; + p.mday = (p.mday >= 1 && p.mday <= 31) ? bin2bcd(p.mday) : 0xff; + p.hrs = (p.hrs < 24) ? bin2bcd(p.hrs) : 0xff; + p.min = (p.min < 60) ? bin2bcd(p.min) : 0xff; + p.sec = (p.sec < 60) ? bin2bcd(p.sec) : 0xff; } - if (t->enabled) - cmos_irq_enable(cmos, RTC_AIE); - - spin_unlock_irq(&rtc_lock); + /* + * Some Intel chipsets disconnect the alarm registers when the clock + * update is in progress - during this time writes fail silently. + * + * Use mc146818_avoid_UIP() to avoid this. + */ + if (!mc146818_avoid_UIP(cmos_set_alarm_callback, &p)) + return -EIO; cmos->alarm_expires = rtc_tm_to_time64(&t->time); -- cgit v1.2.3-58-ga151 From 21ab799585762e097387da82a4e0fd6c2ffb4000 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 1 Dec 2021 16:45:41 -0700 Subject: vfio/pci: Resolve sparse endian warnings in IGD support Sparse warns: sparse warnings: (new ones prefixed by >>) >> drivers/vfio/pci/vfio_pci_igd.c:146:21: sparse: sparse: incorrect type in assignment (different base types) @@ expected unsigned short [addressable] [usertype] val @@ got restricted __le16 [usertype] @@ drivers/vfio/pci/vfio_pci_igd.c:146:21: sparse: expected unsigned short [addressable] [usertype] val drivers/vfio/pci/vfio_pci_igd.c:146:21: sparse: got restricted __le16 [usertype] >> drivers/vfio/pci/vfio_pci_igd.c:161:21: sparse: sparse: incorrect type in assignment (different base types) @@ expected unsigned int [addressable] [usertype] val @@ got restricted __le32 [usertype] @@ drivers/vfio/pci/vfio_pci_igd.c:161:21: sparse: expected unsigned int [addressable] [usertype] val drivers/vfio/pci/vfio_pci_igd.c:161:21: sparse: got restricted __le32 [usertype] drivers/vfio/pci/vfio_pci_igd.c:176:21: sparse: sparse: incorrect type in assignment (different base types) @@ expected unsigned short [addressable] [usertype] val @@ got restricted __le16 [usertype] @@ drivers/vfio/pci/vfio_pci_igd.c:176:21: sparse: expected unsigned short [addressable] [usertype] val drivers/vfio/pci/vfio_pci_igd.c:176:21: sparse: got restricted __le16 [usertype] These are due to trying to use an unsigned to store the result of a cpu_to_leXX() conversion. These are small variables, so pointer tricks are wasteful and casting just generates different sparse warnings. Store to and copy results from a separate little endian variable. Reported-by: kernel test robot Link: https://lore.kernel.org/r/202111290026.O3vehj03-lkp@intel.com/ Link: https://lore.kernel.org/r/163840226123.138003.7668320168896210328.stgit@omen Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_igd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 362f91ec8845..352c725ccf18 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -309,13 +309,14 @@ static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, if ((pos & 3) && size > 2) { u16 val; + __le16 lval; ret = pci_user_read_config_word(pdev, pos, &val); if (ret) return ret; - val = cpu_to_le16(val); - if (copy_to_user(buf + count - size, &val, 2)) + lval = cpu_to_le16(val); + if (copy_to_user(buf + count - size, &lval, 2)) return -EFAULT; pos += 2; @@ -324,13 +325,14 @@ static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, while (size > 3) { u32 val; + __le32 lval; ret = pci_user_read_config_dword(pdev, pos, &val); if (ret) return ret; - val = cpu_to_le32(val); - if (copy_to_user(buf + count - size, &val, 4)) + lval = cpu_to_le32(val); + if (copy_to_user(buf + count - size, &lval, 4)) return -EFAULT; pos += 4; @@ -339,13 +341,14 @@ static ssize_t vfio_pci_igd_cfg_rw(struct vfio_pci_core_device *vdev, while (size >= 2) { u16 val; + __le16 lval; ret = pci_user_read_config_word(pdev, pos, &val); if (ret) return ret; - val = cpu_to_le16(val); - if (copy_to_user(buf + count - size, &val, 2)) + lval = cpu_to_le16(val); + if (copy_to_user(buf + count - size, &lval, 2)) return -EFAULT; pos += 2; -- cgit v1.2.3-58-ga151 From 2bed2ced40c97b8540ff38df0149e8ecb2bf4c65 Mon Sep 17 00:00:00 2001 From: Jiacheng Shi Date: Sun, 12 Dec 2021 01:16:00 -0800 Subject: vfio/iommu_type1: replace kfree with kvfree Variables allocated by kvzalloc should not be freed by kfree. Because they may be allocated by vmalloc. So we replace kfree with kvfree here. Fixes: d6a4c185660c ("vfio iommu: Implementation of ioctl for dirty pages tracking") Signed-off-by: Jiacheng Shi Link: https://lore.kernel.org/r/20211212091600.2560-1-billsjc@sjtu.edu.cn Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f17490ab238f..9394aa9444c1 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -256,7 +256,7 @@ static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize) static void vfio_dma_bitmap_free(struct vfio_dma *dma) { - kfree(dma->bitmap); + kvfree(dma->bitmap); dma->bitmap = NULL; } -- cgit v1.2.3-58-ga151 From 294277410cf3b46bee2b8282ab754e52975c0a70 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Tue, 23 Nov 2021 11:10:13 +0900 Subject: ksmbd: use oid registry functions to decode OIDs Use look_up_OID to decode OIDs rather than implementing functions. Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/asn1.c | 142 ++++++++------------------------------------------------ 1 file changed, 19 insertions(+), 123 deletions(-) diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c index b014f4638610..c03eba090368 100644 --- a/fs/ksmbd/asn1.c +++ b/fs/ksmbd/asn1.c @@ -21,101 +21,11 @@ #include "ksmbd_spnego_negtokeninit.asn1.h" #include "ksmbd_spnego_negtokentarg.asn1.h" -#define SPNEGO_OID_LEN 7 #define NTLMSSP_OID_LEN 10 -#define KRB5_OID_LEN 7 -#define KRB5U2U_OID_LEN 8 -#define MSKRB5_OID_LEN 7 -static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; -static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; -static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; -static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; -static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; static char NTLMSSP_OID_STR[NTLMSSP_OID_LEN] = { 0x2b, 0x06, 0x01, 0x04, 0x01, 0x82, 0x37, 0x02, 0x02, 0x0a }; -static bool -asn1_subid_decode(const unsigned char **begin, const unsigned char *end, - unsigned long *subid) -{ - const unsigned char *ptr = *begin; - unsigned char ch; - - *subid = 0; - - do { - if (ptr >= end) - return false; - - ch = *ptr++; - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - - *begin = ptr; - return true; -} - -static bool asn1_oid_decode(const unsigned char *value, size_t vlen, - unsigned long **oid, size_t *oidlen) -{ - const unsigned char *iptr = value, *end = value + vlen; - unsigned long *optr; - unsigned long subid; - - vlen += 1; - if (vlen < 2 || vlen > UINT_MAX / sizeof(unsigned long)) - goto fail_nullify; - - *oid = kmalloc(vlen * sizeof(unsigned long), GFP_KERNEL); - if (!*oid) - return false; - - optr = *oid; - - if (!asn1_subid_decode(&iptr, end, &subid)) - goto fail; - - if (subid < 40) { - optr[0] = 0; - optr[1] = subid; - } else if (subid < 80) { - optr[0] = 1; - optr[1] = subid - 40; - } else { - optr[0] = 2; - optr[1] = subid - 80; - } - - *oidlen = 2; - optr += 2; - - while (iptr < end) { - if (++(*oidlen) > vlen) - goto fail; - - if (!asn1_subid_decode(&iptr, end, optr++)) - goto fail; - } - return true; - -fail: - kfree(*oid); -fail_nullify: - *oid = NULL; - return false; -} - -static bool oid_eq(unsigned long *oid1, unsigned int oid1len, - unsigned long *oid2, unsigned int oid2len) -{ - if (oid1len != oid2len) - return false; - - return memcmp(oid1, oid2, oid1len) == 0; -} - int ksmbd_decode_negTokenInit(unsigned char *security_blob, int length, struct ksmbd_conn *conn) @@ -252,26 +162,18 @@ int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, int ksmbd_gssapi_this_mech(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { - unsigned long *oid; - size_t oidlen; - int err = 0; - - if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) { - err = -EBADMSG; - goto out; - } + enum OID oid; - if (!oid_eq(oid, oidlen, SPNEGO_OID, SPNEGO_OID_LEN)) - err = -EBADMSG; - kfree(oid); -out: - if (err) { + oid = look_up_OID(value, vlen); + if (oid != OID_spnego) { char buf[50]; sprint_oid(value, vlen, buf, sizeof(buf)); ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + return -EBADMSG; } - return err; + + return 0; } int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, @@ -279,37 +181,31 @@ int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, size_t vlen) { struct ksmbd_conn *conn = context; - unsigned long *oid; - size_t oidlen; + enum OID oid; int mech_type; - char buf[50]; - if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) - goto fail; - - if (oid_eq(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) + oid = look_up_OID(value, vlen); + if (oid == OID_ntlmssp) { mech_type = KSMBD_AUTH_NTLMSSP; - else if (oid_eq(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) + } else if (oid == OID_mskrb5) { mech_type = KSMBD_AUTH_MSKRB5; - else if (oid_eq(oid, oidlen, KRB5_OID, KRB5_OID_LEN)) + } else if (oid == OID_krb5) { mech_type = KSMBD_AUTH_KRB5; - else if (oid_eq(oid, oidlen, KRB5U2U_OID, KRB5U2U_OID_LEN)) + } else if (oid == OID_krb5u2u) { mech_type = KSMBD_AUTH_KRB5U2U; - else - goto fail; + } else { + char buf[50]; + + sprint_oid(value, vlen, buf, sizeof(buf)); + ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + return -EBADMSG; + } conn->auth_mechs |= mech_type; if (conn->preferred_auth_mech == 0) conn->preferred_auth_mech = mech_type; - kfree(oid); return 0; - -fail: - kfree(oid); - sprint_oid(value, vlen, buf, sizeof(buf)); - ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); - return -EBADMSG; } int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, -- cgit v1.2.3-58-ga151 From 80917f17e3f99027661a45262c310139e53a9faa Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Wed, 1 Dec 2021 21:41:19 +0100 Subject: ksmbd: Remove unused parameter from smb2_get_name() The 'share' parameter is no longer used by smb2_get_name() since commit 265fd1991c1d ("ksmbd: use LOOKUP_BENEATH to prevent the out of share access"). Acked-by: Namjae Jeon Signed-off-by: Marios Makassikis Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b8b3a4c28b74..225f2bcb46b6 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -610,7 +610,6 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id) /** * smb2_get_name() - get filename string from on the wire smb format - * @share: ksmbd_share_config pointer * @src: source buffer * @maxlen: maxlen of source string * @nls_table: nls_table pointer @@ -618,8 +617,7 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id) * Return: matching converted filename on success, otherwise error ptr */ static char * -smb2_get_name(struct ksmbd_share_config *share, const char *src, - const int maxlen, struct nls_table *local_nls) +smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) { char *name; @@ -2530,8 +2528,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - name = smb2_get_name(share, - req->Buffer, + name = smb2_get_name(req->Buffer, le16_to_cpu(req->NameLength), work->conn->local_nls); if (IS_ERR(name)) { @@ -5398,8 +5395,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; } - new_name = smb2_get_name(share, - file_info->FileName, + new_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); if (IS_ERR(new_name)) { @@ -5510,8 +5506,7 @@ static int smb2_create_link(struct ksmbd_work *work, if (!pathname) return -ENOMEM; - link_name = smb2_get_name(share, - file_info->FileName, + link_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); if (IS_ERR(link_name) || S_ISDIR(file_inode(filp)->i_mode)) { -- cgit v1.2.3-58-ga151 From 305f8bda15ebbe4004681286a5c67d0dc296c771 Mon Sep 17 00:00:00 2001 From: Marios Makassikis Date: Wed, 1 Dec 2021 21:40:50 +0100 Subject: ksmbd: Remove unused fields from ksmbd_file struct definition These fields are remnants of the not upstreamed SMB1 code. Acked-by: Namjae Jeon Signed-off-by: Marios Makassikis Signed-off-by: Steve French --- fs/ksmbd/vfs_cache.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h index 448576fbe4b7..36239ce31afd 100644 --- a/fs/ksmbd/vfs_cache.h +++ b/fs/ksmbd/vfs_cache.h @@ -96,16 +96,6 @@ struct ksmbd_file { int durable_timeout; - /* for SMB1 */ - int pid; - - /* conflict lock fail count for SMB1 */ - unsigned int cflock_cnt; - /* last lock failure start offset for SMB1 */ - unsigned long long llock_fstart; - - int dirent_offset; - /* if ls is happening on directory, below is valid*/ struct ksmbd_readdir_data readdir_data; int dot_dotdot[2]; -- cgit v1.2.3-58-ga151 From a58b45a4dbfd0bf2ebb157789da4d8e6368afb1b Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 16 Dec 2021 10:26:43 +0900 Subject: ksmbd: set RSS capable in FSCTL_QUERY_NETWORK_INTERFACE_INFO Set RSS capable in FSCTL_QUERY_NETWORK_INTERFACE_INFO if netdev has multi tx queues. And add ksmbd_compare_user() to avoid racy condition issue in ksmbd_free_user(). because windows client is simultaneously used to send session setup requests for multichannel connection. Tested-by: Ziwei Xie Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/mgmt/user_config.c | 10 ++++++++++ fs/ksmbd/mgmt/user_config.h | 1 + fs/ksmbd/smb2pdu.c | 15 ++++++++++----- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c index 1019d3677d55..279d00feff21 100644 --- a/fs/ksmbd/mgmt/user_config.c +++ b/fs/ksmbd/mgmt/user_config.c @@ -67,3 +67,13 @@ int ksmbd_anonymous_user(struct ksmbd_user *user) return 1; return 0; } + +bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2) +{ + if (strcmp(u1->name, u2->name)) + return false; + if (memcmp(u1->passkey, u2->passkey, u1->passkey_sz)) + return false; + + return true; +} diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h index aff80b029579..6a44109617f1 100644 --- a/fs/ksmbd/mgmt/user_config.h +++ b/fs/ksmbd/mgmt/user_config.h @@ -64,4 +64,5 @@ struct ksmbd_user *ksmbd_login_user(const char *account); struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp); void ksmbd_free_user(struct ksmbd_user *user); int ksmbd_anonymous_user(struct ksmbd_user *user); +bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2); #endif /* __USER_CONFIG_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 225f2bcb46b6..1ef8ef2c8e85 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -1448,10 +1448,16 @@ static int ntlm_authenticate(struct ksmbd_work *work) ksmbd_free_user(user); return 0; } - ksmbd_free_user(sess->user); + + if (!ksmbd_compare_user(sess->user, user)) { + ksmbd_free_user(user); + return -EPERM; + } + ksmbd_free_user(user); + } else { + sess->user = user; } - sess->user = user; if (user_guest(sess->user)) { if (conn->sign) { ksmbd_debug(SMB, "Guest login not allowed when signing enabled\n"); @@ -2055,9 +2061,6 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_debug(SMB, "request\n"); - /* Got a valid session, set connection state */ - WARN_ON(sess->conn != conn); - /* setting CifsExiting here may race with start_tcp_sess */ ksmbd_conn_set_need_reconnect(work); ksmbd_close_session_fds(work); @@ -7260,6 +7263,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, nii_rsp->IfIndex = cpu_to_le32(netdev->ifindex); nii_rsp->Capability = 0; + if (netdev->real_num_tx_queues > 1) + nii_rsp->Capability |= cpu_to_le32(RSS_CAPABLE); if (ksmbd_rdma_capable_netdev(netdev)) nii_rsp->Capability |= cpu_to_le32(RDMA_CAPABLE); -- cgit v1.2.3-58-ga151 From 71cd9cb680cb5d536c0dcbddb1c1d0010d79b214 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 16 Dec 2021 10:31:44 +0900 Subject: ksmbd: set both ipv4 and ipv6 in FSCTL_QUERY_NETWORK_INTERFACE_INFO Set ipv4 and ipv6 address in FSCTL_QUERY_NETWORK_INTERFACE_INFO. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 1ef8ef2c8e85..fcc3a9d0ab50 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -7241,15 +7241,10 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, struct sockaddr_storage_rsp *sockaddr_storage; unsigned int flags; unsigned long long speed; - struct sockaddr_in6 *csin6 = (struct sockaddr_in6 *)&conn->peer_addr; rtnl_lock(); for_each_netdev(&init_net, netdev) { - if (out_buf_len < - nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { - rtnl_unlock(); - return -ENOSPC; - } + bool ipv4_set = false; if (netdev->type == ARPHRD_LOOPBACK) continue; @@ -7257,6 +7252,12 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, flags = dev_get_flags(netdev); if (!(flags & IFF_RUNNING)) continue; +ipv6_retry: + if (out_buf_len < + nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { + rtnl_unlock(); + return -ENOSPC; + } nii_rsp = (struct network_interface_info_ioctl_rsp *) &rsp->Buffer[nbytes]; @@ -7289,8 +7290,7 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, nii_rsp->SockAddr_Storage; memset(sockaddr_storage, 0, 128); - if (conn->peer_addr.ss_family == PF_INET || - ipv6_addr_v4mapped(&csin6->sin6_addr)) { + if (!ipv4_set) { struct in_device *idev; sockaddr_storage->Family = cpu_to_le16(INTERNETWORK); @@ -7301,6 +7301,9 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, continue; sockaddr_storage->addr4.IPv4address = idev_ipv4_address(idev); + nbytes += sizeof(struct network_interface_info_ioctl_rsp); + ipv4_set = true; + goto ipv6_retry; } else { struct inet6_dev *idev6; struct inet6_ifaddr *ifa; @@ -7322,9 +7325,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, break; } sockaddr_storage->addr6.ScopeId = 0; + nbytes += sizeof(struct network_interface_info_ioctl_rsp); } - - nbytes += sizeof(struct network_interface_info_ioctl_rsp); } rtnl_unlock(); -- cgit v1.2.3-58-ga151 From ce53d365378cde71bb6596d79c257e600d951d29 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 15 Dec 2021 14:57:27 +0900 Subject: ksmbd: fix multi session connection failure When RSS mode is enable, windows client do simultaneously send several session requests to server. There is racy issue using sess->ntlmssp.cryptkey on N connection : 1 session. So authetication failed using wrong cryptkey on some session. This patch move cryptkey to ksmbd_conn structure to use each cryptkey on connection. Tested-by: Ziwei Xie Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/auth.c | 27 ++++++++++++++------------- fs/ksmbd/auth.h | 10 +++++----- fs/ksmbd/connection.h | 7 +------ fs/ksmbd/mgmt/user_session.h | 1 - fs/ksmbd/smb2pdu.c | 8 ++++---- 5 files changed, 24 insertions(+), 29 deletions(-) diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index 3503b1c48cb4..dc3d061edda9 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -215,7 +215,7 @@ out: * Return: 0 on success, error number on error */ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, - int blen, char *domain_name) + int blen, char *domain_name, char *cryptkey) { char ntlmv2_hash[CIFS_ENCPWD_SIZE]; char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE]; @@ -256,7 +256,7 @@ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, goto out; } - memcpy(construct, sess->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE); + memcpy(construct, cryptkey, CIFS_CRYPTO_KEY_SIZE); memcpy(construct + CIFS_CRYPTO_KEY_SIZE, &ntlmv2->blob_signature, blen); rc = crypto_shash_update(CRYPTO_HMACMD5(ctx), construct, len); @@ -295,7 +295,8 @@ out: * Return: 0 on success, error number on error */ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, - int blob_len, struct ksmbd_session *sess) + int blob_len, struct ksmbd_conn *conn, + struct ksmbd_session *sess) { char *domain_name; unsigned int nt_off, dn_off; @@ -324,7 +325,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, /* TODO : use domain name that imported from configuration file */ domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off, - dn_len, true, sess->conn->local_nls); + dn_len, true, conn->local_nls); if (IS_ERR(domain_name)) return PTR_ERR(domain_name); @@ -333,7 +334,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, domain_name); ret = ksmbd_auth_ntlmv2(sess, (struct ntlmv2_resp *)((char *)authblob + nt_off), nt_len - CIFS_ENCPWD_SIZE, - domain_name); + domain_name, conn->ntlmssp.cryptkey); kfree(domain_name); return ret; } @@ -347,7 +348,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, * */ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, - int blob_len, struct ksmbd_session *sess) + int blob_len, struct ksmbd_conn *conn) { if (blob_len < sizeof(struct negotiate_message)) { ksmbd_debug(AUTH, "negotiate blob len %d too small\n", @@ -361,7 +362,7 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, return -EINVAL; } - sess->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags); + conn->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags); return 0; } @@ -375,14 +376,14 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, */ unsigned int ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, - struct ksmbd_session *sess) + struct ksmbd_conn *conn) { struct target_info *tinfo; wchar_t *name; __u8 *target_name; unsigned int flags, blob_off, blob_len, type, target_info_len = 0; int len, uni_len, conv_len; - int cflags = sess->ntlmssp.client_flags; + int cflags = conn->ntlmssp.client_flags; memcpy(chgblob->Signature, NTLMSSP_SIGNATURE, 8); chgblob->MessageType = NtLmChallenge; @@ -403,7 +404,7 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, if (cflags & NTLMSSP_REQUEST_TARGET) flags |= NTLMSSP_REQUEST_TARGET; - if (sess->conn->use_spnego && + if (conn->use_spnego && (cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC; @@ -414,7 +415,7 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, return -ENOMEM; conv_len = smb_strtoUTF16((__le16 *)name, ksmbd_netbios_name(), len, - sess->conn->local_nls); + conn->local_nls); if (conv_len < 0 || conv_len > len) { kfree(name); return -EINVAL; @@ -430,8 +431,8 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, chgblob->TargetName.BufferOffset = cpu_to_le32(blob_off); /* Initialize random conn challenge */ - get_random_bytes(sess->ntlmssp.cryptkey, sizeof(__u64)); - memcpy(chgblob->Challenge, sess->ntlmssp.cryptkey, + get_random_bytes(conn->ntlmssp.cryptkey, sizeof(__u64)); + memcpy(chgblob->Challenge, conn->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE); /* Add Target Information to security buffer */ diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h index 9c2d4badd05d..95629651cf26 100644 --- a/fs/ksmbd/auth.h +++ b/fs/ksmbd/auth.h @@ -38,16 +38,16 @@ struct kvec; int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, unsigned int nvec, int enc); void ksmbd_copy_gss_neg_header(void *buf); -int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf); int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, - int blen, char *domain_name); + int blen, char *domain_name, char *cryptkey); int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, - int blob_len, struct ksmbd_session *sess); + int blob_len, struct ksmbd_conn *conn, + struct ksmbd_session *sess); int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, - int blob_len, struct ksmbd_session *sess); + int blob_len, struct ksmbd_conn *conn); unsigned int ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, - struct ksmbd_session *sess); + struct ksmbd_conn *conn); int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, int in_len, char *out_blob, int *out_len); int ksmbd_sign_smb2_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov, diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index e5403c587a58..72dfd155b5bf 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -72,12 +72,7 @@ struct ksmbd_conn { int connection_type; struct ksmbd_stats stats; char ClientGUID[SMB2_CLIENT_GUID_SIZE]; - union { - /* pending trans request table */ - struct trans_state *recent_trans; - /* Used by ntlmssp */ - char *ntlmssp_cryptkey; - }; + struct ntlmssp_auth ntlmssp; spinlock_t llist_lock; struct list_head lock_list; diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h index 82289c3cbd2b..e241f16a3851 100644 --- a/fs/ksmbd/mgmt/user_session.h +++ b/fs/ksmbd/mgmt/user_session.h @@ -45,7 +45,6 @@ struct ksmbd_session { int state; __u8 *Preauth_HashValue; - struct ntlmssp_auth ntlmssp; char sess_key[CIFS_KEY_SIZE]; struct hlist_node hlist; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index fcc3a9d0ab50..036a0dc86859 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -1301,7 +1301,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, int sz, rc; ksmbd_debug(SMB, "negotiate phase\n"); - rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess); + rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->conn); if (rc) return rc; @@ -1311,7 +1311,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, memset(chgblob, 0, sizeof(struct challenge_message)); if (!work->conn->use_spnego) { - sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn); if (sz < 0) return -ENOMEM; @@ -1327,7 +1327,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, return -ENOMEM; chgblob = (struct challenge_message *)neg_blob; - sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn); if (sz < 0) { rc = -ENOMEM; goto out; @@ -1470,7 +1470,7 @@ static int ntlm_authenticate(struct ksmbd_work *work) authblob = user_authblob(conn, req); sz = le16_to_cpu(req->SecurityBufferLength); - rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, sess); + rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess); if (rc) { set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD); ksmbd_debug(SMB, "authentication failed\n"); -- cgit v1.2.3-58-ga151 From e230d013378489bcd4b5589ca1d2a5b91ff8d098 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 21 Dec 2021 17:07:11 +0800 Subject: ksmbd: Fix buffer_check_err() kernel-doc comment Add the description of @rsp_org in buffer_check_err() kernel-doc comment to remove a warning found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/ksmbd/smb2pdu.c:4028: warning: Function parameter or member 'rsp_org' not described in 'buffer_check_err' Reported-by: Abaci Robot Fixes: cb4517201b8a ("ksmbd: remove smb2_buf_length in smb2_hdr") Acked-by: Namjae Jeon Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 036a0dc86859..08ff4684dee6 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -4018,6 +4018,7 @@ err_out2: * buffer_check_err() - helper function to check buffer errors * @reqOutputBufferLength: max buffer length expected in command response * @rsp: query info response buffer contains output buffer length + * @rsp_org: base response buffer pointer in case of chained response * @infoclass_size: query info class response buffer size * * Return: 0 on success, otherwise error -- cgit v1.2.3-58-ga151 From 4bfd9eed15e163969156e976c62db5ef423e5b0f Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 21 Dec 2021 17:07:12 +0800 Subject: ksmbd: Fix smb2_set_info_file() kernel-doc comment Fix argument list that the kdoc format and script verified in smb2_set_info_file(). The warnings were found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/ksmbd/smb2pdu.c:5862: warning: Function parameter or member 'req' not described in 'smb2_set_info_file' fs/ksmbd/smb2pdu.c:5862: warning: Excess function parameter 'info_class' description in 'smb2_set_info_file' Reported-by: Abaci Robot Fixes: 9496e268e3af ("ksmbd: add request buffer validation in smb2_set_info") Acked-by: Namjae Jeon Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 08ff4684dee6..722f14a1b77a 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -5848,7 +5848,7 @@ static int set_file_mode_info(struct ksmbd_file *fp, * smb2_set_info_file() - handler for smb2 set info command * @work: smb work containing set info command buffer * @fp: ksmbd_file pointer - * @info_class: smb2 set info class + * @req: request buffer pointer * @share: ksmbd_share_config pointer * * Return: 0 on success, otherwise error -- cgit v1.2.3-58-ga151 From f5c381392948dcae19f854b9586b806654f08a11 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 21 Dec 2021 17:07:13 +0800 Subject: ksmbd: Delete an invalid argument description in smb2_populate_readdir_entry() A warning is reported because an invalid argument description, it is found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/ksmbd/smb2pdu.c:3406: warning: Excess function parameter 'user_ns' description in 'smb2_populate_readdir_entry' Reported-by: Abaci Robot Fixes: 475d6f98804c ("ksmbd: fix translation in smb2_populate_readdir_entry()") Acked-by: Namjae Jeon Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 722f14a1b77a..43882f5baa3b 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -3392,7 +3392,6 @@ static int dentry_name(struct ksmbd_dir_info *d_info, int info_level) * @conn: connection instance * @info_level: smb information level * @d_info: structure included variables for query dir - * @user_ns: user namespace * @ksmbd_kstat: ksmbd wrapper of dirent stat information * * if directory has many entries, find first can't read it fully. -- cgit v1.2.3-58-ga151 From d4eeb82674acadf789277b577986e8e7d3faf695 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 21 Dec 2021 20:48:57 +0900 Subject: ksmbd: Fix smb2_get_name() kernel-doc comment Remove some warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. fs/ksmbd/smb2pdu.c:623: warning: Function parameter or member 'local_nls' not described in 'smb2_get_name' fs/ksmbd/smb2pdu.c:623: warning: Excess function parameter 'nls_table' description in 'smb2_get_name' Reported-by: Abaci Robot Acked-by: Namjae Jeon Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 43882f5baa3b..beae94f6033a 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -612,7 +612,7 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id) * smb2_get_name() - get filename string from on the wire smb format * @src: source buffer * @maxlen: maxlen of source string - * @nls_table: nls_table pointer + * @local_nls: nls_table pointer * * Return: matching converted filename on success, otherwise error ptr */ -- cgit v1.2.3-58-ga151 From dd93849d47ce1517c1383ef30bd7497a001d213f Mon Sep 17 00:00:00 2001 From: Camel Guo Date: Thu, 2 Dec 2021 16:22:52 +0100 Subject: rtc: rs5c372: add offset correction support In order for linux userspace application to be able to adjust offset to keep rtc precision as high as possible, this commit adds support of offset correction by adjusting the time trimming register on rs5c372[a|b] and oscilluation adjustment register on r2025x, r222[1|3]x, rv5c38[6|7]a. Signed-off-by: Camel Guo Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211202152252.31264-1-camel.guo@axis.com --- drivers/rtc/rtc-rs5c372.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index 955513514179..b3155fa914f1 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c @@ -30,6 +30,8 @@ #define RS5C372_REG_TRIM 7 # define RS5C372_TRIM_XSL 0x80 # define RS5C372_TRIM_MASK 0x7F +# define R2221TL_TRIM_DEV (1 << 7) /* only if R2221TL */ +# define RS5C372_TRIM_DECR (1 << 6) #define RS5C_REG_ALARM_A_MIN 8 /* or ALARM_W */ #define RS5C_REG_ALARM_A_HOURS 9 @@ -539,6 +541,122 @@ static int rs5c372_ioctl(struct device *dev, unsigned int cmd, unsigned long arg #define rs5c372_ioctl NULL #endif +static int rs5c372_read_offset(struct device *dev, long *offset) +{ + struct rs5c372 *rs5c = i2c_get_clientdata(to_i2c_client(dev)); + u8 val = rs5c->regs[RS5C372_REG_TRIM]; + long ppb_per_step = 0; + bool decr = val & RS5C372_TRIM_DECR; + + switch (rs5c->type) { + case rtc_r2221tl: + ppb_per_step = val & R2221TL_TRIM_DEV ? 1017 : 3051; + break; + case rtc_rs5c372a: + case rtc_rs5c372b: + ppb_per_step = val & RS5C372_TRIM_XSL ? 3125 : 3051; + break; + default: + ppb_per_step = 3051; + break; + } + + /* Only bits[0:5] repsents the time counts */ + val &= 0x3F; + + /* If bits[1:5] are all 0, it means no increment or decrement */ + if (!(val & 0x3E)) { + *offset = 0; + } else { + if (decr) + *offset = -(((~val) & 0x3F) + 1) * ppb_per_step; + else + *offset = (val - 1) * ppb_per_step; + } + + return 0; +} + +static int rs5c372_set_offset(struct device *dev, long offset) +{ + struct rs5c372 *rs5c = i2c_get_clientdata(to_i2c_client(dev)); + int addr = RS5C_ADDR(RS5C372_REG_TRIM); + u8 val = 0; + u8 tmp = 0; + long ppb_per_step = 3051; + long steps = LONG_MIN; + + switch (rs5c->type) { + case rtc_rs5c372a: + case rtc_rs5c372b: + tmp = rs5c->regs[RS5C372_REG_TRIM]; + if (tmp & RS5C372_TRIM_XSL) { + ppb_per_step = 3125; + val |= RS5C372_TRIM_XSL; + } + break; + case rtc_r2221tl: + /* + * Check if it is possible to use high resolution mode (DEV=1). + * In this mode, the minimum resolution is 2 / (32768 * 20 * 3), + * which is about 1017 ppb. + */ + steps = DIV_ROUND_CLOSEST(offset, 1017); + if (steps >= -0x3E && steps <= 0x3E) { + ppb_per_step = 1017; + val |= R2221TL_TRIM_DEV; + } else { + /* + * offset is out of the range of high resolution mode. + * Try to use low resolution mode (DEV=0). In this mode, + * the minimum resolution is 2 / (32768 * 20), which is + * about 3051 ppb. + */ + steps = LONG_MIN; + } + break; + default: + break; + } + + if (steps == LONG_MIN) { + steps = DIV_ROUND_CLOSEST(offset, ppb_per_step); + if (steps > 0x3E || steps < -0x3E) + return -ERANGE; + } + + if (steps > 0) { + val |= steps + 1; + } else { + val |= RS5C372_TRIM_DECR; + val |= (~(-steps - 1)) & 0x3F; + } + + if (!steps || !(val & 0x3E)) { + /* + * if offset is too small, set oscillation adjustment register + * or time trimming register with its default value whic means + * no increment or decrement. But for rs5c372[a|b], the XSL bit + * should be kept unchanged. + */ + if (rs5c->type == rtc_rs5c372a || rs5c->type == rtc_rs5c372b) + val &= RS5C372_TRIM_XSL; + else + val = 0; + } + + dev_dbg(&rs5c->client->dev, "write 0x%x for offset %ld\n", val, offset); + + if (i2c_smbus_write_byte_data(rs5c->client, addr, val) < 0) { + dev_err(&rs5c->client->dev, "failed to write 0x%x to reg %d\n", val, addr); + return -EIO; + } + + rs5c->regs[RS5C372_REG_TRIM] = val; + + return 0; +} + static const struct rtc_class_ops rs5c372_rtc_ops = { .proc = rs5c372_rtc_proc, .read_time = rs5c372_rtc_read_time, @@ -547,6 +665,8 @@ static const struct rtc_class_ops rs5c372_rtc_ops = { .set_alarm = rs5c_set_alarm, .alarm_irq_enable = rs5c_rtc_alarm_irq_enable, .ioctl = rs5c372_ioctl, + .read_offset = rs5c372_read_offset, + .set_offset = rs5c372_set_offset, }; #if IS_ENABLED(CONFIG_RTC_INTF_SYSFS) -- cgit v1.2.3-58-ga151 From ed06106614341301b3c4b84b6c0b497a72caec7d Mon Sep 17 00:00:00 2001 From: Camel Guo Date: Mon, 6 Dec 2021 13:58:31 +0100 Subject: rtc: rs5c372: fix incorrect oscillation value on r2221tl The XSL bit only exists in RS5C372A/B. On other Ricoh RTC chips supported in rs5c372, this bit has different meaning. For example, on R2221x and R2223x, this bit of oscillation adjustment register determines the operation frequency of oscillation adjustment circuit and the oscillation is always 32768HZ. But rs5c372_get_trim gives 32000HZ to osc when DEV is 1. Signed-off-by: Camel Guo Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211206125832.6461-1-camel.guo@axis.com --- drivers/rtc/rtc-rs5c372.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c index b3155fa914f1..cb15983383f5 100644 --- a/drivers/rtc/rtc-rs5c372.c +++ b/drivers/rtc/rtc-rs5c372.c @@ -28,7 +28,7 @@ #define RS5C372_REG_MONTH 5 #define RS5C372_REG_YEAR 6 #define RS5C372_REG_TRIM 7 -# define RS5C372_TRIM_XSL 0x80 +# define RS5C372_TRIM_XSL 0x80 /* only if RS5C372[a|b] */ # define RS5C372_TRIM_MASK 0x7F # define R2221TL_TRIM_DEV (1 << 7) /* only if R2221TL */ # define RS5C372_TRIM_DECR (1 << 6) @@ -326,8 +326,12 @@ static int rs5c372_get_trim(struct i2c_client *client, int *osc, int *trim) struct rs5c372 *rs5c372 = i2c_get_clientdata(client); u8 tmp = rs5c372->regs[RS5C372_REG_TRIM]; - if (osc) - *osc = (tmp & RS5C372_TRIM_XSL) ? 32000 : 32768; + if (osc) { + if (rs5c372->type == rtc_rs5c372a || rs5c372->type == rtc_rs5c372b) + *osc = (tmp & RS5C372_TRIM_XSL) ? 32000 : 32768; + else + *osc = 32768; + } if (trim) { dev_dbg(&client->dev, "%s: raw trim=%x\n", __func__, tmp); -- cgit v1.2.3-58-ga151 From fad6cbe9b2b4137f5af5355d4ee7e4eb38221e7e Mon Sep 17 00:00:00 2001 From: Vincent Shih Date: Fri, 3 Dec 2021 15:46:18 +0800 Subject: rtc: Add driver for RTC in Sunplus SP7021 Add driver for RTC in Sunplus SP7021 Signed-off-by: Vincent Shih Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/1638517579-10316-2-git-send-email-vincent.sunplus@gamil.com --- MAINTAINERS | 6 + drivers/rtc/Kconfig | 13 ++ drivers/rtc/Makefile | 1 + drivers/rtc/rtc-sunplus.c | 362 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 382 insertions(+) create mode 100644 drivers/rtc/rtc-sunplus.c diff --git a/MAINTAINERS b/MAINTAINERS index 7a2345ce8521..fbafc10ee9f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18216,6 +18216,12 @@ L: netdev@vger.kernel.org S: Maintained F: drivers/net/ethernet/dlink/sundance.c +SUNPLUS RTC DRIVER +M: Vincent Shih +L: linux-rtc@vger.kernel.org +S: Maintained +F: drivers/rtc/rtc-sunplus.c + SUPERH M: Yoshinori Sato M: Rich Felker diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 6d019c5ed374..d85a3c31347c 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1455,6 +1455,19 @@ config RTC_DRV_SH To compile this driver as a module, choose M here: the module will be called rtc-sh. +config RTC_DRV_SUNPLUS + tristate "Sunplus SP7021 RTC" + depends on SOC_SP7021 + help + Say 'yes' to get support for the real-time clock present in + Sunplus SP7021 - a SoC for industrial applications. It provides + RTC status check, timer/alarm functionalities, user data + reservation with the battery over 2.5V, RTC power status check + and battery charge. + + This driver can also be built as a module. If so, the module + will be called rtc-sunplus. + config RTC_DRV_VR41XX tristate "NEC VR41XX" depends on CPU_VR41XX || COMPILE_TEST diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 80b8f8f4b635..e92f3e943245 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -166,6 +166,7 @@ obj-$(CONFIG_RTC_DRV_STM32) += rtc-stm32.o obj-$(CONFIG_RTC_DRV_STMP) += rtc-stmp3xxx.o obj-$(CONFIG_RTC_DRV_SUN4V) += rtc-sun4v.o obj-$(CONFIG_RTC_DRV_SUN6I) += rtc-sun6i.o +obj-$(CONFIG_RTC_DRV_SUNPLUS) += rtc-sunplus.o obj-$(CONFIG_RTC_DRV_SUNXI) += rtc-sunxi.o obj-$(CONFIG_RTC_DRV_TEGRA) += rtc-tegra.o obj-$(CONFIG_RTC_DRV_TEST) += rtc-test.o diff --git a/drivers/rtc/rtc-sunplus.c b/drivers/rtc/rtc-sunplus.c new file mode 100644 index 000000000000..0b3873204f5c --- /dev/null +++ b/drivers/rtc/rtc-sunplus.c @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * The RTC driver for Sunplus SP7021 + * + * Copyright (C) 2019 Sunplus Technology Inc., All rights reseerved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RTC_REG_NAME "rtc" + +#define RTC_CTRL 0x40 +#define TIMER_FREEZE_MASK_BIT BIT(5 + 16) +#define TIMER_FREEZE BIT(5) +#define DIS_SYS_RST_RTC_MASK_BIT BIT(4 + 16) +#define DIS_SYS_RST_RTC BIT(4) +#define RTC32K_MODE_RESET_MASK_BIT BIT(3 + 16) +#define RTC32K_MODE_RESET BIT(3) +#define ALARM_EN_OVERDUE_MASK_BIT BIT(2 + 16) +#define ALARM_EN_OVERDUE BIT(2) +#define ALARM_EN_PMC_MASK_BIT BIT(1 + 16) +#define ALARM_EN_PMC BIT(1) +#define ALARM_EN_MASK_BIT BIT(0 + 16) +#define ALARM_EN BIT(0) +#define RTC_TIMER_OUT 0x44 +#define RTC_DIVIDER 0x48 +#define RTC_TIMER_SET 0x4c +#define RTC_ALARM_SET 0x50 +#define RTC_USER_DATA 0x54 +#define RTC_RESET_RECORD 0x58 +#define RTC_BATT_CHARGE_CTRL 0x5c +#define BAT_CHARGE_RSEL_MASK_BIT GENMASK(3 + 16, 2 + 16) +#define BAT_CHARGE_RSEL_MASK GENMASK(3, 2) +#define BAT_CHARGE_RSEL_2K_OHM FIELD_PREP(BAT_CHARGE_RSEL_MASK, 0) +#define BAT_CHARGE_RSEL_250_OHM FIELD_PREP(BAT_CHARGE_RSEL_MASK, 1) +#define BAT_CHARGE_RSEL_50_OHM FIELD_PREP(BAT_CHARGE_RSEL_MASK, 2) +#define BAT_CHARGE_RSEL_0_OHM FIELD_PREP(BAT_CHARGE_RSEL_MASK, 3) +#define BAT_CHARGE_DSEL_MASK_BIT BIT(1 + 16) +#define BAT_CHARGE_DSEL_MASK GENMASK(1, 1) +#define BAT_CHARGE_DSEL_ON FIELD_PREP(BAT_CHARGE_DSEL_MASK, 0) +#define BAT_CHARGE_DSEL_OFF FIELD_PREP(BAT_CHARGE_DSEL_MASK, 1) +#define BAT_CHARGE_EN_MASK_BIT BIT(0 + 16) +#define BAT_CHARGE_EN BIT(0) +#define RTC_TRIM_CTRL 0x60 + +struct sunplus_rtc { + struct rtc_device *rtc; + struct resource *res; + struct clk *rtcclk; + struct reset_control *rstc; + void __iomem *reg_base; + int irq; +}; + +static void sp_get_seconds(struct device *dev, unsigned long *secs) +{ + struct sunplus_rtc *sp_rtc = dev_get_drvdata(dev); + + *secs = (unsigned long)readl(sp_rtc->reg_base + RTC_TIMER_OUT); +} + +static void sp_set_seconds(struct device *dev, unsigned long secs) +{ + struct sunplus_rtc *sp_rtc = dev_get_drvdata(dev); + + writel((u32)secs, sp_rtc->reg_base + RTC_TIMER_SET); +} + +static int sp_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + unsigned long secs; + + sp_get_seconds(dev, &secs); + rtc_time64_to_tm(secs, tm); + + return 0; +} + +static int sp_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + unsigned long secs; + + secs = rtc_tm_to_time64(tm); + dev_dbg(dev, "%s, secs = %lu\n", __func__, secs); + sp_set_seconds(dev, secs); + + return 0; +} + +static int sp_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct sunplus_rtc *sp_rtc = dev_get_drvdata(dev); + unsigned long alarm_time; + + alarm_time = rtc_tm_to_time64(&alrm->time); + dev_dbg(dev, "%s, alarm_time: %u\n", __func__, (u32)(alarm_time)); + writel((u32)alarm_time, sp_rtc->reg_base + RTC_ALARM_SET); + + return 0; +} + +static int sp_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) +{ + struct sunplus_rtc *sp_rtc = dev_get_drvdata(dev); + unsigned int alarm_time; + + alarm_time = readl(sp_rtc->reg_base + RTC_ALARM_SET); + dev_dbg(dev, "%s, alarm_time: %u\n", __func__, alarm_time); + + if (alarm_time == 0) + alrm->enabled = 0; + else + alrm->enabled = 1; + + rtc_time64_to_tm((unsigned long)(alarm_time), &alrm->time); + + return 0; +} + +static int sp_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) +{ + struct sunplus_rtc *sp_rtc = dev_get_drvdata(dev); + + if (enabled) + writel((TIMER_FREEZE_MASK_BIT | DIS_SYS_RST_RTC_MASK_BIT | + RTC32K_MODE_RESET_MASK_BIT | ALARM_EN_OVERDUE_MASK_BIT | + ALARM_EN_PMC_MASK_BIT | ALARM_EN_MASK_BIT) | + (DIS_SYS_RST_RTC | ALARM_EN_OVERDUE | ALARM_EN_PMC | ALARM_EN), + sp_rtc->reg_base + RTC_CTRL); + else + writel((ALARM_EN_OVERDUE_MASK_BIT | ALARM_EN_PMC_MASK_BIT | ALARM_EN_MASK_BIT) | + 0x0, sp_rtc->reg_base + RTC_CTRL); + + return 0; +} + +static const struct rtc_class_ops sp_rtc_ops = { + .read_time = sp_rtc_read_time, + .set_time = sp_rtc_set_time, + .set_alarm = sp_rtc_set_alarm, + .read_alarm = sp_rtc_read_alarm, + .alarm_irq_enable = sp_rtc_alarm_irq_enable, +}; + +static irqreturn_t sp_rtc_irq_handler(int irq, void *dev_id) +{ + struct platform_device *plat_dev = dev_id; + struct sunplus_rtc *sp_rtc = dev_get_drvdata(&plat_dev->dev); + + rtc_update_irq(sp_rtc->rtc, 1, RTC_IRQF | RTC_AF); + dev_dbg(&plat_dev->dev, "[RTC] ALARM INT\n"); + + return IRQ_HANDLED; +} + +/* + * ------------------------------------------------------------------------------------- + * bat_charge_rsel bat_charge_dsel bat_charge_en Remarks + * x x 0 Disable + * 0 0 1 0.86mA (2K Ohm with diode) + * 1 0 1 1.81mA (250 Ohm with diode) + * 2 0 1 2.07mA (50 Ohm with diode) + * 3 0 1 16.0mA (0 Ohm with diode) + * 0 1 1 1.36mA (2K Ohm without diode) + * 1 1 1 3.99mA (250 Ohm without diode) + * 2 1 1 4.41mA (50 Ohm without diode) + * 3 1 1 16.0mA (0 Ohm without diode) + * ------------------------------------------------------------------------------------- + */ +static void sp_rtc_set_trickle_charger(struct device dev) +{ + struct sunplus_rtc *sp_rtc = dev_get_drvdata(&dev); + u32 ohms, rsel; + u32 chargeable; + + if (of_property_read_u32(dev.of_node, "trickle-resistor-ohms", &ohms) || + of_property_read_u32(dev.of_node, "aux-voltage-chargeable", &chargeable)) { + dev_warn(&dev, "battery charger disabled\n"); + return; + } + + switch (ohms) { + case 2000: + rsel = BAT_CHARGE_RSEL_2K_OHM; + break; + case 250: + rsel = BAT_CHARGE_RSEL_250_OHM; + break; + case 50: + rsel = BAT_CHARGE_RSEL_50_OHM; + break; + case 0: + rsel = BAT_CHARGE_RSEL_0_OHM; + break; + default: + dev_err(&dev, "invalid charger resistor value (%d)\n", ohms); + return; + } + + writel(BAT_CHARGE_RSEL_MASK_BIT | rsel, sp_rtc->reg_base + RTC_BATT_CHARGE_CTRL); + + switch (chargeable) { + case 0: + writel(BAT_CHARGE_DSEL_MASK_BIT | BAT_CHARGE_DSEL_OFF, + sp_rtc->reg_base + RTC_BATT_CHARGE_CTRL); + break; + case 1: + writel(BAT_CHARGE_DSEL_MASK_BIT | BAT_CHARGE_DSEL_ON, + sp_rtc->reg_base + RTC_BATT_CHARGE_CTRL); + break; + default: + dev_err(&dev, "invalid aux-voltage-chargeable value (%d)\n", chargeable); + return; + } + + writel(BAT_CHARGE_EN_MASK_BIT | BAT_CHARGE_EN, sp_rtc->reg_base + RTC_BATT_CHARGE_CTRL); +} + +static int sp_rtc_probe(struct platform_device *plat_dev) +{ + struct sunplus_rtc *sp_rtc; + int ret; + + sp_rtc = devm_kzalloc(&plat_dev->dev, sizeof(*sp_rtc), GFP_KERNEL); + if (!sp_rtc) + return -ENOMEM; + + sp_rtc->res = platform_get_resource_byname(plat_dev, IORESOURCE_MEM, RTC_REG_NAME); + sp_rtc->reg_base = devm_ioremap_resource(&plat_dev->dev, sp_rtc->res); + if (IS_ERR(sp_rtc->reg_base)) + return dev_err_probe(&plat_dev->dev, PTR_ERR(sp_rtc->res), + "%s devm_ioremap_resource fail\n", RTC_REG_NAME); + dev_dbg(&plat_dev->dev, "res = 0x%x, reg_base = 0x%lx\n", + sp_rtc->res->start, (unsigned long)sp_rtc->reg_base); + + sp_rtc->irq = platform_get_irq(plat_dev, 0); + if (sp_rtc->irq < 0) + return dev_err_probe(&plat_dev->dev, sp_rtc->irq, "platform_get_irq failed\n"); + + ret = devm_request_irq(&plat_dev->dev, sp_rtc->irq, sp_rtc_irq_handler, + IRQF_TRIGGER_RISING, "rtc irq", plat_dev); + if (ret) + return dev_err_probe(&plat_dev->dev, ret, "devm_request_irq failed:\n"); + + sp_rtc->rtcclk = devm_clk_get(&plat_dev->dev, NULL); + if (IS_ERR(sp_rtc->rtcclk)) + return dev_err_probe(&plat_dev->dev, PTR_ERR(sp_rtc->rtcclk), + "devm_clk_get fail\n"); + + sp_rtc->rstc = devm_reset_control_get_exclusive(&plat_dev->dev, NULL); + if (IS_ERR(sp_rtc->rstc)) + return dev_err_probe(&plat_dev->dev, PTR_ERR(sp_rtc->rstc), + "failed to retrieve reset controller\n"); + + ret = clk_prepare_enable(sp_rtc->rtcclk); + if (ret) + goto free_clk; + + ret = reset_control_deassert(sp_rtc->rstc); + if (ret) + goto free_reset_assert; + + device_init_wakeup(&plat_dev->dev, 1); + dev_set_drvdata(&plat_dev->dev, sp_rtc); + + sp_rtc->rtc = devm_rtc_allocate_device(&plat_dev->dev); + if (IS_ERR(sp_rtc->rtc)) { + ret = PTR_ERR(sp_rtc->rtc); + goto free_reset_assert; + } + + sp_rtc->rtc->range_max = U32_MAX; + sp_rtc->rtc->range_min = 0; + sp_rtc->rtc->ops = &sp_rtc_ops; + + ret = devm_rtc_register_device(sp_rtc->rtc); + if (ret) + goto free_reset_assert; + + /* Setup trickle charger */ + if (plat_dev->dev.of_node) + sp_rtc_set_trickle_charger(plat_dev->dev); + + /* Keep RTC from system reset */ + writel(DIS_SYS_RST_RTC_MASK_BIT | DIS_SYS_RST_RTC, sp_rtc->reg_base + RTC_CTRL); + + return 0; + +free_reset_assert: + reset_control_assert(sp_rtc->rstc); +free_clk: + clk_disable_unprepare(sp_rtc->rtcclk); + + return ret; +} + +static int sp_rtc_remove(struct platform_device *plat_dev) +{ + struct sunplus_rtc *sp_rtc = dev_get_drvdata(&plat_dev->dev); + + device_init_wakeup(&plat_dev->dev, 0); + reset_control_assert(sp_rtc->rstc); + clk_disable_unprepare(sp_rtc->rtcclk); + + return 0; +} + +#ifdef CONFIG_PM_SLEEP +static int sp_rtc_suspend(struct device *dev) +{ + struct sunplus_rtc *sp_rtc = dev_get_drvdata(dev); + + if (device_may_wakeup(dev)) + enable_irq_wake(sp_rtc->irq); + + return 0; +} + +static int sp_rtc_resume(struct device *dev) +{ + struct sunplus_rtc *sp_rtc = dev_get_drvdata(dev); + + if (device_may_wakeup(dev)) + disable_irq_wake(sp_rtc->irq); + + return 0; +} +#endif + +static const struct of_device_id sp_rtc_of_match[] = { + { .compatible = "sunplus,sp7021-rtc" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, sp_rtc_of_match); + +static SIMPLE_DEV_PM_OPS(sp_rtc_pm_ops, sp_rtc_suspend, sp_rtc_resume); + +static struct platform_driver sp_rtc_driver = { + .probe = sp_rtc_probe, + .remove = sp_rtc_remove, + .driver = { + .name = "sp7021-rtc", + .of_match_table = sp_rtc_of_match, + .pm = &sp_rtc_pm_ops, + }, +}; +module_platform_driver(sp_rtc_driver); + +MODULE_AUTHOR("Vincent Shih "); +MODULE_DESCRIPTION("Sunplus RTC driver"); +MODULE_LICENSE("GPL v2"); + -- cgit v1.2.3-58-ga151 From 8462904204abd8cc7f75947d7005c71e8a77da7b Mon Sep 17 00:00:00 2001 From: Vincent Shih Date: Fri, 3 Dec 2021 15:46:19 +0800 Subject: dt-bindings: rtc: Add Sunplus RTC json-schema Add Sunplus RTC json-schema Signed-off-by: Vincent Shih Reviewed-by: Rob Herring Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/1638517579-10316-3-git-send-email-vincent.sunplus@gamil.com --- .../bindings/rtc/sunplus,sp7021-rtc.yaml | 56 ++++++++++++++++++++++ MAINTAINERS | 1 + 2 files changed, 57 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/sunplus,sp7021-rtc.yaml diff --git a/Documentation/devicetree/bindings/rtc/sunplus,sp7021-rtc.yaml b/Documentation/devicetree/bindings/rtc/sunplus,sp7021-rtc.yaml new file mode 100644 index 000000000000..fd1b3e71ff2c --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/sunplus,sp7021-rtc.yaml @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) Sunplus Co., Ltd. 2021 +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/rtc/sunplus,sp7021-rtc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Sunplus SP7021 Real Time Clock controller + +maintainers: + - Vincent Shih + +properties: + compatible: + const: sunplus,sp7021-rtc + + reg: + maxItems: 1 + + reg-names: + items: + - const: rtc + + clocks: + maxItems: 1 + + resets: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - compatible + - reg + - reg-names + - clocks + - resets + - interrupts + +additionalProperties: false + +examples: + - | + #include + + rtc: serial@9c003a00 { + compatible = "sunplus,sp7021-rtc"; + reg = <0x9c003a00 0x80>; + reg-names = "rtc"; + clocks = <&clkc 0x12>; + resets = <&rstc 0x02>; + interrupt-parent = <&intc>; + interrupts = <163 IRQ_TYPE_EDGE_RISING>; + }; +... diff --git a/MAINTAINERS b/MAINTAINERS index fbafc10ee9f9..b85f0a1f52f5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18220,6 +18220,7 @@ SUNPLUS RTC DRIVER M: Vincent Shih L: linux-rtc@vger.kernel.org S: Maintained +F: Documentation/devicetree/bindings/rtc/sunplus,sp7021-rtc.yaml F: drivers/rtc/rtc-sunplus.c SUPERH -- cgit v1.2.3-58-ga151 From 7b69b54aaa48979f5e3cebb7225e11cbbdc9f5fb Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Tue, 7 Dec 2021 16:56:25 -0500 Subject: rtc: pcf2127: Fix typo in comment Replace TFS2 with TSF2. Signed-off-by: Hugo Villeneuve Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211207215626.2619819-1-hugo@hugovil.com --- drivers/rtc/rtc-pcf2127.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c index 56c58b055dff..81a5b1f2e68c 100644 --- a/drivers/rtc/rtc-pcf2127.c +++ b/drivers/rtc/rtc-pcf2127.c @@ -748,7 +748,7 @@ static int pcf2127_probe(struct device *dev, struct regmap *regmap, /* * Enable timestamp function and store timestamp of first trigger - * event until TSF1 and TFS2 interrupt flags are cleared. + * event until TSF1 and TSF2 interrupt flags are cleared. */ ret = regmap_update_bits(pcf2127->regmap, PCF2127_REG_TS_CTRL, PCF2127_BIT_TS_CTRL_TSOFF | -- cgit v1.2.3-58-ga151 From ba52eac083e1598e748811ff58d259f77e4c5c4d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 8 Dec 2021 20:39:15 -0800 Subject: rtc: Move variable into switch case statement When building with automatic stack variable initialization, GCC 12 complains about variables defined outside of switch case statements. Move the variable into the case that uses it, which silences the warning: drivers/rtc/dev.c: In function 'rtc_dev_ioctl': drivers/rtc/dev.c:394:30: warning: statement will never be executed [-Wswitch-unreachable] 394 | long offset; | ^~~~~~ Fixes: 6a8af1b6568a ("rtc: add parameter ioctl") Signed-off-by: Kees Cook Reviewed-by: Gustavo A. R. Silva Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211209043915.1378393-1-keescook@chromium.org --- drivers/rtc/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/dev.c b/drivers/rtc/dev.c index e104972a28fd..69325aeede1a 100644 --- a/drivers/rtc/dev.c +++ b/drivers/rtc/dev.c @@ -391,14 +391,14 @@ static long rtc_dev_ioctl(struct file *file, } switch(param.param) { - long offset; case RTC_PARAM_FEATURES: if (param.index != 0) err = -EINVAL; param.uvalue = rtc->features[0]; break; - case RTC_PARAM_CORRECTION: + case RTC_PARAM_CORRECTION: { + long offset; mutex_unlock(&rtc->ops_lock); if (param.index != 0) return -EINVAL; @@ -407,7 +407,7 @@ static long rtc_dev_ioctl(struct file *file, if (err == 0) param.svalue = offset; break; - + } default: if (rtc->ops->param_get) err = rtc->ops->param_get(rtc->dev.parent, ¶m); -- cgit v1.2.3-58-ga151 From 05020a733b02cf7a474305e620fb306cd3abfe84 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Mon, 20 Dec 2021 01:15:24 +0000 Subject: rtc: ftrtc010: Use platform_get_irq() to get the interrupt platform_get_resource(pdev, IORESOURCE_IRQ, ..) relies on static allocation of IRQ resources in DT core code, this causes an issue when using hierarchical interrupt domains using "interrupts" property in the node as this bypasses the hierarchical setup and messes up the irq chaining. In preparation for removal of static setup of IRQ resource from DT core code use platform_get_irq(). Signed-off-by: Lad Prabhakar Reviewed-by: Linus Walleij Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211220011524.17206-1-prabhakar.mahadev-lad.rj@bp.renesas.com --- drivers/rtc/rtc-ftrtc010.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-ftrtc010.c b/drivers/rtc/rtc-ftrtc010.c index ad3add5db4c8..53bb08fe1cd4 100644 --- a/drivers/rtc/rtc-ftrtc010.c +++ b/drivers/rtc/rtc-ftrtc010.c @@ -141,11 +141,9 @@ static int ftrtc010_rtc_probe(struct platform_device *pdev) } } - res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (!res) - return -ENODEV; - - rtc->rtc_irq = res->start; + rtc->rtc_irq = platform_get_irq(pdev, 0); + if (rtc->rtc_irq < 0) + return rtc->rtc_irq; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) -- cgit v1.2.3-58-ga151 From 34127b3632b21e5c391756e724b1198eb9917981 Mon Sep 17 00:00:00 2001 From: Laurence de Bruxelles Date: Sat, 1 Jan 2022 15:41:49 +0000 Subject: rtc: pxa: fix null pointer dereference With the latest stable kernel versions the rtc on the PXA based Zaurus does not work, when booting I see the following kernel messages: pxa-rtc pxa-rtc: failed to find rtc clock source pxa-rtc pxa-rtc: Unable to init SA1100 RTC sub-device pxa-rtc: probe of pxa-rtc failed with error -2 hctosys: unable to open rtc device (rtc0) I think this is because commit f2997775b111 ("rtc: sa1100: fix possible race condition") moved the allocation of the rtc_device struct out of sa1100_rtc_init and into sa1100_rtc_probe. This means that pxa_rtc_probe also needs to do allocation for the rtc_device struct, otherwise sa1100_rtc_init will try to dereference a null pointer. This patch adds that allocation by copying how sa1100_rtc_probe in drivers/rtc/rtc-sa1100.c does it; after the IRQs are set up a managed rtc_device is allocated. I've tested this patch with `qemu-system-arm -machine akita` and with a real Zaurus SL-C1000 applied to 4.19, 5.4, and 5.10. Signed-off-by: Laurence de Bruxelles Fixes: f2997775b111 ("rtc: sa1100: fix possible race condition") Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220101154149.12026-1-lfdebrux@gmail.com --- drivers/rtc/rtc-pxa.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/rtc/rtc-pxa.c b/drivers/rtc/rtc-pxa.c index d2f1d8f754bf..cf8119b6d320 100644 --- a/drivers/rtc/rtc-pxa.c +++ b/drivers/rtc/rtc-pxa.c @@ -330,6 +330,10 @@ static int __init pxa_rtc_probe(struct platform_device *pdev) if (sa1100_rtc->irq_alarm < 0) return -ENXIO; + sa1100_rtc->rtc = devm_rtc_allocate_device(&pdev->dev); + if (IS_ERR(sa1100_rtc->rtc)) + return PTR_ERR(sa1100_rtc->rtc); + pxa_rtc->base = devm_ioremap(dev, pxa_rtc->ress->start, resource_size(pxa_rtc->ress)); if (!pxa_rtc->base) { -- cgit v1.2.3-58-ga151 From a12ac1f0ffa41b7aab3f69c4aac5bb72369bd117 Mon Sep 17 00:00:00 2001 From: David Heidelberg Date: Mon, 13 Dec 2021 20:29:45 +0100 Subject: dt-bindings: rtc: qcom-pm8xxx-rtc: update register numbers Extend registers up to 2, also document their names. Also fixes warnings generated by `make qcom/sdm845-oneplus-fajita.dtb`: arch/arm64/boot/dts/qcom/sdm845-oneplus-fajita.dt.yaml: rtc@6000: reg: [[24576], [24832]] is too long From schema: Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml arch/arm64/boot/dts/qcom/sdm845-oneplus-fajita.dt.yaml: rtc@6000: 'reg-names' does not match any of the regexes: 'pinctrl-[0-9]+' From schema: Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml Signed-off-by: David Heidelberg Reviewed-by: Rob Herring Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20211213192946.111320-1-david@ixit.cz --- Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml index 4fba6dba16f3..6fa7d9fc2dc7 100644 --- a/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/qcom-pm8xxx-rtc.yaml @@ -19,7 +19,14 @@ properties: - qcom,pmk8350-rtc reg: - maxItems: 1 + minItems: 1 + maxItems: 2 + + reg-names: + minItems: 1 + items: + - const: rtc + - const: alarm interrupts: maxItems: 1 -- cgit v1.2.3-58-ga151 From aa7069d840dab6bef4887657b94e439c82ae985a Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Thu, 16 Dec 2021 18:14:49 +0800 Subject: scsi: qedf: Fix potential dereference of NULL pointer The return value of dma_alloc_coherent() needs to be checked to avoid use of NULL pointer in case of an allocation failure. Link: https://lore.kernel.org/r/20211216101449.375953-1-jiasheng@iscas.ac.cn Fixes: 61d8658b4a43 ("scsi: qedf: Add QLogic FastLinQ offload FCoE driver framework.") Acked-by: Saurav Kashyap Signed-off-by: Jiasheng Jiang Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index 1bf7a22d4948..cdc66e2a9488 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -1415,6 +1415,8 @@ static void qedf_upload_connection(struct qedf_ctx *qedf, */ term_params = dma_alloc_coherent(&qedf->pdev->dev, QEDF_TERM_BUFF_SIZE, &term_params_dma, GFP_KERNEL); + if (!term_params) + return; QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_CONN, "Uploading connection " "port_id=%06x.\n", fcport->rdata->ids.port_id); -- cgit v1.2.3-58-ga151 From 4d516e495235a8e95b482a1e9441c4043c7a8235 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 22 Dec 2021 22:11:19 -0800 Subject: scsi: aacraid: Fix spelling of "its" Use the possessive "its" instead of the contraction "it's" in user messages. Link: https://lore.kernel.org/r/20211223061119.18304-1-rdunlap@infradead.org Cc: Adaptec OEM Raid Solutions Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Signed-off-by: Randy Dunlap Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/aachba.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index 59f6b7b2a70a..b04d039da276 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -271,7 +271,7 @@ MODULE_PARM_DESC(msi, "IRQ handling." " 0=PIC(default), 1=MSI, 2=MSI-X)"); module_param(startup_timeout, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(startup_timeout, "The duration of time in seconds to wait for" - " adapter to have it's kernel up and\n" + " adapter to have its kernel up and\n" "running. This is typically adjusted for large systems that do not" " have a BIOS."); module_param(aif_timeout, int, S_IRUGO|S_IWUSR); -- cgit v1.2.3-58-ga151 From 81d3f500ee98cf9f5388f0fb0548ba5a57598827 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 29 Sep 2021 18:17:44 +0900 Subject: scsi: core: Fix scsi_mode_select() interface The modepage argument is unused. Remove it. Link: https://lore.kernel.org/r/20210929091744.706003-3-damien.lemoal@wdc.com Reviewed-by: Himanshu Madhani Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 8 +++----- drivers/scsi/sd.c | 2 +- include/scsi/scsi_device.h | 5 ++--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 621d841d819a..6c2a35d31ecc 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2026,7 +2026,6 @@ void scsi_exit_queue(void) * @sdev: SCSI device to be queried * @pf: Page format bit (1 == standard, 0 == vendor specific) * @sp: Save page bit (0 == don't save, 1 == save) - * @modepage: mode page being requested * @buffer: request buffer (may not be smaller than eight bytes) * @len: length of request buffer. * @timeout: command timeout @@ -2039,10 +2038,9 @@ void scsi_exit_queue(void) * status on error * */ -int -scsi_mode_select(struct scsi_device *sdev, int pf, int sp, int modepage, - unsigned char *buffer, int len, int timeout, int retries, - struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) +int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, + unsigned char *buffer, int len, int timeout, int retries, + struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { unsigned char cmd[10]; unsigned char *real_buffer; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 2a50a840a00c..2e1250b899da 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -209,7 +209,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr, */ data.device_specific = 0; - if (scsi_mode_select(sdp, 1, sp, 8, buffer_data, len, SD_TIMEOUT, + if (scsi_mode_select(sdp, 1, sp, buffer_data, len, SD_TIMEOUT, sdkp->max_retries, &data, &sshdr)) { if (scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index d1c6fc83b1e3..7571e23d8d8a 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -415,9 +415,8 @@ extern int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, - int modepage, unsigned char *buffer, int len, - int timeout, int retries, - struct scsi_mode_data *data, + unsigned char *buffer, int len, int timeout, + int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *); extern int scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries, struct scsi_sense_hdr *sshdr); -- cgit v1.2.3-58-ga151 From 9211faa39a0350fb2239a0bce03b9459cd14fc40 Mon Sep 17 00:00:00 2001 From: Suganath Prabu S Date: Mon, 27 Dec 2021 11:00:55 +0530 Subject: scsi: mpt3sas: Update persistent trigger pages from sysfs interface Store sysfs-provided trigger values into the corresponding persistent trigger pages. Otherwise trigger entries are not persistent across system reboots. Link: https://lore.kernel.org/r/20211227053055.289537-1-suganath-prabu.subramani@broadcom.com Signed-off-by: Suganath Prabu S Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.h | 4 +- drivers/scsi/mpt3sas/mpt3sas_ctl.c | 87 +++++++++++++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index a0af986633d2..949e98d523e2 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -77,8 +77,8 @@ #define MPT3SAS_DRIVER_NAME "mpt3sas" #define MPT3SAS_AUTHOR "Avago Technologies " #define MPT3SAS_DESCRIPTION "LSI MPT Fusion SAS 3.0 Device Driver" -#define MPT3SAS_DRIVER_VERSION "39.100.00.00" -#define MPT3SAS_MAJOR_VERSION 39 +#define MPT3SAS_DRIVER_VERSION "40.100.00.00" +#define MPT3SAS_MAJOR_VERSION 40 #define MPT3SAS_MINOR_VERSION 100 #define MPT3SAS_BUILD_VERSION 0 #define MPT3SAS_RELEASE_VERSION 00 diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index 05b6c6a073c3..d92ca140d298 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -3533,11 +3533,31 @@ diag_trigger_master_store(struct device *cdev, { struct Scsi_Host *shost = class_to_shost(cdev); struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); + struct SL_WH_MASTER_TRIGGER_T *master_tg; unsigned long flags; ssize_t rc; + bool set = 1; - spin_lock_irqsave(&ioc->diag_trigger_lock, flags); rc = min(sizeof(struct SL_WH_MASTER_TRIGGER_T), count); + + if (ioc->supports_trigger_pages) { + master_tg = kzalloc(sizeof(struct SL_WH_MASTER_TRIGGER_T), + GFP_KERNEL); + if (!master_tg) + return -ENOMEM; + + memcpy(master_tg, buf, rc); + if (!master_tg->MasterData) + set = 0; + if (mpt3sas_config_update_driver_trigger_pg1(ioc, master_tg, + set)) { + kfree(master_tg); + return -EFAULT; + } + kfree(master_tg); + } + + spin_lock_irqsave(&ioc->diag_trigger_lock, flags); memset(&ioc->diag_trigger_master, 0, sizeof(struct SL_WH_MASTER_TRIGGER_T)); memcpy(&ioc->diag_trigger_master, buf, rc); @@ -3589,11 +3609,31 @@ diag_trigger_event_store(struct device *cdev, { struct Scsi_Host *shost = class_to_shost(cdev); struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); + struct SL_WH_EVENT_TRIGGERS_T *event_tg; unsigned long flags; ssize_t sz; + bool set = 1; - spin_lock_irqsave(&ioc->diag_trigger_lock, flags); sz = min(sizeof(struct SL_WH_EVENT_TRIGGERS_T), count); + if (ioc->supports_trigger_pages) { + event_tg = kzalloc(sizeof(struct SL_WH_EVENT_TRIGGERS_T), + GFP_KERNEL); + if (!event_tg) + return -ENOMEM; + + memcpy(event_tg, buf, sz); + if (!event_tg->ValidEntries) + set = 0; + if (mpt3sas_config_update_driver_trigger_pg2(ioc, event_tg, + set)) { + kfree(event_tg); + return -EFAULT; + } + kfree(event_tg); + } + + spin_lock_irqsave(&ioc->diag_trigger_lock, flags); + memset(&ioc->diag_trigger_event, 0, sizeof(struct SL_WH_EVENT_TRIGGERS_T)); memcpy(&ioc->diag_trigger_event, buf, sz); @@ -3644,11 +3684,31 @@ diag_trigger_scsi_store(struct device *cdev, { struct Scsi_Host *shost = class_to_shost(cdev); struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); + struct SL_WH_SCSI_TRIGGERS_T *scsi_tg; unsigned long flags; ssize_t sz; + bool set = 1; + + sz = min(sizeof(struct SL_WH_SCSI_TRIGGERS_T), count); + if (ioc->supports_trigger_pages) { + scsi_tg = kzalloc(sizeof(struct SL_WH_SCSI_TRIGGERS_T), + GFP_KERNEL); + if (!scsi_tg) + return -ENOMEM; + + memcpy(scsi_tg, buf, sz); + if (!scsi_tg->ValidEntries) + set = 0; + if (mpt3sas_config_update_driver_trigger_pg3(ioc, scsi_tg, + set)) { + kfree(scsi_tg); + return -EFAULT; + } + kfree(scsi_tg); + } spin_lock_irqsave(&ioc->diag_trigger_lock, flags); - sz = min(sizeof(ioc->diag_trigger_scsi), count); + memset(&ioc->diag_trigger_scsi, 0, sizeof(ioc->diag_trigger_scsi)); memcpy(&ioc->diag_trigger_scsi, buf, sz); if (ioc->diag_trigger_scsi.ValidEntries > NUM_VALID_ENTRIES) @@ -3698,11 +3758,30 @@ diag_trigger_mpi_store(struct device *cdev, { struct Scsi_Host *shost = class_to_shost(cdev); struct MPT3SAS_ADAPTER *ioc = shost_priv(shost); + struct SL_WH_MPI_TRIGGERS_T *mpi_tg; unsigned long flags; ssize_t sz; + bool set = 1; - spin_lock_irqsave(&ioc->diag_trigger_lock, flags); sz = min(sizeof(struct SL_WH_MPI_TRIGGERS_T), count); + if (ioc->supports_trigger_pages) { + mpi_tg = kzalloc(sizeof(struct SL_WH_MPI_TRIGGERS_T), + GFP_KERNEL); + if (!mpi_tg) + return -ENOMEM; + + memcpy(mpi_tg, buf, sz); + if (!mpi_tg->ValidEntries) + set = 0; + if (mpt3sas_config_update_driver_trigger_pg4(ioc, mpi_tg, + set)) { + kfree(mpi_tg); + return -EFAULT; + } + kfree(mpi_tg); + } + + spin_lock_irqsave(&ioc->diag_trigger_lock, flags); memset(&ioc->diag_trigger_mpi, 0, sizeof(ioc->diag_trigger_mpi)); memcpy(&ioc->diag_trigger_mpi, buf, sz); -- cgit v1.2.3-58-ga151 From 5867b8569e6480d3926b0508ee896532c59fde32 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 24 Dec 2021 17:52:40 +0000 Subject: scsi: mpi3mr: Fix some spelling mistakes There are some spelling mistakes in some literal strings. Fix them. Link: https://lore.kernel.org/r/20211224175240.1348942-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index c39dd4978c9d..eb07334dd43d 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -901,7 +901,7 @@ static const struct { }, { MPI3MR_RESET_FROM_SYSFS, "sysfs invocation" }, { MPI3MR_RESET_FROM_SYSFS_TIMEOUT, "sysfs TM timeout" }, - { MPI3MR_RESET_FROM_FIRMWARE, "firmware asynchronus reset" }, + { MPI3MR_RESET_FROM_FIRMWARE, "firmware asynchronous reset" }, }; /** @@ -1242,7 +1242,7 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc) ioc_state = mpi3mr_get_iocstate(mrioc); if (ioc_state == MRIOC_STATE_READY) { ioc_info(mrioc, - "successfully transistioned to %s state\n", + "successfully transitioned to %s state\n", mpi3mr_iocstate_name(ioc_state)); return 0; } @@ -3844,7 +3844,7 @@ retry_init: if (mrioc->shost->nr_hw_queues > mrioc->num_op_reply_q) { ioc_err(mrioc, - "cannot create minimum number of operatioanl queues expected:%d created:%d\n", + "cannot create minimum number of operational queues expected:%d created:%d\n", mrioc->shost->nr_hw_queues, mrioc->num_op_reply_q); goto out_failed_noretry; } -- cgit v1.2.3-58-ga151 From 3bb3c24e268ab64305eec670be253eef2238b013 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 31 Dec 2021 16:23:50 +0800 Subject: scsi: mpi3mr: Fix formatting problems in some kernel-doc comments Remove some warnings found by running scripts/kernel-doc, which is caused by using 'make W=1'. drivers/scsi/mpi3mr/mpi3mr_fw.c:2188: warning: Function parameter or member 'reason_code' not described in 'mpi3mr_check_rh_fault_ioc' drivers/scsi/mpi3mr/mpi3mr_fw.c:3650: warning: Excess function parameter 'init_type' description in 'mpi3mr_init_ioc' drivers/scsi/mpi3mr/mpi3mr_fw.c:4177: warning: bad line Link: https://lore.kernel.org/r/20211231082350.19315-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index eb07334dd43d..15bdc21ead66 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -2174,7 +2174,7 @@ out: * mpi3mr_check_rh_fault_ioc - check reset history and fault * controller * @mrioc: Adapter instance reference - * @reason_code, reason code for the fault. + * @reason_code: reason code for the fault. * * This routine will save snapdump and fault the controller with * the given reason code if it is not already in the fault or @@ -3633,7 +3633,6 @@ static int mpi3mr_enable_events(struct mpi3mr_ioc *mrioc) /** * mpi3mr_init_ioc - Initialize the controller * @mrioc: Adapter instance reference - * @init_type: Flag to indicate is the init_type * * This the controller initialization routine, executed either * after soft reset or from pci probe callback. @@ -4174,7 +4173,7 @@ static void mpi3mr_issue_ioc_shutdown(struct mpi3mr_ioc *mrioc) /** * mpi3mr_cleanup_ioc - Cleanup controller * @mrioc: Adapter instance reference - + * * controller cleanup handler, Message unit reset or soft reset * and shutdown notification is issued to the controller. * -- cgit v1.2.3-58-ga151 From ee05cb71f9f7eb0c257f6b80b70edb4659151b26 Mon Sep 17 00:00:00 2001 From: Ajish Koshy Date: Tue, 28 Dec 2021 16:47:53 +0530 Subject: scsi: pm80xx: Port reset timeout error handling correction Error handling steps were not in sequence as per the programmers manual. Expected sequence: - PHY_DOWN (PORT_IN_RESET) - PORT_RESET_TIMER_TMO - Host aborts pending I/Os - Host deregister the device - Host sends HW_EVENT_PHY_DOWN ACK Previously we were sending HW_EVENT_PHY_DOWN ACK first and then deregister the device. Fix this to use the expected sequence. Link: https://lore.kernel.org/r/20211228111753.10802-1-Ajish.Koshy@microchip.com Signed-off-by: Ajish Koshy Signed-off-by: Viswas G Signed-off-by: Martin K. Petersen --- drivers/scsi/pm8001/pm8001_sas.c | 7 ++++++- drivers/scsi/pm8001/pm8001_sas.h | 3 +++ drivers/scsi/pm8001/pm80xx_hwi.c | 7 +++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c index c9a16eef38c1..160ee8b228c9 100644 --- a/drivers/scsi/pm8001/pm8001_sas.c +++ b/drivers/scsi/pm8001/pm8001_sas.c @@ -1199,7 +1199,7 @@ int pm8001_abort_task(struct sas_task *task) struct pm8001_device *pm8001_dev; struct pm8001_tmf_task tmf_task; int rc = TMF_RESP_FUNC_FAILED, ret; - u32 phy_id; + u32 phy_id, port_id; struct sas_task_slow slow_task; if (unlikely(!task || !task->lldd_task || !task->dev)) @@ -1246,6 +1246,7 @@ int pm8001_abort_task(struct sas_task *task) DECLARE_COMPLETION_ONSTACK(completion_reset); DECLARE_COMPLETION_ONSTACK(completion); struct pm8001_phy *phy = pm8001_ha->phy + phy_id; + port_id = phy->port->port_id; /* 1. Set Device state as Recovery */ pm8001_dev->setds_completion = &completion; @@ -1297,6 +1298,10 @@ int pm8001_abort_task(struct sas_task *task) PORT_RESET_TMO); if (phy->port_reset_status == PORT_RESET_TMO) { pm8001_dev_gone_notify(dev); + PM8001_CHIP_DISP->hw_event_ack_req( + pm8001_ha, 0, + 0x07, /*HW_EVENT_PHY_DOWN ack*/ + port_id, phy_id, 0, 0); goto out; } } diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index 83eec16d021d..a17da1cebce1 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -216,6 +216,9 @@ struct pm8001_dispatch { u32 state); int (*sas_re_init_req)(struct pm8001_hba_info *pm8001_ha); int (*fatal_errors)(struct pm8001_hba_info *pm8001_ha); + void (*hw_event_ack_req)(struct pm8001_hba_info *pm8001_ha, + u32 Qnum, u32 SEA, u32 port_id, u32 phyId, u32 param0, + u32 param1); }; struct pm8001_chip_info { diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c index 0849ecc913c7..97750d0ebee9 100644 --- a/drivers/scsi/pm8001/pm80xx_hwi.c +++ b/drivers/scsi/pm8001/pm80xx_hwi.c @@ -3709,8 +3709,10 @@ static int mpi_hw_event(struct pm8001_hba_info *pm8001_ha, void *piomb) break; case HW_EVENT_PORT_RESET_TIMER_TMO: pm8001_dbg(pm8001_ha, MSG, "HW_EVENT_PORT_RESET_TIMER_TMO\n"); - pm80xx_hw_event_ack_req(pm8001_ha, 0, HW_EVENT_PHY_DOWN, - port_id, phy_id, 0, 0); + if (!pm8001_ha->phy[phy_id].reset_completion) { + pm80xx_hw_event_ack_req(pm8001_ha, 0, HW_EVENT_PHY_DOWN, + port_id, phy_id, 0, 0); + } sas_phy_disconnected(sas_phy); phy->phy_attached = 0; sas_notify_port_event(sas_phy, PORTE_LINK_RESET_ERR, @@ -5051,4 +5053,5 @@ const struct pm8001_dispatch pm8001_80xx_dispatch = { .fw_flash_update_req = pm8001_chip_fw_flash_update_req, .set_dev_state_req = pm8001_chip_set_dev_state_req, .fatal_errors = pm80xx_fatal_errors, + .hw_event_ack_req = pm80xx_hw_event_ack_req, }; -- cgit v1.2.3-58-ga151 From c3b48443ba7c4467d44f7faa16fea204ea6c239c Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Tue, 4 Jan 2022 11:24:52 +0000 Subject: scsi: aic79xx: Remove redundant error variable Return the value from ahd_linux_queue_abort_cmd() directly instead of using a redundant variable. Link: https://lore.kernel.org/r/20220104112452.601899-1-chi.minghao@zte.com.cn Reported-by: Zeal Robot Signed-off-by: Minghao Chi Signed-off-by: CGEL ZTE Signed-off-by: Martin K. Petersen --- drivers/scsi/aic7xxx/aic79xx_osm.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 5d566d2b2997..928099163f0f 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -755,11 +755,7 @@ ahd_linux_biosparam(struct scsi_device *sdev, struct block_device *bdev, static int ahd_linux_abort(struct scsi_cmnd *cmd) { - int error; - - error = ahd_linux_queue_abort_cmd(cmd); - - return error; + return ahd_linux_queue_abort_cmd(cmd); } /* -- cgit v1.2.3-58-ga151 From b5e7b59c3480f355910f9d2c6ece5857922a5e54 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Sep 2021 09:47:57 +1000 Subject: NFS: change nfs_access_get_cached to only report the mask Currently the nfs_access_get_cached family of functions report a 'struct nfs_access_entry' as the result, with both .mask and .cred set. However the .cred is never used. This is probably good and there is no guarantee that it won't be freed before use. Change to only report the 'mask' - as this is all that is used or needed. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 20 +++++++++----------- fs/nfs/nfs4proc.c | 18 +++++++++--------- include/linux/nfs_fs.h | 4 ++-- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 731d31015b6a..8487a6d69116 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2673,7 +2673,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co return NULL; } -static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, bool may_block) +static int nfs_access_get_cached_locked(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_access_entry *cache; @@ -2703,8 +2703,7 @@ static int nfs_access_get_cached_locked(struct inode *inode, const struct cred * spin_lock(&inode->i_lock); retry = false; } - res->cred = cache->cred; - res->mask = cache->mask; + *mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); err = 0; out: @@ -2716,7 +2715,7 @@ out_zap: return -ENOENT; } -static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res) +static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cred, u32 *mask) { /* Only check the most recently returned cache entry, * but do it without locking. @@ -2738,22 +2737,21 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; - res->cred = cache->cred; - res->mask = cache->mask; + *mask = cache->mask; err = 0; out: rcu_read_unlock(); return err; } -int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct -nfs_access_entry *res, bool may_block) +int nfs_access_get_cached(struct inode *inode, const struct cred *cred, + u32 *mask, bool may_block) { int status; - status = nfs_access_get_cached_rcu(inode, cred, res); + status = nfs_access_get_cached_rcu(inode, cred, mask); if (status != 0) - status = nfs_access_get_cached_locked(inode, cred, res, + status = nfs_access_get_cached_locked(inode, cred, mask, may_block); return status; @@ -2874,7 +2872,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) trace_nfs_access_enter(inode); - status = nfs_access_get_cached(inode, cred, &cache, may_block); + status = nfs_access_get_cached(inode, cred, &cache.mask, may_block); if (status == 0) goto out_cached; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ee3bc79f6ca3..322ff45ad15c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7611,7 +7611,7 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, const char *key, const void *buf, size_t buflen, int flags) { - struct nfs_access_entry cache; + u32 mask; int ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) @@ -7626,8 +7626,8 @@ static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, * do a cached access check for the XA* flags to possibly avoid * doing an RPC and getting EACCES back. */ - if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { - if (!(cache.mask & NFS_ACCESS_XAWRITE)) + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XAWRITE)) return -EACCES; } @@ -7648,14 +7648,14 @@ static int nfs4_xattr_get_nfs4_user(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *key, void *buf, size_t buflen) { - struct nfs_access_entry cache; + u32 mask; ssize_t ret; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return -EOPNOTSUPP; - if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { - if (!(cache.mask & NFS_ACCESS_XAREAD)) + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XAREAD)) return -EACCES; } @@ -7680,13 +7680,13 @@ nfs4_listxattr_nfs4_user(struct inode *inode, char *list, size_t list_len) ssize_t ret, size; char *buf; size_t buflen; - struct nfs_access_entry cache; + u32 mask; if (!nfs_server_capable(inode, NFS_CAP_XATTR)) return 0; - if (!nfs_access_get_cached(inode, current_cred(), &cache, true)) { - if (!(cache.mask & NFS_ACCESS_XALIST)) + if (!nfs_access_get_cached(inode, current_cred(), &mask, true)) { + if (!(mask & NFS_ACCESS_XALIST)) return 0; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 05f249f20f55..f33559acbcc2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -533,8 +533,8 @@ extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); extern void nfs_access_zap_cache(struct inode *inode); -extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, struct nfs_access_entry *res, - bool may_block); +extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, + u32 *mask, bool may_block); /* * linux/fs/nfs/symlink.c -- cgit v1.2.3-58-ga151 From 73fbb3fa647bdb5b60469af8101c741ece03a825 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Sep 2021 09:47:57 +1000 Subject: NFS: pass cred explicitly for access tests Storing the 'struct cred *' in nfs_access_entry is problematic. An active 'cred' can keep a 'struct key *' active, and a quota is imposed on the number of such keys that a user can maintain. Cached 'nfs_access_entry' structs have indefinite lifetime, and having these keep 'struct key's alive imposes on that quota. So a future patch will remove the ->cred ref from nfs_access_entry. To prepare, change various functions to not assume there is a 'cred' in the nfs_access_entry, but to pass the cred around explicitly. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 17 ++++++++++------- fs/nfs/nfs3proc.c | 5 +++-- fs/nfs/nfs4proc.c | 12 +++++++----- include/linux/nfs_fs.h | 2 +- include/linux/nfs_xdr.h | 2 +- 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8487a6d69116..a34351ce79a2 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2758,7 +2758,9 @@ int nfs_access_get_cached(struct inode *inode, const struct cred *cred, } EXPORT_SYMBOL_GPL(nfs_access_get_cached); -static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) +static void nfs_access_add_rbtree(struct inode *inode, + struct nfs_access_entry *set, + const struct cred *cred) { struct nfs_inode *nfsi = NFS_I(inode); struct rb_root *root_node = &nfsi->access_cache; @@ -2771,7 +2773,7 @@ static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry * while (*p != NULL) { parent = *p; entry = rb_entry(parent, struct nfs_access_entry, rb_node); - cmp = cred_fscmp(set->cred, entry->cred); + cmp = cred_fscmp(cred, entry->cred); if (cmp < 0) p = &parent->rb_left; @@ -2793,13 +2795,14 @@ found: nfs_access_free_entry(entry); } -void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) +void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set, + const struct cred *cred) { struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL); if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); - cache->cred = get_cred(set->cred); + cache->cred = get_cred(cred); cache->mask = set->mask; /* The above field assignments must be visible @@ -2807,7 +2810,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) * use rcu_assign_pointer, so just force the memory barrier. */ smp_wmb(); - nfs_access_add_rbtree(inode, cache); + nfs_access_add_rbtree(inode, cache, cred); /* Update accounting */ smp_mb__before_atomic(); @@ -2893,7 +2896,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) else cache.mask |= NFS_ACCESS_EXECUTE; cache.cred = cred; - status = NFS_PROTO(inode)->access(inode, &cache); + status = NFS_PROTO(inode)->access(inode, &cache, cred); if (status != 0) { if (status == -ESTALE) { if (!S_ISDIR(inode->i_mode)) @@ -2903,7 +2906,7 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) } goto out; } - nfs_access_add_cache(inode, &cache); + nfs_access_add_cache(inode, &cache, cred); out_cached: cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode); if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 7100514d306b..1597eef40d54 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -220,7 +220,8 @@ static int nfs3_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, task_flags); } -static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs3_accessargs arg = { .fh = NFS_FH(inode), @@ -231,7 +232,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) .rpc_proc = &nfs3_procedures[NFS3PROC_ACCESS], .rpc_argp = &arg, .rpc_resp = &res, - .rpc_cred = entry->cred, + .rpc_cred = cred, }; int status = -ENOMEM; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 322ff45ad15c..f02aa9877e6f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2655,7 +2655,7 @@ static int nfs4_opendata_access(const struct cred *cred, cache.cred = cred; nfs_access_set_mask(&cache, opendata->o_res.access_result); - nfs_access_add_cache(state->inode, &cache); + nfs_access_add_cache(state->inode, &cache, cred); flags = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE | NFS4_ACCESS_LOOKUP; if ((mask & ~cache.mask & flags) == 0) @@ -4441,7 +4441,8 @@ static int nfs4_proc_lookupp(struct inode *inode, struct nfs_fh *fhandle, return err; } -static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs_server *server = NFS_SERVER(inode); struct nfs4_accessargs args = { @@ -4455,7 +4456,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ACCESS], .rpc_argp = &args, .rpc_resp = &res, - .rpc_cred = entry->cred, + .rpc_cred = cred, }; int status = 0; @@ -4475,14 +4476,15 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry return status; } -static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry) +static int nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry, + const struct cred *cred) { struct nfs4_exception exception = { .interruptible = true, }; int err; do { - err = _nfs4_proc_access(inode, entry); + err = _nfs4_proc_access(inode, entry, cred); trace_nfs4_access(inode, err); err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f33559acbcc2..c33b1b792bc9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -396,7 +396,7 @@ extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fa extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); -extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *); +extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *, const struct cred *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct user_namespace *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 967a0098f0a9..9102bdaa3faa 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1737,7 +1737,7 @@ struct nfs_rpc_ops { struct nfs_fh *, struct nfs_fattr *); int (*lookupp) (struct inode *, struct nfs_fh *, struct nfs_fattr *); - int (*access) (struct inode *, struct nfs_access_entry *); + int (*access) (struct inode *, struct nfs_access_entry *, const struct cred *); int (*readlink)(struct inode *, struct page *, unsigned int, unsigned int); int (*create) (struct inode *, struct dentry *, -- cgit v1.2.3-58-ga151 From 6238aec83f3fb12132f964937e5bbcf248fea8f9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 28 Sep 2021 09:47:57 +1000 Subject: NFS: don't store 'struct cred *' in struct nfs_access_entry Storing the 'struct cred *' in nfs_access_entry is problematic. An active 'cred' can keep a 'struct key *' active, and a quota is imposed on the number of such keys that a user can maintain. Cached 'nfs_access_entry' structs have indefinite lifetime, and having these keep 'struct key's alive imposes on that quota. So remove the 'struct cred *' and replace it with the fields we need: kuid_t, kgid_t, and struct group_info * This makes the 'struct nfs_access_entry' 64 bits larger. New function "access_cmp" is introduced which is identical to cred_fscmp() except that the second arg is an 'nfs_access_entry', rather than a 'cred' Fixes: b68572e07c58 ("NFS: change access cache to use 'struct cred'.") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ fs/nfs/nfs4proc.c | 1 - include/linux/nfs_fs.h | 4 +++- 3 files changed, 47 insertions(+), 8 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a34351ce79a2..7dee3fd10382 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2528,7 +2528,7 @@ MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache lengt static void nfs_access_free_entry(struct nfs_access_entry *entry) { - put_cred(entry->cred); + put_group_info(entry->group_info); kfree_rcu(entry, rcu_head); smp_mb__before_atomic(); atomic_long_dec(&nfs_access_nr_entries); @@ -2654,6 +2654,43 @@ void nfs_access_zap_cache(struct inode *inode) } EXPORT_SYMBOL_GPL(nfs_access_zap_cache); +static int access_cmp(const struct cred *a, const struct nfs_access_entry *b) +{ + struct group_info *ga, *gb; + int g; + + if (uid_lt(a->fsuid, b->fsuid)) + return -1; + if (uid_gt(a->fsuid, b->fsuid)) + return 1; + + if (gid_lt(a->fsgid, b->fsgid)) + return -1; + if (gid_gt(a->fsgid, b->fsgid)) + return 1; + + ga = a->group_info; + gb = b->group_info; + if (ga == gb) + return 0; + if (ga == NULL) + return -1; + if (gb == NULL) + return 1; + if (ga->ngroups < gb->ngroups) + return -1; + if (ga->ngroups > gb->ngroups) + return 1; + + for (g = 0; g < ga->ngroups; g++) { + if (gid_lt(ga->gid[g], gb->gid[g])) + return -1; + if (gid_gt(ga->gid[g], gb->gid[g])) + return 1; + } + return 0; +} + static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, const struct cred *cred) { struct rb_node *n = NFS_I(inode)->access_cache.rb_node; @@ -2661,7 +2698,7 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, co while (n != NULL) { struct nfs_access_entry *entry = rb_entry(n, struct nfs_access_entry, rb_node); - int cmp = cred_fscmp(cred, entry->cred); + int cmp = access_cmp(cred, entry); if (cmp < 0) n = n->rb_left; @@ -2731,7 +2768,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre lh = rcu_dereference(list_tail_rcu(&nfsi->access_cache_entry_lru)); cache = list_entry(lh, struct nfs_access_entry, lru); if (lh == &nfsi->access_cache_entry_lru || - cred_fscmp(cred, cache->cred) != 0) + access_cmp(cred, cache) != 0) cache = NULL; if (cache == NULL) goto out; @@ -2773,7 +2810,7 @@ static void nfs_access_add_rbtree(struct inode *inode, while (*p != NULL) { parent = *p; entry = rb_entry(parent, struct nfs_access_entry, rb_node); - cmp = cred_fscmp(cred, entry->cred); + cmp = access_cmp(cred, entry); if (cmp < 0) p = &parent->rb_left; @@ -2802,7 +2839,9 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set, if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); - cache->cred = get_cred(cred); + cache->fsuid = cred->fsuid; + cache->fsgid = cred->fsgid; + cache->group_info = get_group_info(cred->group_info); cache->mask = set->mask; /* The above field assignments must be visible @@ -2895,7 +2934,6 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; else cache.mask |= NFS_ACCESS_EXECUTE; - cache.cred = cred; status = NFS_PROTO(inode)->access(inode, &cache, cred); if (status != 0) { if (status == -ESTALE) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f02aa9877e6f..5d9ff01ddf46 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2653,7 +2653,6 @@ static int nfs4_opendata_access(const struct cred *cred, } else if ((fmode & FMODE_READ) && !opendata->file_created) mask = NFS4_ACCESS_READ; - cache.cred = cred; nfs_access_set_mask(&cache, opendata->o_res.access_result); nfs_access_add_cache(state->inode, &cache, cred); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c33b1b792bc9..450e393d4bf2 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -61,7 +61,9 @@ struct nfs_access_entry { struct rb_node rb_node; struct list_head lru; - const struct cred * cred; + kuid_t fsuid; + kgid_t fsgid; + struct group_info *group_info; __u32 mask; struct rcu_head rcu_head; }; -- cgit v1.2.3-58-ga151 From 204975036b34f55237bc44c8a302a88468ef21b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 15 Dec 2021 16:38:15 -0500 Subject: NFS: Ensure the server has an up to date ctime before hardlinking Creating a hard link is required by POSIX to update the file ctime, so ensure that the file data is synced to disk so that we don't clobber the updated ctime by writing back after creating the hard link. Fixes: 9f7682728728 ("NFS: Move the delegation return down into nfs4_proc_link()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7dee3fd10382..3f8e5122d004 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2379,6 +2379,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) trace_nfs_link_enter(inode, dir, dentry); d_drop(dentry); + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); if (error == 0) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); -- cgit v1.2.3-58-ga151 From 6ff9d99bb88faebf134ca668842349d9718e5464 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 15 Dec 2021 16:38:16 -0500 Subject: NFS: Ensure the server has an up to date ctime before renaming Renaming a file is required by POSIX to update the file ctime, so ensure that the file data is synced to disk so that we don't clobber the updated ctime by writing back after creating the hard link. Fixes: f2c2c552f119 ("NFS: Move delegation recall into the NFSv4 callback for rename_setup()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3f8e5122d004..9883f72fdb6f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2470,6 +2470,8 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, } } + if (S_ISREG(old_inode->i_mode)) + nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); if (IS_ERR(task)) { error = PTR_ERR(task); -- cgit v1.2.3-58-ga151 From 4b0c359b813bbf115f5e2219ea8c0e4fad92400b Mon Sep 17 00:00:00 2001 From: Pierguido Lambri Date: Mon, 13 Dec 2021 08:38:48 +0000 Subject: SUNRPC: Add source address/port to rpc_socket* traces The rpc_socket* traces now show also the source address and port. An example is: kworker/u17:1-951 [005] 134218.925343: rpc_socket_close: socket:[46913] srcaddr=192.168.100.187:793 dstaddr=192.168.100.129:2049 state=4 (DISCONNECTING) sk_state=7 (CLOSE) kworker/u17:0-242 [006] 134360.841370: rpc_socket_connect: error=-115 socket:[56322] srcaddr=192.168.100.187:769 dstaddr=192.168.100.129:2049 state=2 (CONNECTING) sk_state=2 (SYN_SENT) -0 [006] 134360.841859: rpc_socket_state_change: socket:[56322] srcaddr=192.168.100.187:769 dstaddr=192.168.100.129:2049 state=2 (CONNECTING) sk_state=1 (ESTABLISHED) Signed-off-by: Pierguido Lambri Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 52 ++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 3a99358c262b..a6af347c935d 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -794,6 +794,9 @@ RPC_SHOW_SOCKET RPC_SHOW_SOCK + +#include + /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. @@ -816,27 +819,32 @@ DECLARE_EVENT_CLASS(xs_socket_event, __field(unsigned int, socket_state) __field(unsigned int, sock_state) __field(unsigned long long, ino) - __string(dstaddr, - xprt->address_strings[RPC_DISPLAY_ADDR]) - __string(dstport, - xprt->address_strings[RPC_DISPLAY_PORT]) + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( struct inode *inode = SOCK_INODE(socket); + const struct sock *sk = socket->sk; + const struct inet_sock *inet = inet_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS(__entry, inet, sk); + __entry->socket_state = socket->state; __entry->sock_state = socket->sk->sk_state; __entry->ino = (unsigned long long)inode->i_ino; - __assign_str(dstaddr, - xprt->address_strings[RPC_DISPLAY_ADDR]); - __assign_str(dstport, - xprt->address_strings[RPC_DISPLAY_PORT]); + ), TP_printk( - "socket:[%llu] dstaddr=%s/%s " + "socket:[%llu] srcaddr=%pISpc dstaddr=%pISpc " "state=%u (%s) sk_state=%u (%s)", - __entry->ino, __get_str(dstaddr), __get_str(dstport), + __entry->ino, + __entry->saddr, + __entry->daddr, __entry->socket_state, rpc_show_socket_state(__entry->socket_state), __entry->sock_state, @@ -866,29 +874,33 @@ DECLARE_EVENT_CLASS(xs_socket_event_done, __field(unsigned int, socket_state) __field(unsigned int, sock_state) __field(unsigned long long, ino) - __string(dstaddr, - xprt->address_strings[RPC_DISPLAY_ADDR]) - __string(dstport, - xprt->address_strings[RPC_DISPLAY_PORT]) + __array(__u8, saddr, sizeof(struct sockaddr_in6)) + __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( struct inode *inode = SOCK_INODE(socket); + const struct sock *sk = socket->sk; + const struct inet_sock *inet = inet_sk(sk); + + memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); + memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); + + TP_STORE_ADDR_PORTS(__entry, inet, sk); + __entry->socket_state = socket->state; __entry->sock_state = socket->sk->sk_state; __entry->ino = (unsigned long long)inode->i_ino; __entry->error = error; - __assign_str(dstaddr, - xprt->address_strings[RPC_DISPLAY_ADDR]); - __assign_str(dstport, - xprt->address_strings[RPC_DISPLAY_PORT]); ), TP_printk( - "error=%d socket:[%llu] dstaddr=%s/%s " + "error=%d socket:[%llu] srcaddr=%pISpc dstaddr=%pISpc " "state=%u (%s) sk_state=%u (%s)", __entry->error, - __entry->ino, __get_str(dstaddr), __get_str(dstport), + __entry->ino, + __entry->saddr, + __entry->daddr, __entry->socket_state, rpc_show_socket_state(__entry->socket_state), __entry->sock_state, -- cgit v1.2.3-58-ga151 From c72a826829ccfb38019187a3a5ba6d3584b7b7dc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 10 Aug 2021 18:31:01 -0500 Subject: nfs41: pnfs: filelayout: Replace one-element array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Refactor the code a bit according to the use of a flexible-array member in struct nfs4_file_layout_dsaddr instead of a one-element array, and use the struct_size() helper. This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). This issue was found with the help of Coccinelle and audited and fixed, manually. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/109 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.h | 2 +- fs/nfs/filelayout/filelayoutdev.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index 79323b5dab0c..aed0748fd6ec 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -51,7 +51,7 @@ struct nfs4_file_layout_dsaddr { u32 stripe_count; u8 *stripe_indices; u32 ds_num; - struct nfs4_pnfs_ds *ds_list[1]; + struct nfs4_pnfs_ds *ds_list[]; }; struct nfs4_filelayout_segment { diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 86c3f7e69ec4..acf4b88889dc 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -136,9 +136,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_stripe_indices; } - dsaddr = kzalloc(sizeof(*dsaddr) + - (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), - gfp_flags); + dsaddr = kzalloc(struct_size(dsaddr, ds_list, num), gfp_flags); if (!dsaddr) goto out_err_free_stripe_indices; -- cgit v1.2.3-58-ga151 From 35e0f9a9af4869a4b1a3943da08d3a29ac6c4a42 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Wed, 10 Nov 2021 09:32:17 +0000 Subject: sunrpc: Remove unneeded null check In g_verify_token_header, the null check of 'ret' is unneeded to be done twice. Signed-off-by: Xu Wang Signed-off-by: Anna Schumaker --- net/sunrpc/auth_gss/gss_generic_token.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c index fe97f3106536..4a4082bb22ad 100644 --- a/net/sunrpc/auth_gss/gss_generic_token.c +++ b/net/sunrpc/auth_gss/gss_generic_token.c @@ -222,10 +222,8 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size, if (ret) return ret; - if (!ret) { - *buf_in = buf; - *body_size = toksize; - } + *buf_in = buf; + *body_size = toksize; return ret; } -- cgit v1.2.3-58-ga151 From c4f0396688b5916b3a95bcb004d158634f2234ff Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 18 Nov 2021 17:37:41 +0800 Subject: SUNRPC: clean up some inconsistent indenting Eliminate the follow smatch warning: net/sunrpc/xprtsock.c:1912 xs_local_connect() warn: inconsistent indenting. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d8ee06a9650a..69b6ee5a5fd1 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1910,7 +1910,7 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); int ret; - if (RPC_IS_ASYNC(task)) { + if (RPC_IS_ASYNC(task)) { /* * We want the AF_LOCAL connect to be resolved in the * filesystem namespace of the process making the rpc -- cgit v1.2.3-58-ga151 From 2c52c8376db7160a1dd8a681c61c9258405ef143 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 29 Nov 2021 15:33:56 -0500 Subject: NFSv4 only print the label when its queried When the bitmask of the attributes doesn't include the security label, don't bother printing it. Since the label might not be null terminated, adjust the printing format accordingly. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 69862bf6db00..801119b7a596 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4200,10 +4200,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, } else printk(KERN_WARNING "%s: label too long (%u)!\n", __func__, len); + if (label && label->label) + dprintk("%s: label=%.*s, len=%d, PI=%d, LFS=%d\n", + __func__, label->len, (char *)label->label, + label->len, label->pi, label->lfs); } - if (label && label->label) - dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, - (char *)label->label, label->len, label->pi, label->lfs); return status; } -- cgit v1.2.3-58-ga151 From fbd2057e5329d3502a27491190237b6be52a1cb6 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Fri, 17 Dec 2021 01:01:33 +0800 Subject: nfs: nfs4clinet: check the return value of kstrdup() kstrdup() returns NULL when some internal memory errors happen, it is better to check the return value of it so to catch the memory error in time. Signed-off-by: Xiaoke Wang Signed-off-by: Anna Schumaker --- fs/nfs/nfs4client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index d8b5a250ca05..47a6cf892c95 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1343,8 +1343,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, } nfs_put_client(clp); - if (server->nfs_client->cl_hostname == NULL) + if (server->nfs_client->cl_hostname == NULL) { server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); + if (server->nfs_client->cl_hostname == NULL) + return -ENOMEM; + } nfs_server_insert_lists(server); return nfs_probe_server(server, NFS_FH(d_inode(server->super->s_root))); -- cgit v1.2.3-58-ga151 From b05bf5c63b326ce1da84ef42498d8e0e292e694c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 3 Jan 2022 14:50:16 -0500 Subject: NFSv4.1: Fix uninitialised variable in devicenotify When decode_devicenotify_args() exits with no entries, we need to ensure that the struct cb_devicenotifyargs is initialised to { 0, NULL } in order to avoid problems in nfs4_callback_devicenotify(). Reported-by: Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/callback.h | 2 +- fs/nfs/callback_proc.c | 2 +- fs/nfs/callback_xdr.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 6a2033131c06..ccd4f245cae2 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -170,7 +170,7 @@ struct cb_devicenotifyitem { }; struct cb_devicenotifyargs { - int ndevs; + uint32_t ndevs; struct cb_devicenotifyitem *devs; }; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 09c5b1cb3e07..c343666d9a42 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -358,7 +358,7 @@ __be32 nfs4_callback_devicenotify(void *argp, void *resp, struct cb_process_state *cps) { struct cb_devicenotifyargs *args = argp; - int i; + uint32_t i; __be32 res = 0; struct nfs_client *clp = cps->clp; struct nfs_server *server = NULL; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index a67c41ec545f..f90de8043b0f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -258,11 +258,9 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, void *argp) { struct cb_devicenotifyargs *args = argp; + uint32_t tmp, n, i; __be32 *p; __be32 status = 0; - u32 tmp; - int n, i; - args->ndevs = 0; /* Num of device notifications */ p = xdr_inline_decode(xdr, sizeof(uint32_t)); @@ -271,7 +269,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, goto out; } n = ntohl(*p++); - if (n <= 0) + if (n == 0) goto out; if (n > ULONG_MAX / sizeof(*args->devs)) { status = htonl(NFS4ERR_BADXDR); @@ -330,19 +328,21 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, dev->cbd_immediate = 0; } - args->ndevs++; - dprintk("%s: type %d layout 0x%x immediate %d\n", __func__, dev->cbd_notify_type, dev->cbd_layout_type, dev->cbd_immediate); } + args->ndevs = n; + dprintk("%s: ndevs %d\n", __func__, args->ndevs); + return 0; +err: + kfree(args->devs); out: + args->devs = NULL; + args->ndevs = 0; dprintk("%s: status %d ndevs %d\n", __func__, ntohl(status), args->ndevs); return status; -err: - kfree(args->devs); - goto out; } static __be32 decode_sessionid(struct xdr_stream *xdr, -- cgit v1.2.3-58-ga151 From 1ab5be4ac5b1c9ce39ce1037c45b68d2ce6eede0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:54 -0500 Subject: NFSv4: Add some support for case insensitive filesystems Add capabilities to allow the NFS client to recognise when it is dealing with case insensitive and case preserving filesystems. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 8 +++++++- fs/nfs/nfs4xdr.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/nfs_fs_sb.h | 2 ++ include/linux/nfs_xdr.h | 2 ++ 4 files changed, 51 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5d9ff01ddf46..a6d7472058ad 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3840,7 +3840,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_FH_EXPIRE_TYPE | FATTR4_WORD0_LINK_SUPPORT | FATTR4_WORD0_SYMLINK_SUPPORT | - FATTR4_WORD0_ACLSUPPORT; + FATTR4_WORD0_ACLSUPPORT | + FATTR4_WORD0_CASE_INSENSITIVE | + FATTR4_WORD0_CASE_PRESERVING; if (minorversion) bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; @@ -3869,6 +3871,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + if (res.case_insensitive) + server->caps |= NFS_CAP_CASE_INSENSITIVE; + if (res.case_preserving) + server->caps |= NFS_CAP_CASE_PRESERVING; #ifdef CONFIG_NFS_V4_SECURITY_LABEL if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) server->caps |= NFS_CAP_SECURITY_LABEL; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 801119b7a596..a9487c840052 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3533,6 +3533,42 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint return 0; } +static int decode_attr_case_insensitive(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_INSENSITIVE - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CASE_INSENSITIVE)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_CASE_INSENSITIVE; + } + dprintk("%s: case_insensitive=%s\n", __func__, *res == 0 ? "false" : "true"); + return 0; +} + +static int decode_attr_case_preserving(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) +{ + __be32 *p; + + *res = 0; + if (unlikely(bitmap[0] & (FATTR4_WORD0_CASE_PRESERVING - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_CASE_PRESERVING)) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + *res = be32_to_cpup(p); + bitmap[0] &= ~FATTR4_WORD0_CASE_PRESERVING; + } + dprintk("%s: case_preserving=%s\n", __func__, *res == 0 ? "false" : "true"); + return 0; +} + static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) { __be32 *p; @@ -4413,6 +4449,10 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re goto xdr_error; if ((status = decode_attr_aclsupport(xdr, bitmap, &res->acl_bitmask)) != 0) goto xdr_error; + if ((status = decode_attr_case_insensitive(xdr, bitmap, &res->case_insensitive)) != 0) + goto xdr_error; + if ((status = decode_attr_case_preserving(xdr, bitmap, &res->case_preserving)) != 0) + goto xdr_error; if ((status = decode_attr_exclcreat_supported(xdr, bitmap, res->exclcreat_bitmask)) != 0) goto xdr_error; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 2a9acbfe00f0..f24fc67af42d 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -271,6 +271,8 @@ struct nfs_server { #define NFS_CAP_ACLS (1U << 3) #define NFS_CAP_ATOMIC_OPEN (1U << 4) #define NFS_CAP_LGOPEN (1U << 5) +#define NFS_CAP_CASE_INSENSITIVE (1U << 6) +#define NFS_CAP_CASE_PRESERVING (1U << 7) #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) #define NFS_CAP_STATEID_NFSV41 (1U << 16) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9102bdaa3faa..ddff92396a3f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1194,6 +1194,8 @@ struct nfs4_server_caps_res { u32 has_links; u32 has_symlinks; u32 fh_expire_type; + u32 case_insensitive; + u32 case_preserving; }; #define NFS4_PATHNAME_MAXCOMPONENTS 512 -- cgit v1.2.3-58-ga151 From 98ca3ee60b9e4b039cc3ef21a169e775afa9bd0c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:55 -0500 Subject: NFSv4: Just don't cache negative dentries on case insensitive servers If the directory contents change, we cannot rely on the negative dentry being cacheable. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 9883f72fdb6f..7ebfe4e3cf85 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1436,6 +1436,9 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, return 0; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) return 1; + /* Case insensitive server? Revalidate negative dentries */ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + return 1; return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU); } -- cgit v1.2.3-58-ga151 From 8ce37abdeb4c73842d16620e1da151765ac86b5e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:56 -0500 Subject: NFS: Invalidate negative dentries on all case insensitive directory changes If we create a file, rename it, or hardlink it, then we need to assume that cached negative dentries need to be revalidated. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7ebfe4e3cf85..a691a1e364a7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1324,6 +1324,14 @@ void nfs_clear_verifier_delegated(struct inode *inode) EXPORT_SYMBOL_GPL(nfs_clear_verifier_delegated); #endif /* IS_ENABLED(CONFIG_NFS_V4) */ +static int nfs_dentry_verify_change(struct inode *dir, struct dentry *dentry) +{ + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE) && + d_really_is_negative(dentry)) + return dentry->d_time == inode_peek_iversion_raw(dir); + return nfs_verify_change_attribute(dir, dentry->d_time); +} + /* * A check for whether or not the parent directory has changed. * In the case it has, we assume that the dentries are untrustworthy @@ -1337,7 +1345,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) return 0; - if (!nfs_verify_change_attribute(dir, dentry->d_time)) + if (!nfs_dentry_verify_change(dir, dentry)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ if (nfs_mapping_need_revalidate_inode(dir)) { @@ -1346,7 +1354,7 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) return 0; } - if (!nfs_verify_change_attribute(dir, dentry->d_time)) + if (!nfs_dentry_verify_change(dir, dentry)) return 0; return 1; } @@ -1539,7 +1547,7 @@ out: * If the lookup failed despite the dentry change attribute being * a match, then we should revalidate the directory cache. */ - if (!ret && nfs_verify_change_attribute(dir, dentry->d_time)) + if (!ret && nfs_dentry_verify_change(dir, dentry)) nfs_mark_dir_for_revalidate(dir); return nfs_lookup_revalidate_done(dir, dentry, inode, ret); } @@ -1778,8 +1786,11 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in dir_verifier = nfs_save_change_attribute(dir); trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr); - if (error == -ENOENT) + if (error == -ENOENT) { + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + dir_verifier = inode_peek_iversion_raw(dir); goto no_entry; + } if (error < 0) { res = ERR_PTR(error); goto out; -- cgit v1.2.3-58-ga151 From 00bdadc7accfce944dc30fbc205cd28a7eed657b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:57 -0500 Subject: NFS: Add a helper to remove case-insensitive aliases When dealing with case insensitive names, the client has no idea how the server performs the mapping, so cannot collapse the dentries into a single representative. So both rename and unlink need to deal with the fact that there could be several dentries representing the file, and have to somehow force them to be revalidated. Use d_prune_aliases() as a big hammer approach. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 12 +++++++++++- fs/nfs/internal.h | 1 + fs/nfs/nfs4proc.c | 5 ++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a691a1e364a7..d2a58f7a73a6 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1819,6 +1819,14 @@ out: } EXPORT_SYMBOL_GPL(nfs_lookup); +void nfs_d_prune_case_insensitive_aliases(struct inode *inode) +{ + /* Case insensitive server? Revalidate dentries */ + if (inode && nfs_server_capable(inode, NFS_CAP_CASE_INSENSITIVE)) + d_prune_aliases(inode); +} +EXPORT_SYMBOL_GPL(nfs_d_prune_case_insensitive_aliases); + #if IS_ENABLED(CONFIG_NFS_V4) static int nfs4_lookup_revalidate(struct dentry *, unsigned int); @@ -2199,8 +2207,10 @@ static void nfs_dentry_remove_handle_error(struct inode *dir, switch (error) { case -ENOENT: d_delete(dentry); - fallthrough; + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + break; case 0: + nfs_d_prune_case_insensitive_aliases(d_inode(dentry)); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); } } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 12f6acb483bb..2de7c56a1fbe 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -373,6 +373,7 @@ extern unsigned long nfs_access_cache_count(struct shrinker *shrink, extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); +void nfs_d_prune_case_insensitive_aliases(struct inode *inode); int nfs_create(struct user_namespace *, struct inode *, struct dentry *, umode_t, bool); int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry *, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a6d7472058ad..7df2eb74025f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4670,8 +4670,10 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, nfs_fattr_init(res->dir_attr); - if (inode) + if (inode) { nfs4_inode_return_delegation(inode); + nfs_d_prune_case_insensitive_aliases(inode); + } } static void nfs4_proc_unlink_rpc_prepare(struct rpc_task *task, struct nfs_unlinkdata *data) @@ -4737,6 +4739,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, return 0; if (task->tk_status == 0) { + nfs_d_prune_case_insensitive_aliases(d_inode(data->old_dentry)); if (new_dir != old_dir) { /* Note: If we moved a directory, nlink will change */ nfs4_update_changeattr(old_dir, &res->old_cinfo, -- cgit v1.2.3-58-ga151 From 68eaba4ca924a97a863c5c81c0b23a11dcb6db90 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Dec 2021 15:36:58 -0500 Subject: NFS: Fix the verifier for case sensitive filesystem in nfs_atomic_open() Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d2a58f7a73a6..2884138848f4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1888,6 +1888,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; unsigned int lookup_flags = 0; + unsigned long dir_verifier; bool switched = false; int created = 0; int err; @@ -1961,7 +1962,11 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, switch (err) { case -ENOENT: d_splice_alias(NULL, dentry); - nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); + if (nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + dir_verifier = inode_peek_iversion_raw(dir); + else + dir_verifier = nfs_save_change_attribute(dir); + nfs_set_verifier(dentry, dir_verifier); break; case -EISDIR: case -ENOTDIR: -- cgit v1.2.3-58-ga151 From 01f34245722b9dac0b2667db4dc17d2049b99c76 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 28 Dec 2021 15:41:38 +0100 Subject: NFS: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the NFS code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Cc: Trond Myklebust Cc: Anna Schumaker Cc: linux-nfs@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Anna Schumaker --- fs/nfs/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 8cb70755e3c9..a6f740366963 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -142,10 +142,11 @@ static struct attribute *nfs_netns_client_attrs[] = { &nfs_netns_client_id.attr, NULL, }; +ATTRIBUTE_GROUPS(nfs_netns_client); static struct kobj_type nfs_netns_client_type = { .release = nfs_netns_client_release, - .default_attrs = nfs_netns_client_attrs, + .default_groups = nfs_netns_client_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = nfs_netns_client_namespace, }; -- cgit v1.2.3-58-ga151 From 86439fa2678d1ae752ac9c787aac1c145b87b4c2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 28 Dec 2021 15:48:23 +0100 Subject: SUNRPC: use default_groups in kobj_type There are currently 2 ways to create a set of sysfs files for a kobj_type, through the default_attrs field, and the default_groups field. Move the sunrpc sysfs code to use default_groups field which has been the preferred way since aa30f47cf666 ("kobject: Add support for default attribute groups to kobj_type") so that we can soon get rid of the obsolete default_attrs field. Cc: "J. Bruce Fields" Cc: Chuck Lever Cc: Trond Myklebust Cc: Anna Schumaker Cc: linux-nfs@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 2766dd21935b..b1aea3419218 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -422,6 +422,7 @@ static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_change_state.attr, NULL, }; +ATTRIBUTE_GROUPS(rpc_sysfs_xprt); static struct kobj_attribute rpc_sysfs_xprt_switch_info = __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); @@ -430,6 +431,7 @@ static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { &rpc_sysfs_xprt_switch_info.attr, NULL, }; +ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch); static struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, @@ -439,14 +441,14 @@ static struct kobj_type rpc_sysfs_client_type = { static struct kobj_type rpc_sysfs_xprt_switch_type = { .release = rpc_sysfs_xprt_switch_release, - .default_attrs = rpc_sysfs_xprt_switch_attrs, + .default_groups = rpc_sysfs_xprt_switch_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_switch_namespace, }; static struct kobj_type rpc_sysfs_xprt_type = { .release = rpc_sysfs_xprt_release, - .default_attrs = rpc_sysfs_xprt_attrs, + .default_groups = rpc_sysfs_xprt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_namespace, }; -- cgit v1.2.3-58-ga151 From 85847280b11666ae24aac519e06f0742fab72064 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 27 Dec 2021 14:40:51 -0500 Subject: NFSv4: Allow writebacks to request 'blocks used' When doing a non-pNFS write, allow the writeback code to specify that it also needs to update 'blocks used'. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 2 ++ fs/nfs/nfs4proc.c | 21 +++++++-------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index ed5eaca6801e..89076dd32334 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -315,6 +315,8 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, fmode_t fmode); +extern void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], + struct inode *inode, unsigned long cache_validity); extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct inode *inode); extern int update_open_stateid(struct nfs4_state *state, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7df2eb74025f..44ab27f20f04 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -108,10 +108,6 @@ static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, const struct cred *, bool); #endif -static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], - const __u32 *src, struct inode *inode, - struct nfs_server *server, - struct nfs4_label *label); #ifdef CONFIG_NFS_V4_SECURITY_LABEL static inline struct nfs4_label * @@ -3669,7 +3665,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!nfs4_have_delegation(inode, FMODE_READ)) { nfs4_bitmask_set(calldata->arg.bitmask_store, server->cache_consistency_bitmask, - inode, server, NULL); + inode, 0); calldata->arg.bitmask = calldata->arg.bitmask_store; } else calldata->arg.bitmask = NULL; @@ -5432,14 +5428,14 @@ bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr) return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; } -static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, - struct inode *inode, struct nfs_server *server, - struct nfs4_label *label) +void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[], + struct inode *inode, unsigned long cache_validity) { - unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + struct nfs_server *server = NFS_SERVER(inode); unsigned int i; memcpy(bitmask, src, sizeof(*bitmask) * NFS4_BITMASK_SZ); + cache_validity |= READ_ONCE(NFS_I(inode)->cache_validity); if (cache_validity & NFS_INO_INVALID_CHANGE) bitmask[0] |= FATTR4_WORD0_CHANGE; @@ -5451,8 +5447,6 @@ static void nfs4_bitmask_set(__u32 bitmask[NFS4_BITMASK_SZ], const __u32 *src, bitmask[1] |= FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP; if (cache_validity & NFS_INO_INVALID_NLINK) bitmask[1] |= FATTR4_WORD1_NUMLINKS; - if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) - bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; if (cache_validity & NFS_INO_INVALID_CTIME) bitmask[1] |= FATTR4_WORD1_TIME_METADATA; if (cache_validity & NFS_INO_INVALID_MTIME) @@ -5479,7 +5473,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr, } else { nfs4_bitmask_set(hdr->args.bitmask_store, server->cache_consistency_bitmask, - hdr->inode, server, NULL); + hdr->inode, NFS_INO_INVALID_BLOCKS); hdr->args.bitmask = hdr->args.bitmask_store; } @@ -6517,8 +6511,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; nfs4_bitmask_set(data->args.bitmask_store, - server->cache_consistency_bitmask, inode, server, - NULL); + server->cache_consistency_bitmask, inode, 0); data->args.bitmask = data->args.bitmask_store; nfs_copy_fh(&data->fh, NFS_FH(inode)); nfs4_stateid_copy(&data->stateid, stateid); -- cgit v1.2.3-58-ga151 From 34bf20ce986c441c1088ed09a33e0bb96e52f99a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 27 Dec 2021 14:40:52 -0500 Subject: NFSv42: Fallocate and clone should also request 'blocks used' Both fallocate and clone can end up updating the blocks used attribute. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs42proc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 8b21ff1be717..32129446beca 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -46,7 +46,7 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, { struct inode *inode = file_inode(filep); struct nfs_server *server = NFS_SERVER(inode); - u32 bitmask[3]; + u32 bitmask[NFS_BITMASK_SZ]; struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, @@ -69,9 +69,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, return status; } - memcpy(bitmask, server->cache_consistency_bitmask, sizeof(bitmask)); - if (server->attr_bitmask[1] & FATTR4_WORD1_SPACE_USED) - bitmask[1] |= FATTR4_WORD1_SPACE_USED; + nfs4_bitmask_set(bitmask, server->cache_consistency_bitmask, inode, + NFS_INO_INVALID_BLOCKS); res.falloc_fattr = nfs_alloc_fattr(); if (!res.falloc_fattr) @@ -1044,13 +1043,14 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct inode *src_inode = file_inode(src_f); struct inode *dst_inode = file_inode(dst_f); struct nfs_server *server = NFS_SERVER(dst_inode); + __u32 dst_bitmask[NFS_BITMASK_SZ]; struct nfs42_clone_args args = { .src_fh = NFS_FH(src_inode), .dst_fh = NFS_FH(dst_inode), .src_offset = src_offset, .dst_offset = dst_offset, .count = count, - .dst_bitmask = server->cache_consistency_bitmask, + .dst_bitmask = dst_bitmask, }; struct nfs42_clone_res res = { .server = server, @@ -1079,6 +1079,9 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, if (!res.dst_fattr) return -ENOMEM; + nfs4_bitmask_set(dst_bitmask, server->cache_consistency_bitmask, + dst_inode, NFS_INO_INVALID_BLOCKS); + status = nfs4_call_sync(server->client, server, msg, &args.seq_args, &res.seq_res, 0); trace_nfs4_clone(src_inode, dst_inode, &args, status); -- cgit v1.2.3-58-ga151 From 5d9224fb076e9a2023e0b06d6a164d644612c0c0 Mon Sep 17 00:00:00 2001 From: Xiang Chen Date: Tue, 4 Jan 2022 20:42:06 +0800 Subject: scsi: hisi_sas: Remove unused variable and check in hisi_sas_send_ata_reset_each_phy() In commit 29e2bac87421 ("scsi: hisi_sas: Fix some issues related to asd_sas_port->phy_list"), we use asd_sas_port->phy_mask instead of accessing asd_sas_port->phy_list, and it is enough to use asd_sas_port->phy_mask to check the state of phy, so remove the unused check and variable. Link: https://lore.kernel.org/r/1641300126-53574-1-git-send-email-chenxiang66@hisilicon.com Fixes: 29e2bac87421 ("scsi: hisi_sas: Fix some issues related to asd_sas_port->phy_list") Reported-by: Nathan Chancellor Reported-by: Colin King Acked-by: John Garry Signed-off-by: Xiang Chen Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index f46f679fe825..a05ec7aece5a 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1525,16 +1525,11 @@ static void hisi_sas_send_ata_reset_each_phy(struct hisi_hba *hisi_hba, struct device *dev = hisi_hba->dev; int s = sizeof(struct host_to_dev_fis); int rc = TMF_RESP_FUNC_FAILED; - struct asd_sas_phy *sas_phy; struct ata_link *link; u8 fis[20] = {0}; - u32 state; int i; - state = hisi_hba->hw->get_phys_state(hisi_hba); for (i = 0; i < hisi_hba->n_phy; i++) { - if (!(state & BIT(sas_phy->id))) - continue; if (!(sas_port->phy_mask & BIT(i))) continue; -- cgit v1.2.3-58-ga151 From 315d049ad1951cef02d9337a2469cac51cca6932 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 5 Jan 2022 09:36:33 -0800 Subject: scsi: megaraid: Avoid mismatched storage type sizes Remove needless use of mbox_t, replacing with just struct mbox_out. Silences compiler warnings under a -Warray-bounds build: drivers/scsi/megaraid.c: In function 'megaraid_probe_one': drivers/scsi/megaraid.c:3615:30: error: array subscript 'mbox_t[0]' is partly outside array bounds of 'unsigned char[15]' [-Werror=array-bounds] 3615 | mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/scsi/megaraid.c:3599:23: note: while referencing 'raw_mbox' 3599 | unsigned char raw_mbox[sizeof(struct mbox_out)]; | ^~~~~~~~ Link: https://lore.kernel.org/r/20220105173633.2421129-1-keescook@chromium.org Cc: Kashyap Desai Cc: Sumit Saxena Cc: Shivasharan S Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: megaraidlinux.pdl@broadcom.com Cc: linux-scsi@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid.c | 84 ++++++++++++++++++++----------------------------- 1 file changed, 34 insertions(+), 50 deletions(-) diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index 0d31d7a5e335..bf987f3a7f3f 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -192,23 +192,21 @@ mega_query_adapter(adapter_t *adapter) { dma_addr_t prod_info_dma_handle; mega_inquiry3 *inquiry3; - u8 raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + u8 *raw_mbox = (u8 *)&mbox; int retval; /* Initialize adapter inquiry mailbox */ - mbox = (mbox_t *)raw_mbox; - memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); /* * Try to issue Inquiry3 command * if not succeeded, then issue MEGA_MBOXCMD_ADAPTERINQ command and * update enquiry3 structure */ - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; inquiry3 = (mega_inquiry3 *)adapter->mega_buffer; @@ -232,10 +230,10 @@ mega_query_adapter(adapter_t *adapter) inq = &ext_inq->raid_inq; - mbox->m_out.xferaddr = (u32)dma_handle; + mbox.xferaddr = (u32)dma_handle; /*issue old 0x04 command to adapter */ - mbox->m_out.cmd = MEGA_MBOXCMD_ADPEXTINQ; + mbox.cmd = MEGA_MBOXCMD_ADPEXTINQ; issue_scb_block(adapter, raw_mbox); @@ -262,7 +260,7 @@ mega_query_adapter(adapter_t *adapter) sizeof(mega_product_info), DMA_FROM_DEVICE); - mbox->m_out.xferaddr = prod_info_dma_handle; + mbox.xferaddr = prod_info_dma_handle; raw_mbox[0] = FC_NEW_CONFIG; /* i.e. mbox->cmd=0xA1 */ raw_mbox[2] = NC_SUBOP_PRODUCT_INFO; /* i.e. 0x0E */ @@ -3569,16 +3567,14 @@ mega_n_to_m(void __user *arg, megacmd_t *mc) static int mega_is_bios_enabled(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; - - mbox = (mbox_t *)raw_mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; raw_mbox[0] = IS_BIOS_ENABLED; raw_mbox[2] = GET_BIOS; @@ -3600,13 +3596,11 @@ mega_is_bios_enabled(adapter_t *adapter) static void mega_enum_raid_scsi(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; int i; - mbox = (mbox_t *)raw_mbox; - - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); /* * issue command to find out what channels are raid/scsi @@ -3616,7 +3610,7 @@ mega_enum_raid_scsi(adapter_t *adapter) memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; /* * Non-ROMB firmware fail this command, so all channels @@ -3655,23 +3649,21 @@ static void mega_get_boot_drv(adapter_t *adapter) { struct private_bios_data *prv_bios_data; - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; u16 cksum = 0; u8 *cksum_p; u8 boot_pdrv; int i; - mbox = (mbox_t *)raw_mbox; - - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); raw_mbox[0] = BIOS_PVT_DATA; raw_mbox[2] = GET_BIOS_PVT_DATA; memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; adapter->boot_ldrv_enabled = 0; adapter->boot_ldrv = 0; @@ -3721,13 +3713,11 @@ mega_get_boot_drv(adapter_t *adapter) static int mega_support_random_del(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; int rval; - mbox = (mbox_t *)raw_mbox; - - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); /* * issue command @@ -3750,13 +3740,11 @@ mega_support_random_del(adapter_t *adapter) static int mega_support_ext_cdb(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; int rval; - mbox = (mbox_t *)raw_mbox; - - memset(&mbox->m_out, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); /* * issue command to find out if controller supports extended CDBs. */ @@ -3865,16 +3853,14 @@ mega_do_del_logdrv(adapter_t *adapter, int logdrv) static void mega_get_max_sgl(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; - mbox = (mbox_t *)raw_mbox; - - memset(mbox, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; raw_mbox[0] = MAIN_MISC_OPCODE; raw_mbox[2] = GET_MAX_SG_SUPPORT; @@ -3888,7 +3874,7 @@ mega_get_max_sgl(adapter_t *adapter) } else { adapter->sglen = *((char *)adapter->mega_buffer); - + /* * Make sure this is not more than the resources we are * planning to allocate @@ -3910,16 +3896,14 @@ mega_get_max_sgl(adapter_t *adapter) static int mega_support_cluster(adapter_t *adapter) { - unsigned char raw_mbox[sizeof(struct mbox_out)]; - mbox_t *mbox; - - mbox = (mbox_t *)raw_mbox; + struct mbox_out mbox; + unsigned char *raw_mbox = (u8 *)&mbox; - memset(mbox, 0, sizeof(raw_mbox)); + memset(&mbox, 0, sizeof(mbox)); memset((void *)adapter->mega_buffer, 0, MEGA_BUFFER_SIZE); - mbox->m_out.xferaddr = (u32)adapter->buf_dma_handle; + mbox.xferaddr = (u32)adapter->buf_dma_handle; /* * Try to get the initiator id. This command will succeed iff the -- cgit v1.2.3-58-ga151 From ac795161c93699d600db16c1a8cc23a65a1eceaf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 6 Jan 2022 18:24:02 -0500 Subject: NFSv4: Handle case where the lookup of a directory fails If the application sets the O_DIRECTORY flag, and tries to open a regular file, nfs_atomic_open() will punt to doing a regular lookup. If the server then returns a regular file, we will happily return a file descriptor with uninitialised open state. The fix is to return the expected ENOTDIR error in these cases. Reported-by: Lyu Tao Fixes: 0dd2b474d0b6 ("nfs: implement i_op->atomic_open()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2884138848f4..408c3bb549b1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1994,6 +1994,19 @@ out: no_open: res = nfs_lookup(dir, dentry, lookup_flags); + if (!res) { + inode = d_inode(dentry); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !S_ISDIR(inode->i_mode)) + res = ERR_PTR(-ENOTDIR); + } else if (!IS_ERR(res)) { + inode = d_inode(res); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !S_ISDIR(inode->i_mode)) { + dput(res); + res = ERR_PTR(-ENOTDIR); + } + } if (switched) { d_lookup_done(dentry); if (!res) -- cgit v1.2.3-58-ga151 From 1751fc1db36f6f411709e143d5393f92d12137a9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 6 Jan 2022 18:24:03 -0500 Subject: NFSv4: nfs_atomic_open() can race when looking up a non-regular file If the file type changes back to being a regular file on the server between the failed OPEN and our LOOKUP, then we need to re-run the OPEN. Fixes: 0dd2b474d0b6 ("nfs: implement i_op->atomic_open()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 408c3bb549b1..5df75ed09268 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1999,12 +1999,17 @@ no_open: if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !S_ISDIR(inode->i_mode)) res = ERR_PTR(-ENOTDIR); + else if (inode && S_ISREG(inode->i_mode)) + res = ERR_PTR(-EOPENSTALE); } else if (!IS_ERR(res)) { inode = d_inode(res); if ((lookup_flags & LOOKUP_DIRECTORY) && inode && !S_ISDIR(inode->i_mode)) { dput(res); res = ERR_PTR(-ENOTDIR); + } else if (inode && S_ISREG(inode->i_mode)) { + dput(res); + res = ERR_PTR(-EOPENSTALE); } } if (switched) { -- cgit v1.2.3-58-ga151 From b114dda6f2f10cc8b2ddcde3285a576fe3f12c5d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:05 +0100 Subject: scsi: message: fusion: Remove usage of the deprecated "pci-dma-compat.h" API In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. A coccinelle script has been used to perform the needed transformation. It can be found in [3]. In this patch, all functions but pci_alloc_consistent() are handled. pci_alloc_consistent() needs more attention and explanation. [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ [3]: https://lore.kernel.org/kernel-janitors/20200716192821.321233-1-christophe.jaillet@wanadoo.fr/ Link: https://lore.kernel.org/r/e38e897fbd3314718315b0e357c824e3f01775d6.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptbase.c | 94 +++++++++++++++++++++------------------- drivers/message/fusion/mptctl.c | 51 +++++++++++++--------- drivers/message/fusion/mptlan.c | 90 ++++++++++++++++++++------------------ drivers/message/fusion/mptsas.c | 51 +++++++++++----------- 4 files changed, 154 insertions(+), 132 deletions(-) diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 24a4532053e4..5a3b7b56e85a 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -316,8 +316,8 @@ mpt_is_discovery_complete(MPT_ADAPTER *ioc) rc = 1; out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return rc; } @@ -1661,16 +1661,14 @@ mpt_mapresources(MPT_ADAPTER *ioc) const uint64_t required_mask = dma_get_required_mask (&pdev->dev); if (required_mask > DMA_BIT_MASK(32) - && !pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) - && !pci_set_consistent_dma_mask(pdev, - DMA_BIT_MASK(64))) { + && !dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) + && !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) { ioc->dma_mask = DMA_BIT_MASK(64); dinitprintk(ioc, printk(MYIOC_s_INFO_FMT ": 64 BIT PCI BUS DMA ADDRESSING SUPPORTED\n", ioc->name)); - } else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) - && !pci_set_consistent_dma_mask(pdev, - DMA_BIT_MASK(32))) { + } else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)) + && !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32))) { ioc->dma_mask = DMA_BIT_MASK(32); dinitprintk(ioc, printk(MYIOC_s_INFO_FMT ": 32 BIT PCI BUS DMA ADDRESSING SUPPORTED\n", @@ -1681,9 +1679,8 @@ mpt_mapresources(MPT_ADAPTER *ioc) goto out_pci_release_region; } } else { - if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) - && !pci_set_consistent_dma_mask(pdev, - DMA_BIT_MASK(32))) { + if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)) + && !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32))) { ioc->dma_mask = DMA_BIT_MASK(32); dinitprintk(ioc, printk(MYIOC_s_INFO_FMT ": 32 BIT PCI BUS DMA ADDRESSING SUPPORTED\n", @@ -2769,9 +2766,9 @@ mpt_adapter_disable(MPT_ADAPTER *ioc) if (ioc->spi_data.pIocPg4 != NULL) { sz = ioc->spi_data.IocPg4Sz; - pci_free_consistent(ioc->pcidev, sz, - ioc->spi_data.pIocPg4, - ioc->spi_data.IocPg4_dma); + dma_free_coherent(&ioc->pcidev->dev, sz, + ioc->spi_data.pIocPg4, + ioc->spi_data.IocPg4_dma); ioc->spi_data.pIocPg4 = NULL; ioc->alloc_total -= sz; } @@ -3548,7 +3545,8 @@ mpt_free_fw_memory(MPT_ADAPTER *ioc) sz = ioc->facts.FWImageSize; dinitprintk(ioc, printk(MYIOC_s_DEBUG_FMT "free_fw_memory: FW Image @ %p[%p], sz=%d[%x] bytes\n", ioc->name, ioc->cached_fw, (void *)(ulong)ioc->cached_fw_dma, sz, sz)); - pci_free_consistent(ioc->pcidev, sz, ioc->cached_fw, ioc->cached_fw_dma); + dma_free_coherent(&ioc->pcidev->dev, sz, ioc->cached_fw, + ioc->cached_fw_dma); ioc->alloc_total -= sz; ioc->cached_fw = NULL; } @@ -4447,9 +4445,8 @@ PrimeIocFifos(MPT_ADAPTER *ioc) */ if (ioc->pcidev->device == MPI_MANUFACTPAGE_DEVID_SAS1078 && ioc->dma_mask > DMA_BIT_MASK(35)) { - if (!pci_set_dma_mask(ioc->pcidev, DMA_BIT_MASK(32)) - && !pci_set_consistent_dma_mask(ioc->pcidev, - DMA_BIT_MASK(32))) { + if (!dma_set_mask(&ioc->pcidev->dev, DMA_BIT_MASK(32)) + && !dma_set_coherent_mask(&ioc->pcidev->dev, DMA_BIT_MASK(32))) { dma_mask = DMA_BIT_MASK(35); d36memprintk(ioc, printk(MYIOC_s_DEBUG_FMT "setting 35 bit addressing for " @@ -4457,10 +4454,10 @@ PrimeIocFifos(MPT_ADAPTER *ioc) ioc->name)); } else { /*Reseting DMA mask to 64 bit*/ - pci_set_dma_mask(ioc->pcidev, - DMA_BIT_MASK(64)); - pci_set_consistent_dma_mask(ioc->pcidev, - DMA_BIT_MASK(64)); + dma_set_mask(&ioc->pcidev->dev, + DMA_BIT_MASK(64)); + dma_set_coherent_mask(&ioc->pcidev->dev, + DMA_BIT_MASK(64)); printk(MYIOC_s_ERR_FMT "failed setting 35 bit addressing for " @@ -4595,8 +4592,8 @@ PrimeIocFifos(MPT_ADAPTER *ioc) alloc_dma += ioc->reply_sz; } - if (dma_mask == DMA_BIT_MASK(35) && !pci_set_dma_mask(ioc->pcidev, - ioc->dma_mask) && !pci_set_consistent_dma_mask(ioc->pcidev, + if (dma_mask == DMA_BIT_MASK(35) && !dma_set_mask(&ioc->pcidev->dev, + ioc->dma_mask) && !dma_set_coherent_mask(&ioc->pcidev->dev, ioc->dma_mask)) d36memprintk(ioc, printk(MYIOC_s_DEBUG_FMT "restoring 64 bit addressing\n", ioc->name)); @@ -4620,8 +4617,8 @@ out_fail: ioc->sense_buf_pool = NULL; } - if (dma_mask == DMA_BIT_MASK(35) && !pci_set_dma_mask(ioc->pcidev, - DMA_BIT_MASK(64)) && !pci_set_consistent_dma_mask(ioc->pcidev, + if (dma_mask == DMA_BIT_MASK(35) && !dma_set_mask(&ioc->pcidev->dev, + DMA_BIT_MASK(64)) && !dma_set_coherent_mask(&ioc->pcidev->dev, DMA_BIT_MASK(64))) d36memprintk(ioc, printk(MYIOC_s_DEBUG_FMT "restoring 64 bit addressing\n", ioc->name)); @@ -4982,7 +4979,8 @@ GetLanConfigPages(MPT_ADAPTER *ioc) } - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) ppage0_alloc, page0_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, + (u8 *)ppage0_alloc, page0_dma); /* FIXME! * Normalize endianness of structure data, @@ -5026,7 +5024,8 @@ GetLanConfigPages(MPT_ADAPTER *ioc) memcpy(&ioc->lan_cnfg_page1, ppage1_alloc, copy_sz); } - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) ppage1_alloc, page1_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, + (u8 *)ppage1_alloc, page1_dma); /* FIXME! * Normalize endianness of structure data, @@ -5325,7 +5324,8 @@ GetIoUnitPage2(MPT_ADAPTER *ioc) if ((rc = mpt_config(ioc, &cfg)) == 0) ioc->biosVersion = le32_to_cpu(ppage_alloc->BiosVersion); - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) ppage_alloc, page_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, + (u8 *)ppage_alloc, page_dma); } return rc; @@ -5456,7 +5456,9 @@ mpt_GetScsiPortSettings(MPT_ADAPTER *ioc, int portnum) } } if (pbuf) { - pci_free_consistent(ioc->pcidev, header.PageLength * 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, + header.PageLength * 4, pbuf, + buf_dma); } } } @@ -5543,7 +5545,9 @@ mpt_GetScsiPortSettings(MPT_ADAPTER *ioc, int portnum) } } - pci_free_consistent(ioc->pcidev, header.PageLength * 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, + header.PageLength * 4, pbuf, + buf_dma); } } @@ -5707,8 +5711,8 @@ mpt_inactive_raid_volumes(MPT_ADAPTER *ioc, u8 channel, u8 id) out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); } /** @@ -5776,8 +5780,8 @@ mpt_raid_phys_disk_pg0(MPT_ADAPTER *ioc, u8 phys_disk_num, out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); return rc; } @@ -5840,8 +5844,8 @@ mpt_raid_phys_disk_get_num_paths(MPT_ADAPTER *ioc, u8 phys_disk_num) out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); return rc; } @@ -5929,8 +5933,8 @@ mpt_raid_phys_disk_pg1(MPT_ADAPTER *ioc, u8 phys_disk_num, out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); return rc; } @@ -6011,7 +6015,7 @@ mpt_findImVolumes(MPT_ADAPTER *ioc) pIoc2->RaidVolume[i].VolumeID); out: - pci_free_consistent(ioc->pcidev, iocpage2sz, pIoc2, ioc2_dma); + dma_free_coherent(&ioc->pcidev->dev, iocpage2sz, pIoc2, ioc2_dma); return rc; } @@ -6070,7 +6074,7 @@ mpt_read_ioc_pg_3(MPT_ADAPTER *ioc) } } - pci_free_consistent(ioc->pcidev, iocpage3sz, pIoc3, ioc3_dma); + dma_free_coherent(&ioc->pcidev->dev, iocpage3sz, pIoc3, ioc3_dma); return 0; } @@ -6122,7 +6126,8 @@ mpt_read_ioc_pg_4(MPT_ADAPTER *ioc) ioc->spi_data.IocPg4_dma = ioc4_dma; ioc->spi_data.IocPg4Sz = iocpage4sz; } else { - pci_free_consistent(ioc->pcidev, iocpage4sz, pIoc4, ioc4_dma); + dma_free_coherent(&ioc->pcidev->dev, iocpage4sz, pIoc4, + ioc4_dma); ioc->spi_data.pIocPg4 = NULL; ioc->alloc_total -= iocpage4sz; } @@ -6210,7 +6215,7 @@ mpt_read_ioc_pg_1(MPT_ADAPTER *ioc) } } - pci_free_consistent(ioc->pcidev, iocpage1sz, pIoc1, ioc1_dma); + dma_free_coherent(&ioc->pcidev->dev, iocpage1sz, pIoc1, ioc1_dma); return; } @@ -6255,7 +6260,8 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc) out: if (pbuf) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, pbuf, + buf_dma); } /*=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=*/ diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index ae433c150b37..0f447179e3f5 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -1046,9 +1046,9 @@ kbuf_alloc_2_sgl(int bytes, u32 sgdir, int sge_offset, int *frags, goto free_and_fail; if (sgdir & 0x04000000) - dir = PCI_DMA_TODEVICE; + dir = DMA_TO_DEVICE; else - dir = PCI_DMA_FROMDEVICE; + dir = DMA_FROM_DEVICE; /* At start: * sgl = sglbuf = point to beginning of sg buffer @@ -1080,8 +1080,9 @@ kbuf_alloc_2_sgl(int bytes, u32 sgdir, int sge_offset, int *frags, bytes_allocd += this_alloc; sgl->FlagsLength = (0x10000000|sgdir|this_alloc); - dma_addr = pci_map_single(ioc->pcidev, - buflist[buflist_ent].kptr, this_alloc, dir); + dma_addr = dma_map_single(&ioc->pcidev->dev, + buflist[buflist_ent].kptr, + this_alloc, dir); sgl->Address = dma_addr; fragcnt++; @@ -1140,9 +1141,11 @@ free_and_fail: kptr = buflist[i].kptr; len = buflist[i].len; - pci_free_consistent(ioc->pcidev, len, kptr, dma_addr); + dma_free_coherent(&ioc->pcidev->dev, len, kptr, + dma_addr); } - pci_free_consistent(ioc->pcidev, MAX_SGL_BYTES, sglbuf, *sglbuf_dma); + dma_free_coherent(&ioc->pcidev->dev, MAX_SGL_BYTES, sglbuf, + *sglbuf_dma); } kfree(buflist); return NULL; @@ -1162,9 +1165,9 @@ kfree_sgl(MptSge_t *sgl, dma_addr_t sgl_dma, struct buflist *buflist, MPT_ADAPTE int n = 0; if (sg->FlagsLength & 0x04000000) - dir = PCI_DMA_TODEVICE; + dir = DMA_TO_DEVICE; else - dir = PCI_DMA_FROMDEVICE; + dir = DMA_FROM_DEVICE; nib = (sg->FlagsLength & 0xF0000000) >> 28; while (! (nib & 0x4)) { /* eob */ @@ -1179,8 +1182,10 @@ kfree_sgl(MptSge_t *sgl, dma_addr_t sgl_dma, struct buflist *buflist, MPT_ADAPTE dma_addr = sg->Address; kptr = bl->kptr; len = bl->len; - pci_unmap_single(ioc->pcidev, dma_addr, len, dir); - pci_free_consistent(ioc->pcidev, len, kptr, dma_addr); + dma_unmap_single(&ioc->pcidev->dev, dma_addr, len, + dir); + dma_free_coherent(&ioc->pcidev->dev, len, kptr, + dma_addr); n++; } sg++; @@ -1197,12 +1202,12 @@ kfree_sgl(MptSge_t *sgl, dma_addr_t sgl_dma, struct buflist *buflist, MPT_ADAPTE dma_addr = sg->Address; kptr = bl->kptr; len = bl->len; - pci_unmap_single(ioc->pcidev, dma_addr, len, dir); - pci_free_consistent(ioc->pcidev, len, kptr, dma_addr); + dma_unmap_single(&ioc->pcidev->dev, dma_addr, len, dir); + dma_free_coherent(&ioc->pcidev->dev, len, kptr, dma_addr); n++; } - pci_free_consistent(ioc->pcidev, MAX_SGL_BYTES, sgl, sgl_dma); + dma_free_coherent(&ioc->pcidev->dev, MAX_SGL_BYTES, sgl, sgl_dma); kfree(buflist); dctlprintk(ioc, printk(MYIOC_s_DEBUG_FMT "-SG: Free'd 1 SGL buf + %d kbufs!\n", ioc->name, n)); @@ -2283,13 +2288,13 @@ done_free_mem: /* Free the allocated memory. */ if (bufOut.kptr != NULL) { - pci_free_consistent(ioc->pcidev, - bufOut.len, (void *) bufOut.kptr, dma_addr_out); + dma_free_coherent(&ioc->pcidev->dev, bufOut.len, + (void *)bufOut.kptr, dma_addr_out); } if (bufIn.kptr != NULL) { - pci_free_consistent(ioc->pcidev, - bufIn.len, (void *) bufIn.kptr, dma_addr_in); + dma_free_coherent(&ioc->pcidev->dev, bufIn.len, + (void *)bufIn.kptr, dma_addr_in); } /* mf is null if command issued successfully @@ -2405,7 +2410,9 @@ mptctl_hp_hostinfo(MPT_ADAPTER *ioc, unsigned long arg, unsigned int data_size) pdata->BoardTracerNumber, 24); } } - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, + hdr.PageLength * 4, pbuf, + buf_dma); pbuf = NULL; } } @@ -2519,7 +2526,7 @@ retry_wait: SET_MGMT_MSG_CONTEXT(ioc->ioctl_cmds.msg_context, 0); if (pbuf) - pci_free_consistent(ioc->pcidev, 4, pbuf, buf_dma); + dma_free_coherent(&ioc->pcidev->dev, 4, pbuf, buf_dma); /* Copy the data from kernel memory to user memory */ @@ -2623,7 +2630,8 @@ mptctl_hp_targetinfo(MPT_ADAPTER *ioc, unsigned long arg) karg.negotiated_speed = HP_DEV_SPEED_ASYNC; } - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) pg0_alloc, page_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, (u8 *)pg0_alloc, + page_dma); } /* Set defaults @@ -2658,7 +2666,8 @@ mptctl_hp_targetinfo(MPT_ADAPTER *ioc, unsigned long arg) karg.phase_errors = (u32) le16_to_cpu(pg3_alloc->PhaseErrorCount); karg.parity_errors = (u32) le16_to_cpu(pg3_alloc->ParityErrorCount); } - pci_free_consistent(ioc->pcidev, data_sz, (u8 *) pg3_alloc, page_dma); + dma_free_coherent(&ioc->pcidev->dev, data_sz, + (u8 *)pg3_alloc, page_dma); } } hd = shost_priv(ioc->sh); diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c index 117fa4ebf6d7..142eb5d5d9df 100644 --- a/drivers/message/fusion/mptlan.c +++ b/drivers/message/fusion/mptlan.c @@ -516,9 +516,9 @@ mpt_lan_close(struct net_device *dev) if (priv->RcvCtl[i].skb != NULL) { /**/ dlprintk((KERN_INFO MYNAM "/lan_close: bucket %05x " /**/ "is still out\n", i)); - pci_unmap_single(mpt_dev->pcidev, priv->RcvCtl[i].dma, - priv->RcvCtl[i].len, - PCI_DMA_FROMDEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, + priv->RcvCtl[i].dma, + priv->RcvCtl[i].len, DMA_FROM_DEVICE); dev_kfree_skb(priv->RcvCtl[i].skb); } } @@ -528,9 +528,9 @@ mpt_lan_close(struct net_device *dev) for (i = 0; i < priv->tx_max_out; i++) { if (priv->SendCtl[i].skb != NULL) { - pci_unmap_single(mpt_dev->pcidev, priv->SendCtl[i].dma, - priv->SendCtl[i].len, - PCI_DMA_TODEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, + priv->SendCtl[i].dma, + priv->SendCtl[i].len, DMA_TO_DEVICE); dev_kfree_skb(priv->SendCtl[i].skb); } } @@ -582,8 +582,8 @@ mpt_lan_send_turbo(struct net_device *dev, u32 tmsg) __func__, sent)); priv->SendCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->SendCtl[ctx].dma, - priv->SendCtl[ctx].len, PCI_DMA_TODEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, priv->SendCtl[ctx].dma, + priv->SendCtl[ctx].len, DMA_TO_DEVICE); dev_kfree_skb_irq(sent); spin_lock_irqsave(&priv->txfidx_lock, flags); @@ -648,8 +648,9 @@ mpt_lan_send_reply(struct net_device *dev, LANSendReply_t *pSendRep) __func__, sent)); priv->SendCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->SendCtl[ctx].dma, - priv->SendCtl[ctx].len, PCI_DMA_TODEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, + priv->SendCtl[ctx].dma, + priv->SendCtl[ctx].len, DMA_TO_DEVICE); dev_kfree_skb_irq(sent); priv->mpt_txfidx[++priv->mpt_txfidx_tail] = ctx; @@ -720,8 +721,8 @@ mpt_lan_sdu_send (struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, 12); - dma = pci_map_single(mpt_dev->pcidev, skb->data, skb->len, - PCI_DMA_TODEVICE); + dma = dma_map_single(&mpt_dev->pcidev->dev, skb->data, skb->len, + DMA_TO_DEVICE); priv->SendCtl[ctx].skb = skb; priv->SendCtl[ctx].dma = dma; @@ -868,13 +869,17 @@ mpt_lan_receive_post_turbo(struct net_device *dev, u32 tmsg) return -ENOMEM; } - pci_dma_sync_single_for_cpu(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_sync_single_for_cpu(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); skb_copy_from_linear_data(old_skb, skb_put(skb, len), len); - pci_dma_sync_single_for_device(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_sync_single_for_device(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); goto out; } @@ -882,8 +887,8 @@ mpt_lan_receive_post_turbo(struct net_device *dev, u32 tmsg) priv->RcvCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, DMA_FROM_DEVICE); out: spin_lock_irqsave(&priv->rxfidx_lock, flags); @@ -927,8 +932,8 @@ mpt_lan_receive_post_free(struct net_device *dev, // dlprintk((KERN_INFO MYNAM "@rpr[2] TC + 3\n")); priv->RcvCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, DMA_FROM_DEVICE); dev_kfree_skb_any(skb); priv->mpt_rxfidx[++priv->mpt_rxfidx_tail] = ctx; @@ -1028,16 +1033,16 @@ mpt_lan_receive_post_reply(struct net_device *dev, // IOC_AND_NETDEV_NAMES_s_s(dev), // i, l)); - pci_dma_sync_single_for_cpu(mpt_dev->pcidev, - priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_cpu(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); skb_copy_from_linear_data(old_skb, skb_put(skb, l), l); - pci_dma_sync_single_for_device(mpt_dev->pcidev, - priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_device(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); priv->mpt_rxfidx[++priv->mpt_rxfidx_tail] = ctx; szrem -= l; @@ -1056,17 +1061,17 @@ mpt_lan_receive_post_reply(struct net_device *dev, return -ENOMEM; } - pci_dma_sync_single_for_cpu(mpt_dev->pcidev, - priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_cpu(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); skb_copy_from_linear_data(old_skb, skb_put(skb, len), len); - pci_dma_sync_single_for_device(mpt_dev->pcidev, - priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + dma_sync_single_for_device(&mpt_dev->pcidev->dev, + priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, + DMA_FROM_DEVICE); spin_lock_irqsave(&priv->rxfidx_lock, flags); priv->mpt_rxfidx[++priv->mpt_rxfidx_tail] = ctx; @@ -1077,8 +1082,8 @@ mpt_lan_receive_post_reply(struct net_device *dev, priv->RcvCtl[ctx].skb = NULL; - pci_unmap_single(mpt_dev->pcidev, priv->RcvCtl[ctx].dma, - priv->RcvCtl[ctx].len, PCI_DMA_FROMDEVICE); + dma_unmap_single(&mpt_dev->pcidev->dev, priv->RcvCtl[ctx].dma, + priv->RcvCtl[ctx].len, DMA_FROM_DEVICE); priv->RcvCtl[ctx].dma = 0; priv->mpt_rxfidx[++priv->mpt_rxfidx_tail] = ctx; @@ -1199,10 +1204,10 @@ mpt_lan_post_receive_buckets(struct mpt_lan_priv *priv) skb = priv->RcvCtl[ctx].skb; if (skb && (priv->RcvCtl[ctx].len != len)) { - pci_unmap_single(mpt_dev->pcidev, + dma_unmap_single(&mpt_dev->pcidev->dev, priv->RcvCtl[ctx].dma, priv->RcvCtl[ctx].len, - PCI_DMA_FROMDEVICE); + DMA_FROM_DEVICE); dev_kfree_skb(priv->RcvCtl[ctx].skb); skb = priv->RcvCtl[ctx].skb = NULL; } @@ -1218,8 +1223,9 @@ mpt_lan_post_receive_buckets(struct mpt_lan_priv *priv) break; } - dma = pci_map_single(mpt_dev->pcidev, skb->data, - len, PCI_DMA_FROMDEVICE); + dma = dma_map_single(&mpt_dev->pcidev->dev, + skb->data, len, + DMA_FROM_DEVICE); priv->RcvCtl[ctx].skb = skb; priv->RcvCtl[ctx].dma = dma; diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 091b45024d34..0363b2a2264d 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -769,8 +769,8 @@ mptsas_add_device_component_starget_ir(MPT_ADAPTER *ioc, out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); } /** @@ -1426,8 +1426,8 @@ mptsas_sas_enclosure_pg0(MPT_ADAPTER *ioc, struct mptsas_enclosure *enclosure, enclosure->sep_channel = buffer->SEPBus; out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2081,8 +2081,8 @@ static int mptsas_get_linkerrors(struct sas_phy *phy) le32_to_cpu(buffer->PhyResetProblemCount); out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); return error; } @@ -2301,7 +2301,7 @@ static void mptsas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, << MPI_SGE_FLAGS_SHIFT; if (!dma_map_sg(&ioc->pcidev->dev, job->request_payload.sg_list, - 1, PCI_DMA_BIDIRECTIONAL)) + 1, DMA_BIDIRECTIONAL)) goto put_mf; flagsLength |= (sg_dma_len(job->request_payload.sg_list) - 4); @@ -2318,7 +2318,7 @@ static void mptsas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, flagsLength = flagsLength << MPI_SGE_FLAGS_SHIFT; if (!dma_map_sg(&ioc->pcidev->dev, job->reply_payload.sg_list, - 1, PCI_DMA_BIDIRECTIONAL)) + 1, DMA_BIDIRECTIONAL)) goto unmap_out; flagsLength |= sg_dma_len(job->reply_payload.sg_list) + 4; ioc->add_sge(psge, flagsLength, @@ -2356,10 +2356,10 @@ static void mptsas_smp_handler(struct bsg_job *job, struct Scsi_Host *shost, unmap_in: dma_unmap_sg(&ioc->pcidev->dev, job->reply_payload.sg_list, 1, - PCI_DMA_BIDIRECTIONAL); + DMA_BIDIRECTIONAL); unmap_out: dma_unmap_sg(&ioc->pcidev->dev, job->request_payload.sg_list, 1, - PCI_DMA_BIDIRECTIONAL); + DMA_BIDIRECTIONAL); put_mf: if (mf) mpt_free_msg_frame(ioc, mf); @@ -2452,8 +2452,8 @@ mptsas_sas_io_unit_pg0(MPT_ADAPTER *ioc, struct mptsas_portinfo *port_info) } out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2509,8 +2509,8 @@ mptsas_sas_io_unit_pg1(MPT_ADAPTER *ioc) device_missing_delay & MPI_SAS_IOUNIT1_REPORT_MISSING_TIMEOUT_MASK; out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2573,8 +2573,8 @@ mptsas_sas_phy_pg0(MPT_ADAPTER *ioc, struct mptsas_phyinfo *phy_info, phy_info->attached.handle = le16_to_cpu(buffer->AttachedDevHandle); out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2654,8 +2654,8 @@ mptsas_sas_device_pg0(MPT_ADAPTER *ioc, struct mptsas_devinfo *device_info, device_info->flags = le16_to_cpu(buffer->Flags); out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2737,8 +2737,8 @@ mptsas_sas_expander_pg0(MPT_ADAPTER *ioc, struct mptsas_portinfo *port_info, } out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2810,8 +2810,8 @@ mptsas_sas_expander_pg1(MPT_ADAPTER *ioc, struct mptsas_phyinfo *phy_info, phy_info->attached.handle = le16_to_cpu(buffer->AttachedDevHandle); out_free_consistent: - pci_free_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - buffer, dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, buffer, + dma_handle); out: return error; } @@ -2987,7 +2987,8 @@ mptsas_exp_repmanufacture_info(MPT_ADAPTER *ioc, } out_free: if (data_out_dma) - pci_free_consistent(ioc->pcidev, sz, data_out, data_out_dma); + dma_free_coherent(&ioc->pcidev->dev, sz, data_out, + data_out_dma); put_mf: if (mf) mpt_free_msg_frame(ioc, mf); @@ -4318,8 +4319,8 @@ mptsas_adding_inactive_raid_components(MPT_ADAPTER *ioc, u8 channel, u8 id) out: if (buffer) - pci_free_consistent(ioc->pcidev, hdr.PageLength * 4, buffer, - dma_handle); + dma_free_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + buffer, dma_handle); } /* * Work queue thread to handle SAS hotplug events -- cgit v1.2.3-58-ga151 From 2d50607260a6c142f49222346814fee30eeaba9e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:13 +0100 Subject: scsi: message: fusion: Use dma_alloc_coherent() in mpt_alloc_fw_memory() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. mpt_alloc_fw_memory() should still use GFP_ATOMIC, because it can be called from mpt_do_upload() which might sleep. [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/db3db9db219005b75659561d08117d312d0cfb13.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptbase.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index 5a3b7b56e85a..c4702ef87897 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -3512,7 +3512,8 @@ mpt_alloc_fw_memory(MPT_ADAPTER *ioc, int size) rc = 0; goto out; } - ioc->cached_fw = pci_alloc_consistent(ioc->pcidev, size, &ioc->cached_fw_dma); + ioc->cached_fw = dma_alloc_coherent(&ioc->pcidev->dev, size, + &ioc->cached_fw_dma, GFP_ATOMIC); if (!ioc->cached_fw) { printk(MYIOC_s_ERR_FMT "Unable to allocate memory for the cached firmware image!\n", ioc->name); -- cgit v1.2.3-58-ga151 From 5c5e6b6f61e0196141db38b84b30ef9b3bcca0f2 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:19 +0100 Subject: scsi: message: fusion: mptbase: Use dma_alloc_coherent() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. In all these places where some memory is allocated GFP_KERNEL can be used because they already call mpt_config() which has an explicit might_sleep(). [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/3bea2452deb8cc8be65982e87efa4c6861caa01c.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptbase.c | 52 ++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/drivers/message/fusion/mptbase.c b/drivers/message/fusion/mptbase.c index c4702ef87897..e90adfa57950 100644 --- a/drivers/message/fusion/mptbase.c +++ b/drivers/message/fusion/mptbase.c @@ -300,8 +300,8 @@ mpt_is_discovery_complete(MPT_ADAPTER *ioc) if (!hdr.ExtPageLength) goto out; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) goto out; @@ -4966,7 +4966,8 @@ GetLanConfigPages(MPT_ADAPTER *ioc) if (hdr.PageLength > 0) { data_sz = hdr.PageLength * 4; - ppage0_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page0_dma); + ppage0_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, + &page0_dma, GFP_KERNEL); rc = -ENOMEM; if (ppage0_alloc) { memset((u8 *)ppage0_alloc, 0, data_sz); @@ -5013,7 +5014,8 @@ GetLanConfigPages(MPT_ADAPTER *ioc) data_sz = hdr.PageLength * 4; rc = -ENOMEM; - ppage1_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page1_dma); + ppage1_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, + &page1_dma, GFP_KERNEL); if (ppage1_alloc) { memset((u8 *)ppage1_alloc, 0, data_sz); cfg.physAddr = page1_dma; @@ -5315,7 +5317,8 @@ GetIoUnitPage2(MPT_ADAPTER *ioc) /* Read the config page */ data_sz = hdr.PageLength * 4; rc = -ENOMEM; - ppage_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page_dma); + ppage_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, + &page_dma, GFP_KERNEL); if (ppage_alloc) { memset((u8 *)ppage_alloc, 0, data_sz); cfg.physAddr = page_dma; @@ -5401,7 +5404,9 @@ mpt_GetScsiPortSettings(MPT_ADAPTER *ioc, int portnum) return -EFAULT; if (header.PageLength > 0) { - pbuf = pci_alloc_consistent(ioc->pcidev, header.PageLength * 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, + header.PageLength * 4, &buf_dma, + GFP_KERNEL); if (pbuf) { cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT; cfg.physAddr = buf_dma; @@ -5481,7 +5486,9 @@ mpt_GetScsiPortSettings(MPT_ADAPTER *ioc, int portnum) if (header.PageLength > 0) { /* Allocate memory and read SCSI Port Page 2 */ - pbuf = pci_alloc_consistent(ioc->pcidev, header.PageLength * 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, + header.PageLength * 4, &buf_dma, + GFP_KERNEL); if (pbuf) { cfg.action = MPI_CONFIG_ACTION_PAGE_READ_NVRAM; cfg.physAddr = buf_dma; @@ -5664,8 +5671,8 @@ mpt_inactive_raid_volumes(MPT_ADAPTER *ioc, u8 channel, u8 id) if (!hdr.PageLength) goto out; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) goto out; @@ -5757,8 +5764,8 @@ mpt_raid_phys_disk_pg0(MPT_ADAPTER *ioc, u8 phys_disk_num, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { rc = -ENOMEM; @@ -5824,8 +5831,8 @@ mpt_raid_phys_disk_get_num_paths(MPT_ADAPTER *ioc, u8 phys_disk_num) goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { rc = 0; @@ -5896,8 +5903,8 @@ mpt_raid_phys_disk_pg1(MPT_ADAPTER *ioc, u8 phys_disk_num, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { rc = -ENOMEM; @@ -5991,7 +5998,8 @@ mpt_findImVolumes(MPT_ADAPTER *ioc) return -EFAULT; iocpage2sz = header.PageLength * 4; - pIoc2 = pci_alloc_consistent(ioc->pcidev, iocpage2sz, &ioc2_dma); + pIoc2 = dma_alloc_coherent(&ioc->pcidev->dev, iocpage2sz, &ioc2_dma, + GFP_KERNEL); if (!pIoc2) return -ENOMEM; @@ -6058,7 +6066,8 @@ mpt_read_ioc_pg_3(MPT_ADAPTER *ioc) /* Read Header good, alloc memory */ iocpage3sz = header.PageLength * 4; - pIoc3 = pci_alloc_consistent(ioc->pcidev, iocpage3sz, &ioc3_dma); + pIoc3 = dma_alloc_coherent(&ioc->pcidev->dev, iocpage3sz, &ioc3_dma, + GFP_KERNEL); if (!pIoc3) return 0; @@ -6109,7 +6118,8 @@ mpt_read_ioc_pg_4(MPT_ADAPTER *ioc) if ( (pIoc4 = ioc->spi_data.pIocPg4) == NULL ) { iocpage4sz = (header.PageLength + 4) * 4; /* Allow 4 additional SEP's */ - pIoc4 = pci_alloc_consistent(ioc->pcidev, iocpage4sz, &ioc4_dma); + pIoc4 = dma_alloc_coherent(&ioc->pcidev->dev, iocpage4sz, + &ioc4_dma, GFP_KERNEL); if (!pIoc4) return; ioc->alloc_total += iocpage4sz; @@ -6165,7 +6175,8 @@ mpt_read_ioc_pg_1(MPT_ADAPTER *ioc) /* Read Header good, alloc memory */ iocpage1sz = header.PageLength * 4; - pIoc1 = pci_alloc_consistent(ioc->pcidev, iocpage1sz, &ioc1_dma); + pIoc1 = dma_alloc_coherent(&ioc->pcidev->dev, iocpage1sz, &ioc1_dma, + GFP_KERNEL); if (!pIoc1) return; @@ -6245,7 +6256,8 @@ mpt_get_manufacturing_pg_0(MPT_ADAPTER *ioc) goto out; cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT; - pbuf = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &buf_dma, GFP_KERNEL); if (!pbuf) goto out; -- cgit v1.2.3-58-ga151 From 7a960b3a5e37f05d7d319bf257a8cf6ee9d18e7c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:26 +0100 Subject: scsi: message: fusion: Use dma_alloc_coherent() in mptsas_exp_repmanufacture_info() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. The only caller of mptsas_exp_repmanufacture_info() is mptsas_probe_one_phy(). This function already calls sas_end_device_alloc() or sas_expander_alloc(). They both already use GFP_KERNEL. As no spin_lock is held at this point, it is safe to also use GFP_KERNEL here. [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/d78d4a5b096897932808ed7e3a4540db1687c25d.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptsas.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 0363b2a2264d..9b40be04710c 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -2896,7 +2896,8 @@ mptsas_exp_repmanufacture_info(MPT_ADAPTER *ioc, sz = sizeof(struct rep_manu_request) + sizeof(struct rep_manu_reply); - data_out = pci_alloc_consistent(ioc->pcidev, sz, &data_out_dma); + data_out = dma_alloc_coherent(&ioc->pcidev->dev, sz, &data_out_dma, + GFP_KERNEL); if (!data_out) { printk(KERN_ERR "Memory allocation failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); -- cgit v1.2.3-58-ga151 From 76a334d756c596f2f283e1d0054477ba0f0eacf6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:33 +0100 Subject: scsi: message: fusion: mptsas: Use dma_alloc_coherent() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. In all these places where some memory is allocated GFP_KERNEL can be used because they already call mpt_config() which has an explicit might_sleep(). [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/443b81ecb08b2fe6f789bb2fdff13a53c809e401.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptsas.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 9b40be04710c..4acd8f9a48e1 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -702,8 +702,8 @@ mptsas_add_device_component_starget_ir(MPT_ADAPTER *ioc, if (!hdr.PageLength) goto out; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) goto out; @@ -1399,8 +1399,8 @@ mptsas_sas_enclosure_pg0(MPT_ADAPTER *ioc, struct mptsas_enclosure *enclosure, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2058,8 +2058,8 @@ static int mptsas_get_linkerrors(struct sas_phy *phy) if (!hdr.ExtPageLength) return -ENXIO; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) return -ENOMEM; @@ -2412,8 +2412,8 @@ mptsas_sas_io_unit_pg0(MPT_ADAPTER *ioc, struct mptsas_portinfo *port_info) goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2487,8 +2487,8 @@ mptsas_sas_io_unit_pg1(MPT_ADAPTER *ioc) goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2551,8 +2551,8 @@ mptsas_sas_phy_pg0(MPT_ADAPTER *ioc, struct mptsas_phyinfo *phy_info, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2614,8 +2614,8 @@ mptsas_sas_device_pg0(MPT_ADAPTER *ioc, struct mptsas_devinfo *device_info, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2697,8 +2697,8 @@ mptsas_sas_expander_pg0(MPT_ADAPTER *ioc, struct mptsas_portinfo *port_info, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -2777,8 +2777,8 @@ mptsas_sas_expander_pg1(MPT_ADAPTER *ioc, struct mptsas_phyinfo *phy_info, goto out; } - buffer = pci_alloc_consistent(ioc->pcidev, hdr.ExtPageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.ExtPageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) { error = -ENOMEM; goto out; @@ -4273,8 +4273,8 @@ mptsas_adding_inactive_raid_components(MPT_ADAPTER *ioc, u8 channel, u8 id) if (!hdr.PageLength) goto out; - buffer = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, - &dma_handle); + buffer = dma_alloc_coherent(&ioc->pcidev->dev, hdr.PageLength * 4, + &dma_handle, GFP_KERNEL); if (!buffer) goto out; -- cgit v1.2.3-58-ga151 From 706dc3b91989a1286b0eb75a1cd816596c590e5c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 Jan 2022 22:54:39 +0100 Subject: scsi: message: fusion: mptctl: Use dma_alloc_coherent() In [1], Christoph Hellwig has proposed to remove the wrappers in include/linux/pci-dma-compat.h. Some reasons why this API should be removed have been given by Julia Lawall in [2]. When memory is allocated in kbuf_alloc_2_sgl() GFP_KERNEL can be used because this function already uses the GFP_USER flag for some memory allocation and not spin_lock is taken in the between. When memory is allocated in mptctl_do_mpt_command() GFP_KERNEL can be used because this function already uses copy_from_user() and this function can sleep. When memory is allocated in mptctl_hp_hostinfo() GFP_KERNEL can be used because this function already uses mpt_config() and this function has an explicit might_sleep(). When memory is allocated in mptctl_hp_targetinfo() GFP_KERNEL can be used because this function already uses mpt_config() and this function has an explicit might_sleep(). [1]: https://lore.kernel.org/kernel-janitors/20200421081257.GA131897@infradead.org/ [2]: https://lore.kernel.org/kernel-janitors/alpine.DEB.2.22.394.2007120902170.2424@hadrien/ Link: https://lore.kernel.org/r/516375d6d06114484533baf03aae351306100246.1641500561.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/message/fusion/mptctl.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c index 0f447179e3f5..03c8fb1795c2 100644 --- a/drivers/message/fusion/mptctl.c +++ b/drivers/message/fusion/mptctl.c @@ -1041,7 +1041,8 @@ kbuf_alloc_2_sgl(int bytes, u32 sgdir, int sge_offset, int *frags, * copying the data in this array into the correct place in the * request and chain buffers. */ - sglbuf = pci_alloc_consistent(ioc->pcidev, MAX_SGL_BYTES, sglbuf_dma); + sglbuf = dma_alloc_coherent(&ioc->pcidev->dev, MAX_SGL_BYTES, + sglbuf_dma, GFP_KERNEL); if (sglbuf == NULL) goto free_and_fail; @@ -1062,9 +1063,9 @@ kbuf_alloc_2_sgl(int bytes, u32 sgdir, int sge_offset, int *frags, while (bytes_allocd < bytes) { this_alloc = min(alloc_sz, bytes-bytes_allocd); buflist[buflist_ent].len = this_alloc; - buflist[buflist_ent].kptr = pci_alloc_consistent(ioc->pcidev, - this_alloc, - &pa); + buflist[buflist_ent].kptr = dma_alloc_coherent(&ioc->pcidev->dev, + this_alloc, + &pa, GFP_KERNEL); if (buflist[buflist_ent].kptr == NULL) { alloc_sz = alloc_sz / 2; if (alloc_sz == 0) { @@ -2105,8 +2106,9 @@ mptctl_do_mpt_command (MPT_ADAPTER *ioc, struct mpt_ioctl_command karg, void __u } flagsLength |= karg.dataOutSize; bufOut.len = karg.dataOutSize; - bufOut.kptr = pci_alloc_consistent( - ioc->pcidev, bufOut.len, &dma_addr_out); + bufOut.kptr = dma_alloc_coherent(&ioc->pcidev->dev, + bufOut.len, + &dma_addr_out, GFP_KERNEL); if (bufOut.kptr == NULL) { rc = -ENOMEM; @@ -2139,8 +2141,9 @@ mptctl_do_mpt_command (MPT_ADAPTER *ioc, struct mpt_ioctl_command karg, void __u flagsLength |= karg.dataInSize; bufIn.len = karg.dataInSize; - bufIn.kptr = pci_alloc_consistent(ioc->pcidev, - bufIn.len, &dma_addr_in); + bufIn.kptr = dma_alloc_coherent(&ioc->pcidev->dev, + bufIn.len, + &dma_addr_in, GFP_KERNEL); if (bufIn.kptr == NULL) { rc = -ENOMEM; @@ -2400,7 +2403,9 @@ mptctl_hp_hostinfo(MPT_ADAPTER *ioc, unsigned long arg, unsigned int data_size) /* Issue the second config page request */ cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT; - pbuf = pci_alloc_consistent(ioc->pcidev, hdr.PageLength * 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, + hdr.PageLength * 4, + &buf_dma, GFP_KERNEL); if (pbuf) { cfg.physAddr = buf_dma; if (mpt_config(ioc, &cfg) == 0) { @@ -2477,7 +2482,7 @@ mptctl_hp_hostinfo(MPT_ADAPTER *ioc, unsigned long arg, unsigned int data_size) else IstwiRWRequest->DeviceAddr = 0xB0; - pbuf = pci_alloc_consistent(ioc->pcidev, 4, &buf_dma); + pbuf = dma_alloc_coherent(&ioc->pcidev->dev, 4, &buf_dma, GFP_KERNEL); if (!pbuf) goto out; ioc->add_sge((char *)&IstwiRWRequest->SGL, @@ -2592,7 +2597,8 @@ mptctl_hp_targetinfo(MPT_ADAPTER *ioc, unsigned long arg) /* Get the data transfer speeds */ data_sz = ioc->spi_data.sdp0length * 4; - pg0_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page_dma); + pg0_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, &page_dma, + GFP_KERNEL); if (pg0_alloc) { hdr.PageVersion = ioc->spi_data.sdp0version; hdr.PageLength = data_sz; @@ -2657,7 +2663,8 @@ mptctl_hp_targetinfo(MPT_ADAPTER *ioc, unsigned long arg) /* Issue the second config page request */ cfg.action = MPI_CONFIG_ACTION_PAGE_READ_CURRENT; data_sz = (int) cfg.cfghdr.hdr->PageLength * 4; - pg3_alloc = pci_alloc_consistent(ioc->pcidev, data_sz, &page_dma); + pg3_alloc = dma_alloc_coherent(&ioc->pcidev->dev, data_sz, + &page_dma, GFP_KERNEL); if (pg3_alloc) { cfg.physAddr = page_dma; cfg.pageAddr = (karg.hdr.channel << 8) | karg.hdr.id; -- cgit v1.2.3-58-ga151 From 1aa7d9799e85addc29c06ece99bf1eae1ef9198f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 9 Jan 2022 19:57:04 +0100 Subject: scsi: efct: Remove useless DMA-32 fallback configuration As stated in [1], dma_set_mask() with a 64-bit mask never fails if dev->dma_mask is non-NULL. So, if it fails, the 32 bits case will also fail for the same reason. Simplify code and remove some dead code accordingly. While at it, return the error code returned by dma_set_mask_and_coherent() instead of -1. [1]: https://lkml.org/lkml/2021/6/7/398 Link: https://lore.kernel.org/r/958bcb2a6e86344c14f38369e8e7079615a2b0e3.1641754613.git.christophe.jaillet@wanadoo.fr Reviewed-by: Christoph Hellwig Signed-off-by: Christophe JAILLET Signed-off-by: Martin K. Petersen --- drivers/scsi/elx/efct/efct_driver.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/elx/efct/efct_driver.c b/drivers/scsi/elx/efct/efct_driver.c index ae62fc3c9ee3..b08fc8839808 100644 --- a/drivers/scsi/elx/efct/efct_driver.c +++ b/drivers/scsi/elx/efct/efct_driver.c @@ -541,13 +541,10 @@ efct_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, efct); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) != 0) { - dev_warn(&pdev->dev, "trying DMA_BIT_MASK(32)\n"); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)) != 0) { - dev_err(&pdev->dev, "setting DMA_BIT_MASK failed\n"); - rc = -1; - goto dma_mask_out; - } + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (rc) { + dev_err(&pdev->dev, "setting DMA_BIT_MASK failed\n"); + goto dma_mask_out; } num_interrupts = efct_device_interrupts_required(efct); -- cgit v1.2.3-58-ga151 From 9008661e19606bdf6dddd33073b70872da400590 Mon Sep 17 00:00:00 2001 From: SEO HOYOUNG Date: Fri, 7 Jan 2022 06:39:24 +0900 Subject: scsi: ufs: Modify Tactive time setting conditions The Tactive time determines the waiting time before burst at hibern8 exit and is determined by hardware at linkup time. However, in the case of Samsung devices, increase host's Tactive time +100us for stability. If the HCI's Tactive time is equal or greater than the device, +100us should be set. Link: https://lore.kernel.org/r/20220106213924.186263-1-hy50.seo@samsung.com Reviewed-by: Alim Akhtar Acked-by: Avri Altman Signed-off-by: SEO HOYOUNG Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 1049e41abd5b..460d2b440d2e 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -7815,7 +7815,7 @@ static int ufshcd_quirk_tune_host_pa_tactivate(struct ufs_hba *hba) peer_pa_tactivate_us = peer_pa_tactivate * gran_to_us_table[peer_granularity - 1]; - if (pa_tactivate_us > peer_pa_tactivate_us) { + if (pa_tactivate_us >= peer_pa_tactivate_us) { u32 new_peer_pa_tactivate; new_peer_pa_tactivate = pa_tactivate_us / -- cgit v1.2.3-58-ga151 From 3ba880a12df5aa4488c18281701b5b1bc3d4531a Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 22 Dec 2021 07:09:30 +0000 Subject: scsi: ufs: ufs-mediatek: Fix error checking in ufs_mtk_init_va09_pwr_ctrl() The function regulator_get() returns an error pointer. Use IS_ERR() to validate the return value. Link: https://lore.kernel.org/r/20211222070930.9449-1-linmq006@gmail.com Fixes: cf137b3ea49a ("scsi: ufs-mediatek: Support VA09 regulator operations") Signed-off-by: Miaoqian Lin Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufs-mediatek.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufs-mediatek.c b/drivers/scsi/ufs/ufs-mediatek.c index 5393b5c9dd9c..86a938075f30 100644 --- a/drivers/scsi/ufs/ufs-mediatek.c +++ b/drivers/scsi/ufs/ufs-mediatek.c @@ -557,7 +557,7 @@ static void ufs_mtk_init_va09_pwr_ctrl(struct ufs_hba *hba) struct ufs_mtk_host *host = ufshcd_get_variant(hba); host->reg_va09 = regulator_get(hba->dev, "va09"); - if (!host->reg_va09) + if (IS_ERR(host->reg_va09)) dev_info(hba->dev, "failed to get va09"); else host->caps |= UFS_MTK_CAP_VA09_PWR_CTRL; -- cgit v1.2.3-58-ga151 From 2576e153cd982d540b212e989458edc42ad1b390 Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Thu, 30 Dec 2021 10:11:37 +0800 Subject: scsi: nsp_cs: Check of ioremap return value Since it is possible for ioremap() to fail, 'data->MmioAddress' could be NULL. Skip entry if ioremap() fails. Link: https://lore.kernel.org/r/20211230021137.1823352-1-jiasheng@iscas.ac.cn Fixes: 0e6f9d270840 ("pcmcia: use pcmcia_loop_config in scsi pcmcia drivers") Signed-off-by: Jiasheng Jiang Signed-off-by: Martin K. Petersen --- drivers/scsi/pcmcia/nsp_cs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/pcmcia/nsp_cs.c b/drivers/scsi/pcmcia/nsp_cs.c index 8b9e889bc306..92c818a8a84a 100644 --- a/drivers/scsi/pcmcia/nsp_cs.c +++ b/drivers/scsi/pcmcia/nsp_cs.c @@ -1557,6 +1557,9 @@ static int nsp_cs_config_check(struct pcmcia_device *p_dev, void *priv_data) data->MmioAddress = (unsigned long) ioremap(p_dev->resource[2]->start, resource_size(p_dev->resource[2])); + if (!data->MmioAddress) + goto next_entry; + data->MmioLength = resource_size(p_dev->resource[2]); } /* If we got this far, we're cool! */ -- cgit v1.2.3-58-ga151 From 31928a001bed0d9642711d2eba520fc46d41c376 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Wed, 29 Dec 2021 23:02:15 +0900 Subject: ksmbd: register ksmbd ib client with ib_register_client() Register ksmbd ib client with ib_register_client() to find the rdma capable network adapter. If ops.get_netdev(Chelsio NICs) is NULL, ksmbd will find it using ib_device_get_by_netdev in old way. Signed-off-by: Hyunchul Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 107 +++++++++++++++++++++++++++++++++++++++++----- fs/ksmbd/transport_rdma.h | 2 +- 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 7e57cbb0bb35..339fa4f025f7 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -79,6 +79,14 @@ static int smb_direct_max_read_write_size = 1024 * 1024; static int smb_direct_max_outstanding_rw_ops = 8; +static LIST_HEAD(smb_direct_device_list); +static DEFINE_RWLOCK(smb_direct_device_lock); + +struct smb_direct_device { + struct ib_device *ib_dev; + struct list_head list; +}; + static struct smb_direct_listener { struct rdma_cm_id *cm_id; } smb_direct_listener; @@ -2007,12 +2015,61 @@ err: return ret; } +static int smb_direct_ib_client_add(struct ib_device *ib_dev) +{ + struct smb_direct_device *smb_dev; + + if (!ib_dev->ops.get_netdev || + !rdma_frwr_is_supported(&ib_dev->attrs)) + return 0; + + smb_dev = kzalloc(sizeof(*smb_dev), GFP_KERNEL); + if (!smb_dev) + return -ENOMEM; + smb_dev->ib_dev = ib_dev; + + write_lock(&smb_direct_device_lock); + list_add(&smb_dev->list, &smb_direct_device_list); + write_unlock(&smb_direct_device_lock); + + ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name); + return 0; +} + +static void smb_direct_ib_client_remove(struct ib_device *ib_dev, + void *client_data) +{ + struct smb_direct_device *smb_dev, *tmp; + + write_lock(&smb_direct_device_lock); + list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) { + if (smb_dev->ib_dev == ib_dev) { + list_del(&smb_dev->list); + kfree(smb_dev); + break; + } + } + write_unlock(&smb_direct_device_lock); +} + +static struct ib_client smb_direct_ib_client = { + .name = "ksmbd_smb_direct_ib", + .add = smb_direct_ib_client_add, + .remove = smb_direct_ib_client_remove, +}; + int ksmbd_rdma_init(void) { int ret; smb_direct_listener.cm_id = NULL; + ret = ib_register_client(&smb_direct_ib_client); + if (ret) { + pr_err("failed to ib_register_client\n"); + return ret; + } + /* When a client is running out of send credits, the credits are * granted by the server's sending a packet using this queue. * This avoids the situation that a clients cannot send packets @@ -2036,30 +2093,60 @@ int ksmbd_rdma_init(void) return 0; } -int ksmbd_rdma_destroy(void) +void ksmbd_rdma_destroy(void) { - if (smb_direct_listener.cm_id) - rdma_destroy_id(smb_direct_listener.cm_id); + if (!smb_direct_listener.cm_id) + return; + + ib_unregister_client(&smb_direct_ib_client); + rdma_destroy_id(smb_direct_listener.cm_id); + smb_direct_listener.cm_id = NULL; if (smb_direct_wq) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; } - return 0; } bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { - struct ib_device *ibdev; + struct smb_direct_device *smb_dev; + int i; bool rdma_capable = false; - ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); - if (ibdev) { - if (rdma_frwr_is_supported(&ibdev->attrs)) - rdma_capable = true; - ib_device_put(ibdev); + read_lock(&smb_direct_device_lock); + list_for_each_entry(smb_dev, &smb_direct_device_list, list) { + for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { + struct net_device *ndev; + + ndev = smb_dev->ib_dev->ops.get_netdev(smb_dev->ib_dev, + i + 1); + if (!ndev) + continue; + + if (ndev == netdev) { + dev_put(ndev); + rdma_capable = true; + goto out; + } + dev_put(ndev); + } + } +out: + read_unlock(&smb_direct_device_lock); + + if (rdma_capable == false) { + struct ib_device *ibdev; + + ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); + if (ibdev) { + if (rdma_frwr_is_supported(&ibdev->attrs)) + rdma_capable = true; + ib_device_put(ibdev); + } } + return rdma_capable; } diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index 0fa8adc0776f..ab9250a7cb86 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -52,7 +52,7 @@ struct smb_direct_data_transfer { #ifdef CONFIG_SMB_SERVER_SMBDIRECT int ksmbd_rdma_init(void); -int ksmbd_rdma_destroy(void); +void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); #else static inline int ksmbd_rdma_init(void) { return 0; } -- cgit v1.2.3-58-ga151 From cb097b3dd5ece9596a0a0b7e33893c02a9bde8c6 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 29 Dec 2021 23:02:50 +0900 Subject: ksmbd: set 445 port to smbdirect port by default When SMB Direct is used with iWARP, Windows use 5445 port for smb direct port, 445 port for SMB. This patch check ib_device using ib_client to know if NICs type is iWARP or Infiniband. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 15 ++++++++++++--- fs/ksmbd/transport_rdma.h | 2 -- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 339fa4f025f7..f89b64e27836 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -34,7 +34,8 @@ #include "smbstatus.h" #include "transport_rdma.h" -#define SMB_DIRECT_PORT 5445 +#define SMB_DIRECT_PORT_IWARP 5445 +#define SMB_DIRECT_PORT_INFINIBAND 445 #define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100) @@ -60,6 +61,10 @@ * as defined in [MS-SMBD] 3.1.1.1 * Those may change after a SMB_DIRECT negotiation */ + +/* Set 445 port to SMB Direct port by default */ +static int smb_direct_port = SMB_DIRECT_PORT_INFINIBAND; + /* The local peer's maximum number of credits to grant to the peer */ static int smb_direct_receive_credit_max = 255; @@ -1942,7 +1947,7 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, KSMBD_TRANS(t)->conn, "ksmbd:r%u", - SMB_DIRECT_PORT); + smb_direct_port); if (IS_ERR(KSMBD_TRANS(t)->handler)) { int ret = PTR_ERR(KSMBD_TRANS(t)->handler); @@ -2019,6 +2024,10 @@ static int smb_direct_ib_client_add(struct ib_device *ib_dev) { struct smb_direct_device *smb_dev; + /* Set 5445 port if device type is iWARP(No IB) */ + if (ib_dev->node_type != RDMA_NODE_IB_CA) + smb_direct_port = SMB_DIRECT_PORT_IWARP; + if (!ib_dev->ops.get_netdev || !rdma_frwr_is_supported(&ib_dev->attrs)) return 0; @@ -2080,7 +2089,7 @@ int ksmbd_rdma_init(void) if (!smb_direct_wq) return -ENOMEM; - ret = smb_direct_listen(SMB_DIRECT_PORT); + ret = smb_direct_listen(smb_direct_port); if (ret) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index ab9250a7cb86..5567d93a6f96 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -7,8 +7,6 @@ #ifndef __KSMBD_TRANSPORT_RDMA_H__ #define __KSMBD_TRANSPORT_RDMA_H__ -#define SMB_DIRECT_PORT 5445 - /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ struct smb_direct_negotiate_req { __le16 min_version; -- cgit v1.2.3-58-ga151 From 004443b3f6d722b455cf963ed7c3edd7f4772405 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 29 Dec 2021 23:08:46 +0900 Subject: ksmbd: add support for smb2 max credit parameter Add smb2 max credits parameter to adjust maximum credits value to limit number of outstanding requests. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/connection.h | 1 - fs/ksmbd/ksmbd_netlink.h | 1 + fs/ksmbd/smb2misc.c | 2 +- fs/ksmbd/smb2ops.c | 16 ++++++++++++---- fs/ksmbd/smb2pdu.c | 8 ++++---- fs/ksmbd/smb2pdu.h | 1 + fs/ksmbd/smb_common.h | 1 + fs/ksmbd/transport_ipc.c | 2 ++ 8 files changed, 22 insertions(+), 10 deletions(-) diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 72dfd155b5bf..42ffb6d9c5d8 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -62,7 +62,6 @@ struct ksmbd_conn { /* References which are made for this Server object*/ atomic_t r_count; unsigned short total_credits; - unsigned short max_credits; spinlock_t credits_lock; wait_queue_head_t req_running_q; /* Lock to protect requests list*/ diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index c6718a05d347..a5c2861792ae 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -103,6 +103,7 @@ struct ksmbd_startup_request { * we set the SPARSE_FILES bit (0x40). */ __u32 sub_auth[3]; /* Subauth value for Security ID */ + __u32 smb2_max_credits; /* MAX credits */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 50d0b1022289..6892d1822269 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -326,7 +326,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n", credit_charge, calc_credit_num); return 1; - } else if (credit_charge > conn->max_credits) { + } else if (credit_charge > conn->vals->max_credits) { ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge); return 1; } diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index 02a44d28bdaf..ab23da2120b9 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -19,6 +19,7 @@ static struct smb_version_values smb21_server_values = { .max_read_size = SMB21_DEFAULT_IOSIZE, .max_write_size = SMB21_DEFAULT_IOSIZE, .max_trans_size = SMB21_DEFAULT_IOSIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -44,6 +45,7 @@ static struct smb_version_values smb30_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -70,6 +72,7 @@ static struct smb_version_values smb302_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -96,6 +99,7 @@ static struct smb_version_values smb311_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -197,7 +201,6 @@ void init_smb2_1_server(struct ksmbd_conn *conn) conn->ops = &smb2_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_HMAC_SHA256_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -215,7 +218,6 @@ void init_smb3_0_server(struct ksmbd_conn *conn) conn->ops = &smb3_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -240,7 +242,6 @@ void init_smb3_02_server(struct ksmbd_conn *conn) conn->ops = &smb3_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -265,7 +266,6 @@ int init_smb3_11_server(struct ksmbd_conn *conn) conn->ops = &smb3_11_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -304,3 +304,11 @@ void init_smb2_max_trans_size(unsigned int sz) smb302_server_values.max_trans_size = sz; smb311_server_values.max_trans_size = sz; } + +void init_smb2_max_credits(unsigned int sz) +{ + smb21_server_values.max_credits = sz; + smb30_server_values.max_credits = sz; + smb302_server_values.max_credits = sz; + smb311_server_values.max_credits = sz; +} diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index beae94f6033a..7fb0545c21ab 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -308,7 +308,7 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) hdr->CreditCharge = req_hdr->CreditCharge; - if (conn->total_credits > conn->max_credits) { + if (conn->total_credits > conn->vals->max_credits) { hdr->CreditRequest = 0; pr_err("Total credits overflow: %d\n", conn->total_credits); return -EINVAL; @@ -329,12 +329,12 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) if (hdr->Command == SMB2_NEGOTIATE) aux_max = 0; else - aux_max = conn->max_credits - credit_charge; + aux_max = conn->vals->max_credits - credit_charge; aux_credits = min_t(unsigned short, aux_credits, aux_max); credits_granted = credit_charge + aux_credits; - if (conn->max_credits - conn->total_credits < credits_granted) - credits_granted = conn->max_credits - + if (conn->vals->max_credits - conn->total_credits < credits_granted) + credits_granted = conn->vals->max_credits - conn->total_credits; conn->total_credits += credits_granted; diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 4a3e4339d4c4..725b800c29c8 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -980,6 +980,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn); void init_smb2_max_read_size(unsigned int sz); void init_smb2_max_write_size(unsigned int sz); void init_smb2_max_trans_size(unsigned int sz); +void init_smb2_max_credits(unsigned int sz); bool is_smb2_neg_cmd(struct ksmbd_work *work); bool is_smb2_rsp(struct ksmbd_work *work); diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 50590842b651..e1369b4345a9 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -365,6 +365,7 @@ struct smb_version_values { __u32 max_read_size; __u32 max_write_size; __u32 max_trans_size; + __u32 max_credits; __u32 large_lock_type; __u32 exclusive_lock_type; __u32 shared_lock_type; diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 1acf1892a466..3ad6881e0f7e 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -301,6 +301,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); + if (req->smb2_max_credits) + init_smb2_max_credits(req->smb2_max_credits); ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); -- cgit v1.2.3-58-ga151 From 914d7e5709ac59ded70bea7956d408fe2acd7c3c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 29 Dec 2021 23:10:03 +0900 Subject: ksmbd: move credit charge deduction under processing request Moves the credit charge deduction from total_credits under the processing a request. When repeating smb2 lock request and other command request, there will be a problem that ->total_credits does not decrease. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2misc.c | 7 ++----- fs/ksmbd/smb2pdu.c | 16 ++++++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 6892d1822269..fedcb753c7af 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -289,7 +289,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge); void *__hdr = hdr; - int ret; + int ret = 0; switch (hdr->Command) { case SMB2_QUERY_INFO: @@ -332,10 +332,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, } spin_lock(&conn->credits_lock); - if (credit_charge <= conn->total_credits) { - conn->total_credits -= credit_charge; - ret = 0; - } else { + if (credit_charge > conn->total_credits) { ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", credit_charge, conn->total_credits); ret = 1; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 7fb0545c21ab..706191f5e475 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -299,9 +299,8 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); struct smb2_hdr *hdr = ksmbd_resp_buf_next(work); struct ksmbd_conn *conn = work->conn; - unsigned short credits_requested; + unsigned short credits_requested, aux_max; unsigned short credit_charge, credits_granted = 0; - unsigned short aux_max, aux_credits; if (work->send_no_response) return 0; @@ -316,6 +315,13 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) credit_charge = max_t(unsigned short, le16_to_cpu(req_hdr->CreditCharge), 1); + if (credit_charge > conn->total_credits) { + ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", + credit_charge, conn->total_credits); + return -EINVAL; + } + + conn->total_credits -= credit_charge; credits_requested = max_t(unsigned short, le16_to_cpu(req_hdr->CreditRequest), 1); @@ -325,13 +331,11 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) * TODO: Need to adjuct CreditRequest value according to * current cpu load */ - aux_credits = credits_requested - 1; if (hdr->Command == SMB2_NEGOTIATE) - aux_max = 0; + aux_max = 1; else aux_max = conn->vals->max_credits - credit_charge; - aux_credits = min_t(unsigned short, aux_credits, aux_max); - credits_granted = credit_charge + aux_credits; + credits_granted = min_t(unsigned short, credits_requested, aux_max); if (conn->vals->max_credits - conn->total_credits < credits_granted) credits_granted = conn->vals->max_credits - -- cgit v1.2.3-58-ga151 From b589f5db6d4af8f14d70e31e1276b4c017668a26 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 31 Dec 2021 09:26:25 +0900 Subject: ksmbd: limits exceeding the maximum allowable outstanding requests If the client ignores the CreditResponse received from the server and continues to send the request, ksmbd limits the requests if it exceeds smb2 max credits. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/connection.c | 1 + fs/ksmbd/connection.h | 3 ++- fs/ksmbd/smb2misc.c | 9 +++++++++ fs/ksmbd/smb2pdu.c | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 83a94d0bb480..d1d0105be5b1 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -62,6 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; + conn->outstanding_credits = 1; init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 42ffb6d9c5d8..7e0730a262da 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -61,7 +61,8 @@ struct ksmbd_conn { atomic_t req_running; /* References which are made for this Server object*/ atomic_t r_count; - unsigned short total_credits; + unsigned int total_credits; + unsigned int outstanding_credits; spinlock_t credits_lock; wait_queue_head_t req_running_q; /* Lock to protect requests list*/ diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index fedcb753c7af..4a9460153b59 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -337,7 +337,16 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, credit_charge, conn->total_credits); ret = 1; } + + if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) { + ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n", + credit_charge, conn->outstanding_credits); + ret = 1; + } else + conn->outstanding_credits += credit_charge; + spin_unlock(&conn->credits_lock); + return ret; } diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 706191f5e475..867ed982f729 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -322,6 +322,7 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) } conn->total_credits -= credit_charge; + conn->outstanding_credits -= credit_charge; credits_requested = max_t(unsigned short, le16_to_cpu(req_hdr->CreditRequest), 1); -- cgit v1.2.3-58-ga151 From 99b7650ac51847e81b4d5139824e321e6cb76130 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Tue, 4 Jan 2022 14:56:26 +0900 Subject: ksmbd: smbd: call rdma_accept() under CM handler if CONFIG_LOCKDEP is enabled, the following kernel warning message is generated because rdma_accept() checks whehter the handler_mutex is held by lockdep_assert_held. CM(Connection Manager) holds the mutex before CM handler callback is called. [ 63.211405 ] WARNING: CPU: 1 PID: 345 at drivers/infiniband/core/cma.c:4405 rdma_accept+0x17a/0x350 [ 63.212080 ] RIP: 0010:rdma_accept+0x17a/0x350 ... [ 63.214036 ] Call Trace: [ 63.214098 ] [ 63.214185 ] smb_direct_accept_client+0xb4/0x170 [ksmbd] [ 63.214412 ] smb_direct_prepare+0x322/0x8c0 [ksmbd] [ 63.214555 ] ? rcu_read_lock_sched_held+0x3a/0x70 [ 63.214700 ] ksmbd_conn_handler_loop+0x63/0x270 [ksmbd] [ 63.214826 ] ? ksmbd_conn_alive+0x80/0x80 [ksmbd] [ 63.214952 ] kthread+0x171/0x1a0 [ 63.215039 ] ? set_kthread_struct+0x40/0x40 [ 63.215128 ] ret_from_fork+0x22/0x30 To avoid this, move creating a queue pair and accepting a client from transport_ops->prepare() to smb_direct_handle_connect_request(). Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 102 +++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 43 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index f89b64e27836..0fd706d01790 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -568,6 +568,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } t->negotiation_requested = true; t->full_packet_received = true; + enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); break; case SMB_DIRECT_MSG_DATA_TRANSFER: { @@ -1594,19 +1595,13 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) pr_err("error at rdma_accept: %d\n", ret); return ret; } - - wait_event_interruptible(t->wait_status, - t->status != SMB_DIRECT_CS_NEW); - if (t->status != SMB_DIRECT_CS_CONNECTED) - return -ENOTCONN; return 0; } -static int smb_direct_negotiate(struct smb_direct_transport *t) +static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) { int ret; struct smb_direct_recvmsg *recvmsg; - struct smb_direct_negotiate_req *req; recvmsg = get_free_recvmsg(t); if (!recvmsg) @@ -1616,44 +1611,20 @@ static int smb_direct_negotiate(struct smb_direct_transport *t) ret = smb_direct_post_recv(t, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); - goto out; + goto out_err; } t->negotiation_requested = false; ret = smb_direct_accept_client(t); if (ret) { pr_err("Can't accept client\n"); - goto out; + goto out_err; } smb_direct_post_recv_credits(&t->post_recv_credits_work.work); - - ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); - ret = wait_event_interruptible_timeout(t->wait_status, - t->negotiation_requested || - t->status == SMB_DIRECT_CS_DISCONNECTED, - SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); - if (ret <= 0 || t->status == SMB_DIRECT_CS_DISCONNECTED) { - ret = ret < 0 ? ret : -ETIMEDOUT; - goto out; - } - - ret = smb_direct_check_recvmsg(recvmsg); - if (ret == -ECONNABORTED) - goto out; - - req = (struct smb_direct_negotiate_req *)recvmsg->packet; - t->max_recv_size = min_t(int, t->max_recv_size, - le32_to_cpu(req->preferred_send_size)); - t->max_send_size = min_t(int, t->max_send_size, - le32_to_cpu(req->max_receive_size)); - t->max_fragmented_send_size = - le32_to_cpu(req->max_fragmented_size); - - ret = smb_direct_send_negotiate_response(t, ret); -out: - if (recvmsg) - put_recvmsg(t, recvmsg); + return 0; +out_err: + put_recvmsg(t, recvmsg); return ret; } @@ -1890,6 +1861,47 @@ err: static int smb_direct_prepare(struct ksmbd_transport *t) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_recvmsg *recvmsg; + struct smb_direct_negotiate_req *req; + int ret; + + ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); + ret = wait_event_interruptible_timeout(st->wait_status, + st->negotiation_requested || + st->status == SMB_DIRECT_CS_DISCONNECTED, + SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); + if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED) + return ret < 0 ? ret : -ETIMEDOUT; + + recvmsg = get_first_reassembly(st); + if (!recvmsg) + return -ECONNABORTED; + + ret = smb_direct_check_recvmsg(recvmsg); + if (ret == -ECONNABORTED) + goto out; + + req = (struct smb_direct_negotiate_req *)recvmsg->packet; + st->max_recv_size = min_t(int, st->max_recv_size, + le32_to_cpu(req->preferred_send_size)); + st->max_send_size = min_t(int, st->max_send_size, + le32_to_cpu(req->max_receive_size)); + st->max_fragmented_send_size = + le32_to_cpu(req->max_fragmented_size); + + ret = smb_direct_send_negotiate_response(st, ret); +out: + spin_lock_irq(&st->reassembly_queue_lock); + st->reassembly_queue_length--; + list_del(&recvmsg->list); + spin_unlock_irq(&st->reassembly_queue_lock); + put_recvmsg(st, recvmsg); + + return ret; +} + +static int smb_direct_connect(struct smb_direct_transport *st) +{ int ret; struct ib_qp_cap qp_cap; @@ -1911,13 +1923,11 @@ static int smb_direct_prepare(struct ksmbd_transport *t) return ret; } - ret = smb_direct_negotiate(st); + ret = smb_direct_prepare_negotiation(st); if (ret) { pr_err("Can't negotiate: %d\n", ret); return ret; } - - st->status = SMB_DIRECT_CS_CONNECTED; return 0; } @@ -1933,6 +1943,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) { struct smb_direct_transport *t; + int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { ksmbd_debug(RDMA, @@ -1945,18 +1956,23 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) if (!t) return -ENOMEM; + ret = smb_direct_connect(t); + if (ret) + goto out_err; + KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, KSMBD_TRANS(t)->conn, "ksmbd:r%u", smb_direct_port); if (IS_ERR(KSMBD_TRANS(t)->handler)) { - int ret = PTR_ERR(KSMBD_TRANS(t)->handler); - + ret = PTR_ERR(KSMBD_TRANS(t)->handler); pr_err("Can't start thread\n"); - free_transport(t); - return ret; + goto out_err; } return 0; +out_err: + free_transport(t); + return ret; } static int smb_direct_listen_handler(struct rdma_cm_id *cm_id, -- cgit v1.2.3-58-ga151 From 41dbda16a0902798e732abc6599de256b9dc3b27 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 6 Jan 2022 10:30:31 +0900 Subject: ksmbd: add reserved room in ipc request/response Whenever new parameter is added to smb configuration, It is possible to break the execution of the IPC daemon by mismatch size of request/response. This patch tries to reserve space in ipc request/response in advance to prevent that. Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/ksmbd_netlink.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index a5c2861792ae..71bfb7de4472 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -104,6 +104,7 @@ struct ksmbd_startup_request { */ __u32 sub_auth[3]; /* Subauth value for Security ID */ __u32 smb2_max_credits; /* MAX credits */ + __u32 reserved[128]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; @@ -114,7 +115,7 @@ struct ksmbd_startup_request { * IPC request to shutdown ksmbd server. */ struct ksmbd_shutdown_request { - __s32 reserved; + __s32 reserved[16]; }; /* @@ -123,6 +124,7 @@ struct ksmbd_shutdown_request { struct ksmbd_login_request { __u32 handle; __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -136,6 +138,7 @@ struct ksmbd_login_response { __u16 status; __u16 hash_sz; /* hash size */ __s8 hash[KSMBD_REQ_MAX_HASH_SZ]; /* password hash */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -144,6 +147,7 @@ struct ksmbd_login_response { struct ksmbd_share_config_request { __u32 handle; __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; /* share name */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -158,6 +162,7 @@ struct ksmbd_share_config_response { __u16 force_directory_mode; __u16 force_uid; __u16 force_gid; + __u32 reserved[128]; /* Reserved room */ __u32 veto_list_sz; __s8 ____payload[]; }; @@ -188,6 +193,7 @@ struct ksmbd_tree_connect_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; __s8 share[KSMBD_REQ_MAX_SHARE_NAME]; __s8 peer_addr[64]; + __u32 reserved[16]; /* Reserved room */ }; /* @@ -197,6 +203,7 @@ struct ksmbd_tree_connect_response { __u32 handle; __u16 status; __u16 connection_flags; + __u32 reserved[16]; /* Reserved room */ }; /* @@ -205,6 +212,7 @@ struct ksmbd_tree_connect_response { struct ksmbd_tree_disconnect_request { __u64 session_id; /* session id */ __u64 connect_id; /* tree connection id */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -213,6 +221,7 @@ struct ksmbd_tree_disconnect_request { struct ksmbd_logout_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ __u32 account_flags; + __u32 reserved[16]; /* Reserved room */ }; /* -- cgit v1.2.3-58-ga151 From c9f189271cff85d5d735e25dfa4bc95952ec12d8 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Fri, 7 Jan 2022 14:45:30 +0900 Subject: ksmbd: smbd: create MR pool Create a memory region pool because rdma_rw_ctx_init() uses memory registration if memory registration yields better performance than using multiple SGE entries. Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 0fd706d01790..f0b17da1cac2 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -428,6 +428,7 @@ static void free_transport(struct smb_direct_transport *t) if (t->qp) { ib_drain_qp(t->qp); + ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs); ib_destroy_qp(t->qp); } @@ -1708,7 +1709,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; - cap->max_rdma_ctxs = 0; + cap->max_rdma_ctxs = + rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) * + smb_direct_max_outstanding_rw_ops; return 0; } @@ -1790,6 +1793,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, { int ret; struct ib_qp_init_attr qp_attr; + int pages_per_rw; t->pd = ib_alloc_pd(t->cm_id->device, 0); if (IS_ERR(t->pd)) { @@ -1837,6 +1841,23 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, t->qp = t->cm_id->qp; t->cm_id->event_handler = smb_direct_cm_handler; + pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; + if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { + int pages_per_mr, mr_count; + + pages_per_mr = min_t(int, pages_per_rw, + t->cm_id->device->attrs.max_fast_reg_page_list_len); + mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) * + atomic_read(&t->rw_avail_ops); + ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count, + IB_MR_TYPE_MEM_REG, pages_per_mr, 0); + if (ret) { + pr_err("failed to init mr pool count %d pages %d\n", + mr_count, pages_per_mr); + goto err; + } + } + return 0; err: if (t->qp) { -- cgit v1.2.3-58-ga151 From 4d02c4fdc0e256b493f9a3b604c7ff18f0019f17 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Fri, 7 Jan 2022 14:45:31 +0900 Subject: ksmbd: smbd: change the default maximum read/write, receive size Due to restriction that cannot handle multiple buffer descriptor structures, decrease the maximum read/write size for Windows clients. And set the maximum fragmented receive size in consideration of the receive queue size. Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/transport_rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index f0b17da1cac2..86fd64511512 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -80,7 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 1024 * 1024; +static int smb_direct_max_read_write_size = 1048512; static int smb_direct_max_outstanding_rw_ops = 8; @@ -1908,7 +1908,9 @@ static int smb_direct_prepare(struct ksmbd_transport *t) st->max_send_size = min_t(int, st->max_send_size, le32_to_cpu(req->max_receive_size)); st->max_fragmented_send_size = - le32_to_cpu(req->max_fragmented_size); + le32_to_cpu(req->max_fragmented_size); + st->max_fragmented_recv_size = + (st->recv_credit_max * st->max_recv_size) / 2; ret = smb_direct_send_negotiate_response(st, ret); out: -- cgit v1.2.3-58-ga151 From 136dff3a6b71dc16c30b35cc390feb0bfc32ed50 Mon Sep 17 00:00:00 2001 From: Yufan Chen Date: Sun, 9 Jan 2022 11:34:16 +0900 Subject: ksmbd: add smb-direct shutdown When killing ksmbd server after connecting rdma, ksmbd threads does not terminate properly because the rdma connection is still alive. This patch add shutdown operation to disconnect rdma connection while ksmbd threads terminate. Signed-off-by: Yufan Chen Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/connection.c | 9 ++++++++- fs/ksmbd/connection.h | 1 + fs/ksmbd/transport_rdma.c | 10 ++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index d1d0105be5b1..208d2cff7bd3 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -387,17 +387,24 @@ out: static void stop_sessions(void) { struct ksmbd_conn *conn; + struct ksmbd_transport *t; again: read_lock(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { struct task_struct *task; - task = conn->transport->handler; + t = conn->transport; + task = t->handler; if (task) ksmbd_debug(CONN, "Stop session handler %s/%d\n", task->comm, task_pid_nr(task)); conn->status = KSMBD_SESS_EXITING; + if (t->ops->shutdown) { + read_unlock(&conn_list_lock); + t->ops->shutdown(t); + read_lock(&conn_list_lock); + } } read_unlock(&conn_list_lock); diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index 7e0730a262da..7a59aacb5daa 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -117,6 +117,7 @@ struct ksmbd_conn_ops { struct ksmbd_transport_ops { int (*prepare)(struct ksmbd_transport *t); void (*disconnect)(struct ksmbd_transport *t); + void (*shutdown)(struct ksmbd_transport *t); int (*read)(struct ksmbd_transport *t, char *buf, unsigned int size); int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, int size, bool need_invalidate_rkey, diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 86fd64511512..3c1ec1ac0b27 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -1453,6 +1453,15 @@ static void smb_direct_disconnect(struct ksmbd_transport *t) free_transport(st); } +static void smb_direct_shutdown(struct ksmbd_transport *t) +{ + struct smb_direct_transport *st = smb_trans_direct_transfort(t); + + ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id); + + smb_direct_disconnect_rdma_work(&st->disconnect_work); +} + static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -2201,6 +2210,7 @@ out: static struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .prepare = smb_direct_prepare, .disconnect = smb_direct_disconnect, + .shutdown = smb_direct_shutdown, .writev = smb_direct_writev, .read = smb_direct_read, .rdma_read = smb_direct_rdma_read, -- cgit v1.2.3-58-ga151 From f3193ea1b6779023334faa72b214ece457e02656 Mon Sep 17 00:00:00 2001 From: Karl Kurbjun Date: Sun, 9 Jan 2022 20:49:35 -0700 Subject: HID: Ignore battery for Elan touchscreen on HP Envy X360 15t-dr100 Battery status on Elan tablet driver is reported for the HP ENVY x360 15t-dr100. There is no separate battery for the Elan controller resulting in a battery level report of 0% or 1% depending on whether a stylus has interacted with the screen. These low battery level reports causes a variety of bad behavior in desktop environments. This patch adds the appropriate quirk to indicate that the batery status is unused for this target. Cc: stable@vger.kernel.org Signed-off-by: Karl Kurbjun Signed-off-by: Jiri Kosina --- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-input.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 26cee452ec44..85975031389b 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -400,6 +400,7 @@ #define USB_DEVICE_ID_HP_X2 0x074d #define USB_DEVICE_ID_HP_X2_10_COVER 0x0755 #define I2C_DEVICE_ID_HP_ENVY_X360_15 0x2d05 +#define I2C_DEVICE_ID_HP_ENVY_X360_15T_DR100 0x29CF #define I2C_DEVICE_ID_HP_SPECTRE_X360_15 0x2817 #define USB_DEVICE_ID_ASUS_UX550VE_TOUCHSCREEN 0x2544 #define USB_DEVICE_ID_ASUS_UX550_TOUCHSCREEN 0x2706 diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 1ce75e8b49d5..112901d2d8d2 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -330,6 +330,8 @@ static const struct hid_device_id hid_battery_quirks[] = { HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_ENVY_X360_15), HID_BATTERY_QUIRK_IGNORE }, + { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_ENVY_X360_15T_DR100), + HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_HP_SPECTRE_X360_15), HID_BATTERY_QUIRK_IGNORE }, { HID_I2C_DEVICE(USB_VENDOR_ID_ELAN, I2C_DEVICE_ID_SURFACE_GO_TOUCHSCREEN), -- cgit v1.2.3-58-ga151 From 90e12a3191040bd3854d3e236c35921e4e92a044 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:29 -0500 Subject: NFSv4 remove zero number of fs_locations entries error check Remove the check for the zero length fs_locations reply in the xdr decoding, and instead check for that in the migration code. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4state.c | 3 +++ fs/nfs/nfs4xdr.c | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f63dfa01001c..f3265575c28d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2106,6 +2106,9 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } result = -NFS4ERR_NXIO; + if (!locations->nlocations) + goto out; + if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) { dprintk("<-- %s: No fs_locations data, migration skipped\n", __func__); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a9487c840052..8e70b92df4cc 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3732,8 +3732,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st if (unlikely(!p)) goto out_eio; n = be32_to_cpup(p); - if (n <= 0) - goto out_eio; for (res->nlocations = 0; res->nlocations < n; res->nlocations++) { u32 m; struct nfs4_fs_location *loc; -- cgit v1.2.3-58-ga151 From 8a59bb93b7e3cca389af44781a429ac12ac49be6 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:30 -0500 Subject: NFSv4 store server support for fs_location attribute Define and store if server returns it supports fs_locations attribute as a capability. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 ++ include/linux/nfs_fs_sb.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 44ab27f20f04..7d4c63282793 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3875,6 +3875,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f if (res.attr_bitmask[2] & FATTR4_WORD2_SECURITY_LABEL) server->caps |= NFS_CAP_SECURITY_LABEL; #endif + if (res.attr_bitmask[0] & FATTR4_WORD0_FS_LOCATIONS) + server->caps |= NFS_CAP_FS_LOCATIONS; if (!(res.attr_bitmask[0] & FATTR4_WORD0_FILEID)) server->fattr_valid &= ~NFS_ATTR_FATTR_FILEID; if (!(res.attr_bitmask[1] & FATTR4_WORD1_MODE)) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index f24fc67af42d..8c08b356c8ca 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -289,5 +289,5 @@ struct nfs_server { #define NFS_CAP_COPY_NOTIFY (1U << 27) #define NFS_CAP_XATTR (1U << 28) #define NFS_CAP_READ_PLUS (1U << 29) - +#define NFS_CAP_FS_LOCATIONS (1U << 30) #endif -- cgit v1.2.3-58-ga151 From 180d0eb290a5d11e6d3d99fea0841ceae2893901 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 13 Jan 2022 11:58:05 +0100 Subject: parisc: Add visible flag to toc_stack variable Add the visible flag to the toc_stack variable to make it visible for assembly code and to avoid a sparse warning. Reported-by: kernel test robot Signed-off-by: Helge Deller --- arch/parisc/kernel/toc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/toc.c b/arch/parisc/kernel/toc.c index be9a0bebe61e..fa5a10eaf0aa 100644 --- a/arch/parisc/kernel/toc.c +++ b/arch/parisc/kernel/toc.c @@ -12,7 +12,7 @@ #include static unsigned int __aligned(16) toc_lock = 1; -DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack); +DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack) __visible; static void toc20_to_pt_regs(struct pt_regs *regs, struct pdc_toc_pim_20 *toc) { -- cgit v1.2.3-58-ga151 From 2d7c86a8f9cdce1408c4f3c69d94d007eff2f179 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:50 +0530 Subject: libceph: generalize addr/ip parsing based on delimiter ... and remove hardcoded function name in ceph_parse_ips(). [ idryomov: delim parameter, drop CEPH_ADDR_PARSE_DEFAULT_DELIM ] Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 3 ++- fs/ceph/super.c | 2 +- include/linux/ceph/libceph.h | 2 +- include/linux/ceph/messenger.h | 2 +- net/ceph/ceph_common.c | 9 ++++----- net/ceph/messenger.c | 15 ++++++++------- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 953fa134cd3d..909dbe6111bf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6497,7 +6497,8 @@ static int rbd_add_parse_args(const char *buf, pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; pctx.opts->trim = RBD_TRIM_DEFAULT; - ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL); + ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL, + ','); if (ret) goto out_err; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bab61232dc5a..c444371ebc38 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -272,7 +272,7 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dout("server path '%s'\n", fsopt->server_path); ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log); + pctx->copts, fc->log.log, ','); if (ret) return ret; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 409d8c29bc4f..c72285d8594e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -301,7 +301,7 @@ struct fs_parameter; struct fc_log; struct ceph_options *ceph_alloc_options(void); int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, - struct fc_log *l); + struct fc_log *l, char delim); int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, struct fc_log *l); int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 0e6e9ad3c3bf..ff99ce094cfa 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -532,7 +532,7 @@ extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count); + int max_count, int *count, char delim); extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 97d6ea763e32..851b0c4c5730 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -422,14 +422,14 @@ out: } int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, - struct fc_log *l) + struct fc_log *l, char delim) { struct p_log log = {.prefix = "libceph", .log = l}; int ret; - /* ip1[:port1][,ip2[:port2]...] */ + /* ip1[:port1][ip2[:port2]...] */ ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON, - &opt->num_mon); + &opt->num_mon, delim); if (ret) { error_plog(&log, "Failed to parse monitor IPs: %d", ret); return ret; @@ -455,8 +455,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, case Opt_ip: err = ceph_parse_ips(param->string, param->string + param->size, - &opt->my_addr, - 1, NULL); + &opt->my_addr, 1, NULL, ','); if (err) { error_plog(&log, "Failed to parse ip: %d", err); return err; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57d043b382ed..929ed91f2ec3 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1267,30 +1267,31 @@ static int ceph_parse_server_name(const char *name, size_t namelen, */ int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count) + int max_count, int *count, char delim) { int i, ret = -EINVAL; const char *p = c; dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { + char cur_delim = delim; const char *ipend; int port; - char delim = ','; if (*p == '[') { - delim = ']'; + cur_delim = ']'; p++; } - ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend); + ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim, + &ipend); if (ret) goto bad; ret = -EINVAL; p = ipend; - if (delim == ']') { + if (cur_delim == ']') { if (*p != ']') { dout("missing matching ']'\n"); goto bad; @@ -1326,11 +1327,11 @@ int ceph_parse_ips(const char *c, const char *end, addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; addr[i].nonce = 0; - dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); + dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i])); if (p == end) break; - if (*p != ',') + if (*p != delim) goto bad; p++; } -- cgit v1.2.3-58-ga151 From 4153c7fc937a2afa077dbdb9fe3189b9981f423c Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:51 +0530 Subject: libceph: rename parse_fsid() to ceph_parse_fsid() and export ... as it is too generic. also, use __func__ when logging rather than hardcoding the function name. Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 1 + net/ceph/ceph_common.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index c72285d8594e..644f224eccf7 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -296,6 +296,7 @@ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); extern void *ceph_kvmalloc(size_t size, gfp_t flags); +extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid); struct fs_parameter; struct fc_log; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 851b0c4c5730..decae43b4262 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -217,14 +217,14 @@ void *ceph_kvmalloc(size_t size, gfp_t flags) return p; } -static int parse_fsid(const char *str, struct ceph_fsid *fsid) +int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; char tmp[3]; int err = -EINVAL; int d; - dout("parse_fsid '%s'\n", str); + dout("%s '%s'\n", __func__, str); tmp[2] = 0; while (*str && i < 16) { if (ispunct(*str)) { @@ -244,9 +244,10 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid) if (i == 16) err = 0; - dout("parse_fsid ret %d got fsid %pU\n", err, fsid); + dout("%s ret %d got fsid %pU\n", __func__, err, fsid); return err; } +EXPORT_SYMBOL(ceph_parse_fsid); /* * ceph options @@ -464,7 +465,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, break; case Opt_fsid: - err = parse_fsid(param->string, &opt->fsid); + err = ceph_parse_fsid(param->string, &opt->fsid); if (err) { error_plog(&log, "Failed to parse fsid: %d", err); return err; -- cgit v1.2.3-58-ga151 From 7b19b4db5add8d9f50e854907a82a10ba4d27c42 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:52 +0530 Subject: ceph: new device mount syntax Old mount device syntax (source) has the following problems: - mounts to the same cluster but with different fsnames and/or creds have identical device string which can confuse xfstests. - Userspace mount helper tool resolves monitor addresses and fill in mon addrs automatically, but that means the device shown in /proc/mounts is different than what was used for mounting. New device syntax is as follows: cephuser@fsid.mycephfs2=/path Note, there is no "monitor address" in the device string. That gets passed in as mount option. This keeps the device string same when monitor addresses change (on remounts). Also note that the userspace mount helper tool is backward compatible. I.e., the mount helper will fallback to using old syntax after trying to mount with the new syntax. [ idryomov: drop CEPH_MON_ADDR_MNTOPT_DELIM ] Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- fs/ceph/super.h | 3 ++ 2 files changed, 134 insertions(+), 10 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c444371ebc38..e2c40d055711 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -146,6 +146,7 @@ enum { Opt_mds_namespace, Opt_recover_session, Opt_source, + Opt_mon_addr, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -197,6 +198,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), {} @@ -228,9 +230,92 @@ static void canonicalize_path(char *path) } /* - * Parse the source parameter. Distinguish the server list from the path. + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. + */ +static int namespace_equals(struct ceph_mount_options *fsopt, + const char *namespace, size_t len) +{ + return !(fsopt->mds_namespace && + (strlen(fsopt->mds_namespace) != len || + strncmp(fsopt->mds_namespace, namespace, len))); +} + +static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + int r; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + if (*dev_name_end != ':') + return invalfc(fc, "separator ':' missing in source"); + + r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name, + pctx->copts, fc->log.log, ','); + if (r) + return r; + + fsopt->new_dev_syntax = false; + return 0; +} + +static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + size_t len; + struct ceph_fsid fsid; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *fsid_start, *fs_name_start; + + if (*dev_name_end != '=') { + dout("separator '=' missing in source"); + return -EINVAL; + } + + fsid_start = strchr(dev_name, '@'); + if (!fsid_start) + return invalfc(fc, "missing cluster fsid"); + ++fsid_start; /* start of cluster fsid */ + + fs_name_start = strchr(fsid_start, '.'); + if (!fs_name_start) + return invalfc(fc, "missing file system name"); + + if (ceph_parse_fsid(fsid_start, &fsid)) + return invalfc(fc, "Invalid FSID"); + + ++fs_name_start; /* start of file system name */ + len = dev_name_end - fs_name_start; + + if (!namespace_equals(fsopt, fs_name_start, len)) + return invalfc(fc, "Mismatching mds_namespace"); + kfree(fsopt->mds_namespace); + fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL); + if (!fsopt->mds_namespace) + return -ENOMEM; + dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace); + + fsopt->new_dev_syntax = true; + return 0; +} + +/* + * Parse the source parameter for new device format. Distinguish the device + * spec from the path. Try parsing new device format and fallback to old + * format if needed. + * + * New device syntax will looks like: + * =/ + * where + * is name@fsid.fsname + * is optional, but if present must begin with '/' + * (monitor addresses are passed via mount option) * - * The source will look like: + * Old device syntax is: * [,...]:[] * where * is [:] @@ -263,24 +348,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dev_name_end = dev_name + strlen(dev_name); } - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') - return invalfc(fc, "No path or : separator in source"); + dev_name_end--; /* back up to separator */ + if (dev_name_end < dev_name) + return invalfc(fc, "Path missing in source"); dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); if (fsopt->server_path) dout("server path '%s'\n", fsopt->server_path); - ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log, ','); - if (ret) - return ret; + dout("trying new device syntax"); + ret = ceph_parse_new_source(dev_name, dev_name_end, fc); + if (ret) { + if (ret != -EINVAL) + return ret; + dout("trying old device syntax"); + ret = ceph_parse_old_source(dev_name, dev_name_end, fc); + if (ret) + return ret; + } fc->source = param->string; param->string = NULL; return 0; } +static int ceph_parse_mon_addr(struct fs_parameter *param, + struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + kfree(fsopt->mon_addr); + fsopt->mon_addr = param->string; + param->string = NULL; + + return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr), + pctx->copts, fc->log.log, '/'); +} + static int ceph_parse_mount_param(struct fs_context *fc, struct fs_parameter *param) { @@ -306,6 +411,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, param->string = NULL; break; case Opt_mds_namespace: + if (!namespace_equals(fsopt, param->string, strlen(param->string))) + return invalfc(fc, "Mismatching mds_namespace"); kfree(fsopt->mds_namespace); fsopt->mds_namespace = param->string; param->string = NULL; @@ -323,6 +430,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, if (fc->source) return invalfc(fc, "Multiple sources specified"); return ceph_parse_source(param, fc); + case Opt_mon_addr: + return ceph_parse_mon_addr(param, fc); case Opt_wsize: if (result.uint_32 < PAGE_SIZE || result.uint_32 > CEPH_MAX_WRITE_SIZE) @@ -474,6 +583,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->mds_namespace); kfree(args->server_path); kfree(args->fscache_uniq); + kfree(args->mon_addr); kfree(args); } @@ -517,6 +627,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } @@ -572,9 +686,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) seq_puts(m, ",copyfrom"); - if (fsopt->mds_namespace) + /* dump mds_namespace when old device syntax is in use */ + if (fsopt->mds_namespace && !fsopt->new_dev_syntax) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + if (fsopt->mon_addr) + seq_printf(m, ",mon_addr=%s", fsopt->mon_addr); + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); @@ -1060,6 +1178,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) static int ceph_get_tree(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; @@ -1071,6 +1190,8 @@ static int ceph_get_tree(struct fs_context *fc) if (!fc->source) return invalfc(fc, "No source"); + if (fsopt->new_dev_syntax && !fsopt->mon_addr) + return invalfc(fc, "No monitor address"); /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ac331aa07cfa..ec6b221e5b62 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -89,6 +89,8 @@ struct ceph_mount_options { unsigned int max_readdir; /* max readdir result (entries) */ unsigned int max_readdir_bytes; /* max readdir result (bytes) */ + bool new_dev_syntax; + /* * everything above this point can be memcmp'd; everything below * is handled in compare_mount_options() @@ -98,6 +100,7 @@ struct ceph_mount_options { char *mds_namespace; /* default NULL */ char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ + char *mon_addr; }; struct ceph_fs_client { -- cgit v1.2.3-58-ga151 From 2167f2cc686a97911a0b06ba9c97cec304b7c432 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:53 +0530 Subject: ceph: record updated mon_addr on remount Note that the new monitors are just shown in /proc/mounts. Ceph does not (re)connect to new monitors yet. [ jlayton: s/printk\(KERN_NOTICE/pr_notice(/ s/strcmp/strcmp_null/ ] Signed-off-by: Venky Shankar Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e2c40d055711..31b5786cd98f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1277,6 +1277,13 @@ static int ceph_reconfigure_fc(struct fs_context *fc) else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { + kfree(fsc->mount_options->mon_addr); + fsc->mount_options->mon_addr = fsopt->mon_addr; + fsopt->mon_addr = NULL; + pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); + } + sync_filesystem(fc->root->d_sb); return 0; } -- cgit v1.2.3-58-ga151 From e1b9eb50763d108166651ca67aae09893332c6b0 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:54 +0530 Subject: doc: document new CephFS mount device syntax Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- Documentation/filesystems/ceph.rst | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/ceph.rst b/Documentation/filesystems/ceph.rst index 7d2ef4e27273..4942e018db85 100644 --- a/Documentation/filesystems/ceph.rst +++ b/Documentation/filesystems/ceph.rst @@ -82,7 +82,7 @@ Mount Syntax The basic mount syntax is:: - # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt + # mount -t ceph user@fsid.fs_name=/[subdir] mnt -o mon_addr=monip1[:port][/monip2[:port]] You only need to specify a single monitor, as the client will get the full list when it connects. (However, if the monitor you specify @@ -90,16 +90,35 @@ happens to be down, the mount won't succeed.) The port can be left off if the monitor is using the default. So if the monitor is at 1.2.3.4:: - # mount -t ceph 1.2.3.4:/ /mnt/ceph + # mount -t ceph cephuser@07fe3187-00d9-42a3-814b-72a4d5e7d5be.cephfs=/ /mnt/ceph -o mon_addr=1.2.3.4 is sufficient. If /sbin/mount.ceph is installed, a hostname can be -used instead of an IP address. +used instead of an IP address and the cluster FSID can be left out +(as the mount helper will fill it in by reading the ceph configuration +file):: + # mount -t ceph cephuser@cephfs=/ /mnt/ceph -o mon_addr=mon-addr +Multiple monitor addresses can be passed by separating each address with a slash (`/`):: + + # mount -t ceph cephuser@cephfs=/ /mnt/ceph -o mon_addr=192.168.1.100/192.168.1.101 + +When using the mount helper, monitor address can be read from ceph +configuration file if available. Note that, the cluster FSID (passed as part +of the device string) is validated by checking it with the FSID reported by +the monitor. Mount Options ============= + mon_addr=ip_address[:port][/ip_address[:port]] + Monitor address to the cluster. This is used to bootstrap the + connection to the cluster. Once connection is established, the + monitor addresses in the monitor map are followed. + + fsid=cluster-id + FSID of the cluster (from `ceph fsid` command). + ip=A.B.C.D[:N] Specify the IP and/or port the client should bind to locally. There is normally not much reason to do this. If the IP is not -- cgit v1.2.3-58-ga151 From adbed05ed62d1f3b6f6c5cb88ec52c1ffafc0fd9 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 3 Nov 2021 10:30:39 +0530 Subject: ceph: mount syntax module parameter Add read-only module parameters for supported mount syntaxes. Primary user is the user-space mount helper for catching v2 syntax bugs during testing by cross verifying if the kernel supports v2 syntax on mount failure. Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 31b5786cd98f..2166fc2c1625 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1461,6 +1461,14 @@ bool disable_send_metrics = false; module_param_cb(disable_send_metrics, ¶m_ops_metrics, &disable_send_metrics, 0644); MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); +/* for both v1 and v2 syntax */ +static bool mount_support = true; +static const struct kernel_param_ops param_ops_mount_syntax = { + .get = param_get_bool, +}; +module_param_cb(mount_syntax_v1, ¶m_ops_mount_syntax, &mount_support, 0444); +module_param_cb(mount_syntax_v2, ¶m_ops_mount_syntax, &mount_support, 0444); + module_init(init_ceph); module_exit(exit_ceph); -- cgit v1.2.3-58-ga151 From 8e55ba8caae5cd380b1c9c81a426602a667e110e Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Wed, 10 Nov 2021 23:30:21 +0530 Subject: ceph: Fix incorrect statfs report for small quota Problem: The statfs reports incorrect free/available space for quota less then CEPH_BLOCK size (4M). Solution: For quota less than CEPH_BLOCK size, smaller block size of 4K is used. But if quota is less than 4K, it is decided to go with binary use/free of 4K block. For quota size less than 4K size, report the total=used=4K,free=0 when quota is full and total=free=4K,used=0 otherwise. Signed-off-by: Kotresh HR Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 14 ++++++++++++++ fs/ceph/super.h | 1 + 2 files changed, 15 insertions(+) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 620c691af40e..24ae13ea2241 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -494,10 +494,24 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) if (ci->i_max_bytes) { total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; + /* For quota size less than 4MB, use 4KB block size */ + if (!total) { + total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } /* It is possible for a quota to be exceeded. * Report 'zero' in that case */ free = total > used ? total - used : 0; + /* For quota size less than 4KB, report the + * total=used=4KB,free=0 when quota is full + * and total=free=4KB, used=0 otherwise */ + if (!total) { + total = 1; + free = ci->i_max_bytes > ci->i_rbytes ? 1 : 0; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } } spin_unlock(&ci->i_ceph_lock); if (total) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ec6b221e5b62..533189b537b2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -32,6 +32,7 @@ * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */ #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ -- cgit v1.2.3-58-ga151 From 435a120a47eed0b3a1ac7b86cf1f7707bf2242ce Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 27 Nov 2021 17:21:04 +0000 Subject: rbd: make const pointer spaces a static const array Don't populate the const array spaces on the stack but make it static const and make the pointer an array to remove a dereference. Shrinks object code a little too. Also clean up intent, currently it is spaces and should be a tab. Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 909dbe6111bf..1bf1595420a8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6191,7 +6191,7 @@ static inline size_t next_token(const char **buf) * These are the characters that produce nonzero for * isspace() in the "C" and "POSIX" locales. */ - const char *spaces = " \f\n\r\t\v"; + static const char spaces[] = " \f\n\r\t\v"; *buf += strspn(*buf, spaces); /* Find start of token */ -- cgit v1.2.3-58-ga151 From af9ceae83cd26c9319bb2cdab23bb16d39300cbd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 26 Jan 2021 13:41:38 -0500 Subject: ceph: drop send metrics debug message This pops every second and isn't very useful. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/metric.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index c57699d8408d..0fcba68f9a99 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -160,8 +160,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, msg->hdr.version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("client%llu send metrics to mds%d\n", - ceph_client_gid(mdsc->fsc->client), s->s_mds); ceph_con_send(&s->s_con, msg); return true; -- cgit v1.2.3-58-ga151 From 0078ea3b0566e3da09ae8e1e4fbfd708702f2876 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Nov 2021 09:54:49 -0500 Subject: ceph: don't check for quotas on MDS stray dirs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 玮文 胡 reported seeing the WARN_RATELIMIT pop when writing to an inode that had been transplanted into the stray dir. The client was trying to look up the quotarealm info from the parent and that tripped the warning. Change the ceph_vino_is_reserved helper to not throw a warning for MDS stray directories (0x100 - 0x1ff), only for reserved dirs that are not in that range. Also, fix ceph_has_realms_with_quotas to return false when encountering a reserved inode. URL: https://tracker.ceph.com/issues/53180 Reported-by: Hu Weiwen Signed-off-by: Jeff Layton Reviewed-by: Luis Henriques Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 3 +++ fs/ceph/super.h | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 24ae13ea2241..a338a3ec0dc4 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -30,6 +30,9 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode) /* if root is the real CephFS root, we don't have quota realms */ if (root && ceph_ino(root) == CEPH_INO_ROOT) return false; + /* MDS stray dirs have no quota realms */ + if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino)) + return false; /* otherwise, we can't know for sure */ return true; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 533189b537b2..5ec465a1350a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -539,19 +539,23 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) * * These come from src/mds/mdstypes.h in the ceph sources. */ -#define CEPH_MAX_MDS 0x100 -#define CEPH_NUM_STRAY 10 +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 #define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS) #define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) { - if (vino.ino < CEPH_INO_SYSTEM_BASE && - vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { - WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); - return true; - } - return false; + if (vino.ino >= CEPH_INO_SYSTEM_BASE || + vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET) + return false; + + /* Don't warn on mdsdirs */ + WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET, + "Attempt to access reserved inode number 0x%llx", + vino.ino); + return true; } static inline struct inode *ceph_find_inode(struct super_block *sb, -- cgit v1.2.3-58-ga151 From 94cc0877cad0bc6ca84686c4fa874bf530eb8b88 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 30 Nov 2021 14:12:13 -0500 Subject: ceph: add new "nopagecache" option CephFS is a bit unlike most other filesystems in that it only conditionally does buffered I/O based on the caps that it gets from the MDS. In most cases, unless there is contended access for an inode the MDS does give Fbc caps to the client, so the unbuffered codepaths are only infrequently traveled and are difficult to test. At one time, the "-o sync" mount option would give you this behavior, but that was removed in commit 7ab9b3807097 ("ceph: Don't use ceph-sync-mode for synchronous-fs."). Add a new mount option to tell the client to ignore Fbc caps when doing I/O, and to use the synchronous codepaths exclusively, even on non-O_DIRECT file descriptors. We already have an ioctl that forces this behavior on a per-file basis, so we can just always set the CEPH_F_SYNC flag in the file description on such mounts. Additionally, this patch also changes the client to not request Fbc when doing direct I/O. We aren't using the cache with O_DIRECT so we don't have any need for those caps. Signed-off-by: Jeff Layton Acked-by: Greg Farnum Reviewed-by: Venky Shankar Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 24 +++++++++++++++--------- fs/ceph/super.c | 11 +++++++++++ fs/ceph/super.h | 1 + 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c138e8126286..7de5db51c3d0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->vfs_inode)->mount_options; struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, if (!fi) return -ENOMEM; + if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + fi->flags |= CEPH_F_SYNC; + file->private_data = fi; } @@ -1536,7 +1541,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct ceph_inode_info *ci = ceph_inode(inode); bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; - int want, got = 0; + int want = 0, got = 0; int retry_op = 0, read = 0; again: @@ -1551,13 +1556,14 @@ again: else ceph_start_io_read(inode); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_CACHE; + want |= CEPH_CAP_FILE_LAZYIO; + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_read(inode); @@ -1691,7 +1697,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; - int err, want, got; + int err, want = 0, got; bool direct_lock = false; u32 map_flags; u64 pool_flags; @@ -1766,10 +1772,10 @@ retry_snap: dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_BUFFER; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; + want |= CEPH_CAP_FILE_LAZYIO; got = 0; err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2166fc2c1625..eb255d24e321 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -160,6 +160,7 @@ enum { Opt_quotadf, Opt_copyfrom, Opt_wsync, + Opt_pagecache, }; enum ceph_recover_session_mode { @@ -201,6 +202,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), + fsparam_flag_no ("pagecache", Opt_pagecache), {} }; @@ -564,6 +566,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; break; + case Opt_pagecache: + if (result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; + break; default: BUG(); } @@ -699,6 +707,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + seq_puts(m, ",nopagecache"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5ec465a1350a..0d489a973d5e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -46,6 +46,7 @@ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ +#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ -- cgit v1.2.3-58-ga151 From 76bdbc7ac777adb6bc316bfe3f57b3de93c50985 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 6 Jan 2022 09:35:52 +0800 Subject: ceph: remove redundant Lsx caps check The newcaps has already included the Ls, no need to check it again. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c447fa2e2d1f..62448691608f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3375,8 +3375,7 @@ static void handle_cap_grant(struct inode *inode, if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); - if (inode->i_nlink == 0 && - (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + if (inode->i_nlink == 0) deleted_inode = true; } -- cgit v1.2.3-58-ga151 From a0b3a15eab6bc2e90008460b646d53e7d9dcdbbb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Jan 2022 18:28:33 -0500 Subject: ceph: move CEPH_SUPER_MAGIC definition to magic.h The uapi headers are missing the ceph definition. Move it there so userland apps can ID cephfs. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 2 ++ fs/ceph/super.h | 3 --- include/uapi/linux/magic.h | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index eb255d24e321..dbcf9b743c88 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -27,6 +27,8 @@ #include #include +#include + static DEFINE_SPINLOCK(ceph_fsc_lock); static LIST_HEAD(ceph_fsc_list); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0d489a973d5e..dc9fe7ed3fa8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -25,9 +25,6 @@ #include #endif -/* f_type in struct statfs */ -#define CEPH_SUPER_MAGIC 0x00c36400 - /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 35687dcb1a42..53a3c20394cf 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -6,6 +6,7 @@ #define AFFS_SUPER_MAGIC 0xadff #define AFS_SUPER_MAGIC 0x5346414F #define AUTOFS_SUPER_MAGIC 0x0187 +#define CEPH_SUPER_MAGIC 0x00c36400 #define CODA_SUPER_MAGIC 0x73757245 #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */ -- cgit v1.2.3-58-ga151 From 1976b2b31462151403c9fc110204fcc2a77bdfd1 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 12 Jan 2022 10:27:38 -0500 Subject: NFSv4.1 query for fs_location attr on a new file system Query the server for other possible trunkable locations for a given file system on a 4.1+ mount. v2: -- added missing static to nfs4_discover_trunking, reported by the kernel test robot Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 7 +++++ fs/nfs/nfs4_fs.h | 9 +++--- fs/nfs/nfs4proc.c | 76 ++++++++++++++++++++++++++++++++++++++++++------- fs/nfs/nfs4state.c | 3 +- include/linux/nfs_xdr.h | 1 + 5 files changed, 81 insertions(+), 15 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1e4dc1ab9312..f7e39cc4472b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -860,6 +860,13 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str server->namelen = pathinfo.max_namelen; } + if (clp->rpc_ops->discover_trunking != NULL && + (server->caps & NFS_CAP_FS_LOCATIONS)) { + error = clp->rpc_ops->discover_trunking(server, mntfh); + if (error < 0) + return error; + } + return 0; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 89076dd32334..166ea6112e61 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -260,8 +260,8 @@ struct nfs4_state_maintenance_ops { }; struct nfs4_mig_recovery_ops { - int (*get_locations)(struct inode *, struct nfs4_fs_locations *, - struct page *, const struct cred *); + int (*get_locations)(struct nfs_server *, struct nfs_fh *, + struct nfs4_fs_locations *, struct page *, const struct cred *); int (*fsid_present)(struct inode *, const struct cred *); }; @@ -302,8 +302,9 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait); extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *, struct nfs4_fs_locations *, struct page *); -extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *, - struct page *page, const struct cred *); +extern int nfs4_proc_get_locations(struct nfs_server *, struct nfs_fh *, + struct nfs4_fs_locations *, + struct page *page, const struct cred *); extern int nfs4_proc_fsid_present(struct inode *, const struct cred *); extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct dentry *, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7d4c63282793..fc4629d2d2ab 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3935,6 +3935,60 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) return err; } +static int _nfs4_discover_trunking(struct nfs_server *server, + struct nfs_fh *fhandle) +{ + struct nfs4_fs_locations *locations = NULL; + struct page *page; + const struct cred *cred; + struct nfs_client *clp = server->nfs_client; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; + int status = -ENOMEM; + + cred = ops->get_state_renewal_cred(clp); + if (cred == NULL) { + cred = nfs4_get_clid_cred(clp); + if (cred == NULL) + return -ENOKEY; + } + + page = alloc_page(GFP_KERNEL); + locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); + if (page == NULL || locations == NULL) + goto out; + + status = nfs4_proc_get_locations(server, fhandle, locations, page, + cred); + if (status) + goto out; +out: + if (page) + __free_page(page); + kfree(locations); + return status; +} + +static int nfs4_discover_trunking(struct nfs_server *server, + struct nfs_fh *fhandle) +{ + struct nfs4_exception exception = { + .interruptible = true, + }; + struct nfs_client *clp = server->nfs_client; + int err = 0; + + if (!nfs4_has_session(clp)) + goto out; + do { + err = nfs4_handle_exception(server, + _nfs4_discover_trunking(server, fhandle), + &exception); + } while (exception.retry); +out: + return err; +} + static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { @@ -7823,18 +7877,18 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, * appended to this compound to identify the client ID which is * performing recovery. */ -static int _nfs40_proc_get_locations(struct inode *inode, +static int _nfs40_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct rpc_clnt *clnt = server->client; u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { .clientid = server->nfs_client->cl_clientid, - .fh = NFS_FH(inode), + .fh = fhandle, .page = page, .bitmask = bitmask, .migration = 1, /* skip LOOKUP */ @@ -7880,17 +7934,17 @@ static int _nfs40_proc_get_locations(struct inode *inode, * When the client supports GETATTR(fs_locations_info), it can * be plumbed in here. */ -static int _nfs41_proc_get_locations(struct inode *inode, +static int _nfs41_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct rpc_clnt *clnt = server->client; u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, }; struct nfs4_fs_locations_arg args = { - .fh = NFS_FH(inode), + .fh = fhandle, .page = page, .bitmask = bitmask, .migration = 1, /* skip LOOKUP */ @@ -7939,11 +7993,11 @@ static int _nfs41_proc_get_locations(struct inode *inode, * -NFS4ERR_LEASE_MOVED is returned if the server still has leases * from this client that require migration recovery. */ -int nfs4_proc_get_locations(struct inode *inode, +int nfs4_proc_get_locations(struct nfs_server *server, + struct nfs_fh *fhandle, struct nfs4_fs_locations *locations, struct page *page, const struct cred *cred) { - struct nfs_server *server = NFS_SERVER(inode); struct nfs_client *clp = server->nfs_client; const struct nfs4_mig_recovery_ops *ops = clp->cl_mvops->mig_recovery_ops; @@ -7956,10 +8010,11 @@ int nfs4_proc_get_locations(struct inode *inode, (unsigned long long)server->fsid.major, (unsigned long long)server->fsid.minor, clp->cl_hostname); - nfs_display_fhandle(NFS_FH(inode), __func__); + nfs_display_fhandle(fhandle, __func__); do { - status = ops->get_locations(inode, locations, page, cred); + status = ops->get_locations(server, fhandle, locations, page, + cred); if (status != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, status, &exception); @@ -10428,6 +10483,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .free_client = nfs4_free_client, .create_server = nfs4_create_server, .clone_server = nfs_clone_server, + .discover_trunking = nfs4_discover_trunking, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f3265575c28d..499bef9fe118 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2098,7 +2098,8 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } inode = d_inode(server->super->s_root); - result = nfs4_proc_get_locations(inode, locations, page, cred); + result = nfs4_proc_get_locations(server, NFS_FH(inode), locations, + page, cred); if (result) { dprintk("<-- %s: failed to retrieve fs_locations: %d\n", __func__, result); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index ddff92396a3f..728cb0c1f0b6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1797,6 +1797,7 @@ struct nfs_rpc_ops { struct nfs_server *(*create_server)(struct fs_context *); struct nfs_server *(*clone_server)(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); + int (*discover_trunking)(struct nfs_server *, struct nfs_fh *); }; /* -- cgit v1.2.3-58-ga151 From f5b27cc6761e27ee6387a24df1a99ca77b360fea Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:32 -0500 Subject: NFSv4 expose nfs_parse_server_name function Make nfs_parse_server_name available outside of nfs4namespace.c. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 3 ++- fs/nfs/nfs4namespace.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 166ea6112e61..4f998dbeb73f 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -280,7 +280,8 @@ struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, int nfs4_submount(struct fs_context *, struct nfs_server *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); - +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); extern int nfs4_async_handle_error(struct rpc_task *task, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 873342308dc0..f1ed4f60a7f3 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -164,8 +164,8 @@ static int nfs4_validate_fspath(struct dentry *dentry, return 0; } -static size_t nfs_parse_server_name(char *string, size_t len, - struct sockaddr *sa, size_t salen, struct net *net) +size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, + size_t salen, struct net *net) { ssize_t ret; -- cgit v1.2.3-58-ga151 From a8d54baba7c65db2d3278873def61f8d3753d766 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:33 -0500 Subject: NFSv4 handle port presence in fs_location server string An fs_location attribute returns a string that can be ipv4, ipv6, or DNS name. An ip location can have a port appended to it and if no port is present a default port needs to be set. If rpc_pton() fails to parse, try calling rpc_uaddr2socaddr() that can convert an universal address. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4namespace.c | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4f998dbeb73f..84f39b6f1b1e 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -281,7 +281,7 @@ int nfs4_submount(struct fs_context *, struct nfs_server *); int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, - size_t salen, struct net *net); + size_t salen, struct net *net, int port); /* nfs4proc.c */ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); extern int nfs4_async_handle_error(struct rpc_task *task, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index f1ed4f60a7f3..3680c8da510c 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -165,15 +165,20 @@ static int nfs4_validate_fspath(struct dentry *dentry, } size_t nfs_parse_server_name(char *string, size_t len, struct sockaddr *sa, - size_t salen, struct net *net) + size_t salen, struct net *net, int port) { ssize_t ret; ret = rpc_pton(net, string, len, sa, salen); if (ret == 0) { - ret = nfs_dns_resolve_name(net, string, len, sa, salen); - if (ret < 0) - ret = 0; + ret = rpc_uaddr2sockaddr(net, string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(net, string, len, sa, salen); + if (ret < 0) + ret = 0; + } + } else if (port) { + rpc_set_port(sa, port); } return ret; } @@ -328,7 +333,7 @@ static int try_location(struct fs_context *fc, nfs_parse_server_name(buf->data, buf->len, &ctx->nfs_server.address, sizeof(ctx->nfs_server._address), - fc->net_ns); + fc->net_ns, 0); if (ctx->nfs_server.addrlen == 0) continue; @@ -496,7 +501,7 @@ static int nfs4_try_replacing_one_location(struct nfs_server *server, continue; salen = nfs_parse_server_name(buf->data, buf->len, - sap, addr_bufsize, net); + sap, addr_bufsize, net, 0); if (salen == 0) continue; rpc_set_port(sap, NFS_PORT); -- cgit v1.2.3-58-ga151 From b8a09619a56334414cbd7f935a0796240d0cc07e Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:34 -0500 Subject: SUNRPC allow for unspecified transport time in rpc_clnt_add_xprt If the supplied argument doesn't specify the transport type, use the type of the existing rpc clnt and its existing transport. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a312ea2bc440..c83fe618767c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2900,7 +2900,7 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, unsigned long connect_timeout; unsigned long reconnect_timeout; unsigned char resvport, reuseport; - int ret = 0; + int ret = 0, ident; rcu_read_lock(); xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); @@ -2914,8 +2914,11 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, reuseport = xprt->reuseport; connect_timeout = xprt->connect_timeout; reconnect_timeout = xprt->max_reconnect_timeout; + ident = xprt->xprt_class->ident; rcu_read_unlock(); + if (!xprtargs->ident) + xprtargs->ident = ident; xprt = xprt_create_transport(xprtargs); if (IS_ERR(xprt)) { ret = PTR_ERR(xprt); -- cgit v1.2.3-58-ga151 From 4ca9f31a2be66d5fbf34b5b91ef17de7480992e1 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 9 Dec 2021 14:53:35 -0500 Subject: NFSv4.1 test and add 4.1 trunking transport For each location returned in FS_LOCATION query, establish a transport to the server, send EXCHANGE_ID and test for trunking, if successful, add the transport to the exiting client. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fc4629d2d2ab..b18f31b2c9e7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3935,6 +3935,56 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) return err; } +static void test_fs_location_for_trunking(struct nfs4_fs_location *location, + struct nfs_client *clp, + struct nfs_server *server) +{ + int i; + + for (i = 0; i < location->nservers; i++) { + struct nfs4_string *srv_loc = &location->servers[i]; + struct sockaddr addr; + size_t addrlen; + struct xprt_create xprt_args = { + .ident = 0, + .net = clp->cl_net, + }; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; + char *servername = NULL; + + if (!srv_loc->len) + continue; + + addrlen = nfs_parse_server_name(srv_loc->data, srv_loc->len, + &addr, sizeof(addr), + clp->cl_net, server->port); + if (!addrlen) + return; + xprt_args.dstaddr = &addr; + xprt_args.addrlen = addrlen; + servername = kmalloc(srv_loc->len + 1, GFP_KERNEL); + if (!servername) + return; + memcpy(servername, srv_loc->data, srv_loc->len); + servername[srv_loc->len] = '\0'; + xprt_args.servername = servername; + + xprtdata.cred = nfs4_get_clid_cred(clp); + rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, + rpc_clnt_setup_test_and_add_xprt, + &rpcdata); + if (xprtdata.cred) + put_cred(xprtdata.cred); + kfree(servername); + } +} + static int _nfs4_discover_trunking(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -3944,7 +3994,7 @@ static int _nfs4_discover_trunking(struct nfs_server *server, struct nfs_client *clp = server->nfs_client; const struct nfs4_state_maintenance_ops *ops = clp->cl_mvops->state_renewal_ops; - int status = -ENOMEM; + int status = -ENOMEM, i; cred = ops->get_state_renewal_cred(clp); if (cred == NULL) { @@ -3962,6 +4012,10 @@ static int _nfs4_discover_trunking(struct nfs_server *server, cred); if (status) goto out; + + for (i = 0; i < locations->nlocations; i++) + test_fs_location_for_trunking(&locations->locations[i], clp, + server); out: if (page) __free_page(page); -- cgit v1.2.3-58-ga151 From 776d794f28c95051bc70405a7b1fa40115658a18 Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Thu, 9 Sep 2021 12:32:38 +0800 Subject: net/sunrpc: fix reference count leaks in rpc_sysfs_xprt_state_change The refcount leak issues take place in an error handling path. When the 3rd argument buf doesn't match with "offline", "online" or "remove", the function simply returns -EINVAL and forgets to decrease the reference count of a rpc_xprt object and a rpc_xprt_switch object increased by rpc_sysfs_xprt_kobj_get_xprt() and rpc_sysfs_xprt_kobj_get_xprt_switch(), causing reference count leaks of both unused objects. Fix this issue by jumping to the error handling path labelled with out_put when buf matches none of "offline", "online" or "remove". Signed-off-by: Xiyu Yang Signed-off-by: Xin Xiong Signed-off-by: Xin Tan Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index b1aea3419218..49fd0339dc59 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -295,8 +295,10 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, online = 1; else if (!strncmp(buf, "remove", 6)) remove = 1; - else - return -EINVAL; + else { + count = -EINVAL; + goto out_put; + } if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { count = -EINTR; -- cgit v1.2.3-58-ga151 From 1a48db3fef499f615b56093947ec4b0d3d8e3021 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Mon, 15 Nov 2021 11:54:25 -0500 Subject: sunrpc: Fix potential race conditions in rpc_sysfs_xprt_state_change() We need to use test_and_set_bit() when changing xprt state flags to avoid potentially getting xps->xps_nactive out of sync. Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 49fd0339dc59..b64a0286b182 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -309,25 +309,28 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, goto release_tasks; } if (offline) { - set_bit(XPRT_OFFLINE, &xprt->state); - spin_lock(&xps->xps_lock); - xps->xps_nactive--; - spin_unlock(&xps->xps_lock); + if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) { + spin_lock(&xps->xps_lock); + xps->xps_nactive--; + spin_unlock(&xps->xps_lock); + } } else if (online) { - clear_bit(XPRT_OFFLINE, &xprt->state); - spin_lock(&xps->xps_lock); - xps->xps_nactive++; - spin_unlock(&xps->xps_lock); + if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) { + spin_lock(&xps->xps_lock); + xps->xps_nactive++; + spin_unlock(&xps->xps_lock); + } } else if (remove) { if (test_bit(XPRT_OFFLINE, &xprt->state)) { - set_bit(XPRT_REMOVE, &xprt->state); - xprt_force_disconnect(xprt); - if (test_bit(XPRT_CONNECTED, &xprt->state)) { - if (!xprt->sending.qlen && - !xprt->pending.qlen && - !xprt->backlog.qlen && - !atomic_long_read(&xprt->queuelen)) - rpc_xprt_switch_remove_xprt(xps, xprt); + if (!test_and_set_bit(XPRT_REMOVE, &xprt->state)) { + xprt_force_disconnect(xprt); + if (test_bit(XPRT_CONNECTED, &xprt->state)) { + if (!xprt->sending.qlen && + !xprt->pending.qlen && + !xprt->backlog.qlen && + !atomic_long_read(&xprt->queuelen)) + rpc_xprt_switch_remove_xprt(xps, xprt); + } } } else { count = -EINVAL; -- cgit v1.2.3-58-ga151 From c84b8a3fef663933007e885535591b9d30bdc860 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 14 Jan 2022 00:20:05 +0800 Subject: io_uring: Remove unused function req_ref_put Fix the following clang warnings: fs/io_uring.c:1195:20: warning: unused function 'req_ref_put' [-Wunused-function]. Fixes: aa43477b0402 ("io_uring: poll rework") Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20220113162005.3011-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index de9c9de90655..fa3277844d2e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1192,12 +1192,6 @@ static inline bool req_ref_put_and_test(struct io_kiocb *req) return atomic_dec_and_test(&req->refs); } -static inline void req_ref_put(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - WARN_ON_ONCE(req_ref_put_and_test(req)); -} - static inline void req_ref_get(struct io_kiocb *req) { WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); -- cgit v1.2.3-58-ga151 From 180dccb0dba4f5e84a4a70c1be1d34cbb6528b32 Mon Sep 17 00:00:00 2001 From: Laibin Qiu Date: Thu, 13 Jan 2022 10:55:36 +0800 Subject: blk-mq: fix tag_get wait task can't be awakened In case of shared tags, there might be more than one hctx which allocates from the same tags, and each hctx is limited to allocate at most: hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U); tag idle detection is lazy, and may be delayed for 30sec, so there could be just one real active hctx(queue) but all others are actually idle and still accounted as active because of the lazy idle detection. Then if wake_batch is > hctx_max_depth, driver tag allocation may wait forever on this real active hctx. Fix this by recalculating wake_batch when inc or dec active_queues. Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps") Suggested-by: Ming Lei Suggested-by: John Garry Signed-off-by: Laibin Qiu Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20220113025536.1479653-1-qiulaibin@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 40 +++++++++++++++++++++++++++++++++------- include/linux/sbitmap.h | 11 +++++++++++ lib/sbitmap.c | 25 ++++++++++++++++++++++--- 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index e55a6834c9a6..845f74e8dd7b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -16,6 +16,21 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" +/* + * Recalculate wakeup batch when tag is shared by hctx. + */ +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, + unsigned int users) +{ + if (!users) + return; + + sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, + users); + sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, + users); +} + /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail @@ -24,18 +39,26 @@ */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { + unsigned int users; + if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && - !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) - atomic_inc(&hctx->tags->active_queues); + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) { + return true; + } } else { - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - atomic_inc(&hctx->tags->active_queues); + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || + test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) { + return true; + } } + users = atomic_inc_return(&hctx->tags->active_queues); + + blk_mq_update_wake_batch(hctx->tags, users); + return true; } @@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; + unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; @@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) return; } - atomic_dec(&tags->active_queues); + users = atomic_dec_return(&tags->active_queues); + + blk_mq_update_wake_batch(tags, users); blk_mq_tag_wakeup_all(tags, false); } diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index fc0357a6e19b..95df357ec009 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) sbitmap_free(&sbq->sb); } +/** + * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch + * @sbq: Bitmap queue to recalculate wake batch. + * @users: Number of shares. + * + * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch + * by depth. This interface is for HCTX shared tags or queue shared tags. + */ +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, + unsigned int users); + /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 2709ab825499..6220fa67fb7e 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, } EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); -static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, - unsigned int depth) +static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, + unsigned int wake_batch) { - unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth); int i; if (sbq->wake_batch != wake_batch) { @@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, } } +static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq, + unsigned int depth) +{ + unsigned int wake_batch; + + wake_batch = sbq_calc_wake_batch(sbq, depth); + __sbitmap_queue_update_wake_batch(sbq, wake_batch); +} + +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq, + unsigned int users) +{ + unsigned int wake_batch; + + wake_batch = clamp_val((sbq->sb.depth + users - 1) / + users, 4, SBQ_WAKE_BATCH); + __sbitmap_queue_update_wake_batch(sbq, wake_batch); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch); + void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) { sbitmap_queue_update_wake_batch(sbq, depth); -- cgit v1.2.3-58-ga151 From 413ec8057bc3d368574abd05dd27e747063b2f59 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 13 Jan 2022 00:14:32 +0000 Subject: loop: remove redundant initialization of pointer node The pointer node is being initialized with a value that is never read, it is being re-assigned the same value a little futher on. Remove the redundant initialization. Cleans up clang scan warning: drivers/block/loop.c:823:19: warning: Value stored to 'node' during its initialization is never read [deadcode.DeadStores] Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20220113001432.1331871-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index b1b05c45c07c..01cbbfc4e9e2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -820,7 +820,7 @@ static inline int queue_on_root_worker(struct cgroup_subsys_state *css) static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd) { - struct rb_node **node = &(lo->worker_tree.rb_node), *parent = NULL; + struct rb_node **node, *parent = NULL; struct loop_worker *cur_worker, *worker = NULL; struct work_struct *work; struct list_head *cmd_list; -- cgit v1.2.3-58-ga151 From a6431e351c6ec5bb6800787d259b343088f369a3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 13 Jan 2022 00:05:45 +0000 Subject: aoe: remove redundant assignment on variable n The variable n is being bit-wise or'd with a value and reassigned before being returned. The update of n is redundant, replace the |= operator with | instead. Cleans up clang scan warning: drivers/block/aoe/aoecmd.c:125:9: warning: Although the value stored to 'n' is used in the enclosing expression, the value is never actually read from 'n' [deadcode.DeadStores] Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20220113000545.1307091-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- drivers/block/aoe/aoecmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 588889bea7c3..6af111f568e4 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -122,7 +122,7 @@ newtag(struct aoedev *d) register ulong n; n = jiffies & 0xffff; - return n |= (++d->lasttag & 0x7fff) << 16; + return n | (++d->lasttag & 0x7fff) << 16; } static u32 -- cgit v1.2.3-58-ga151 From 49a8f2bc8d88702783c7e163ec84374e9a022f71 Mon Sep 17 00:00:00 2001 From: Robert Hancock Date: Wed, 12 Jan 2022 14:38:16 -0600 Subject: clk: si5341: Fix clock HW provider cleanup The call to of_clk_add_hw_provider was not undone on remove or on probe failure, which could cause an oops on a subsequent attempt to retrieve clocks for the removed device. Switch to the devm version of the function to avoid this issue. Fixes: 3044a860fd09 ("clk: Add Si5341/Si5340 driver") Signed-off-by: Robert Hancock Link: https://lore.kernel.org/r/20220112203816.1784610-1-robert.hancock@calian.com Signed-off-by: Stephen Boyd --- drivers/clk/clk-si5341.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/clk-si5341.c b/drivers/clk/clk-si5341.c index 57ae183982d8..f7b41366666e 100644 --- a/drivers/clk/clk-si5341.c +++ b/drivers/clk/clk-si5341.c @@ -1740,7 +1740,7 @@ static int si5341_probe(struct i2c_client *client, clk_prepare(data->clk[i].hw.clk); } - err = of_clk_add_hw_provider(client->dev.of_node, of_clk_si5341_get, + err = devm_of_clk_add_hw_provider(&client->dev, of_clk_si5341_get, data); if (err) { dev_err(&client->dev, "unable to add clk provider\n"); -- cgit v1.2.3-58-ga151 From 818d9150f2b22a0053bf568fa11ad3be804ce5c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jan 2022 10:25:29 +0300 Subject: clk: visconti: Fix uninitialized variable in printk The "pll_clck" variable is uninitialized. The "ret" error code was supposed to be printed instead. Fixes: b4cbe606dc36 ("clk: visconti: Add support common clock driver and reset driver") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/20220111072529.GJ11243@kili Signed-off-by: Stephen Boyd --- drivers/clk/visconti/pll.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/clk/visconti/pll.c b/drivers/clk/visconti/pll.c index a2398bc6c6e4..a484cb945d67 100644 --- a/drivers/clk/visconti/pll.c +++ b/drivers/clk/visconti/pll.c @@ -246,7 +246,6 @@ static struct clk_hw *visconti_register_pll(struct visconti_pll_provider *ctx, { struct clk_init_data init; struct visconti_pll *pll; - struct clk *pll_clk; struct clk_hw *pll_hw_clk; size_t len; int ret; @@ -277,7 +276,7 @@ static struct clk_hw *visconti_register_pll(struct visconti_pll_provider *ctx, pll_hw_clk = &pll->hw; ret = clk_hw_register(NULL, &pll->hw); if (ret) { - pr_err("failed to register pll clock %s : %ld\n", name, PTR_ERR(pll_clk)); + pr_err("failed to register pll clock %s : %d\n", name, ret); kfree(pll); pll_hw_clk = ERR_PTR(ret); } -- cgit v1.2.3-58-ga151 From 6840f9094f2bd788a316d8cb0a4e42538d3e47dd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jan 2022 16:44:19 -0500 Subject: pagevec: Initialise folio_batch->percpu_pvec_drained When UBSAN is enabled, it reports an invalid value in __pagevec_release() when accessing pvec->percpu_pvec_drained, which is simply whatever garbage was on the stack. Initialise it when initialising the rest of the folio_batch. Fixes: 10331795fb79 ("pagevec: Add folio_batch") Reported-by: Randy Dunlap Tested-by: Randy Dunlap Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/pagevec.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index dda8d5868c81..67b1246f136b 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -111,6 +111,7 @@ static_assert(offsetof(struct pagevec, pages) == static inline void folio_batch_init(struct folio_batch *fbatch) { fbatch->nr = 0; + fbatch->percpu_pvec_drained = false; } static inline unsigned int folio_batch_count(struct folio_batch *fbatch) -- cgit v1.2.3-58-ga151 From 3fe6acd4dc922237b30e55473c9349c6ce0690f3 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 7 Jan 2022 12:09:36 -0800 Subject: HID: vivaldi: fix handling devices not using numbered reports Unfortunately details of USB HID transport bled into HID core and handling of numbered/unnumbered reports is quite a mess, with hid_report_len() calculating the length according to USB rules, and hid_hw_raw_request() adding report ID to the buffer for both numbered and unnumbered reports. Untangling it all requres a lot of changes in HID, so for now let's handle this in the driver. [jkosina@suse.cz: microoptimize field->report->id to report->id] Fixes: 14c9c014babe ("HID: add vivaldi HID driver") Signed-off-by: Dmitry Torokhov Tested-by: Stephen Boyd # CoachZ Signed-off-by: Jiri Kosina --- drivers/hid/hid-vivaldi.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/hid/hid-vivaldi.c b/drivers/hid/hid-vivaldi.c index 72957a9f7117..576518e704ee 100644 --- a/drivers/hid/hid-vivaldi.c +++ b/drivers/hid/hid-vivaldi.c @@ -74,10 +74,11 @@ static void vivaldi_feature_mapping(struct hid_device *hdev, struct hid_usage *usage) { struct vivaldi_data *drvdata = hid_get_drvdata(hdev); + struct hid_report *report = field->report; int fn_key; int ret; u32 report_len; - u8 *buf; + u8 *report_data, *buf; if (field->logical != HID_USAGE_FN_ROW_PHYSMAP || (usage->hid & HID_USAGE_PAGE) != HID_UP_ORDINAL) @@ -89,12 +90,24 @@ static void vivaldi_feature_mapping(struct hid_device *hdev, if (fn_key > drvdata->max_function_row_key) drvdata->max_function_row_key = fn_key; - buf = hid_alloc_report_buf(field->report, GFP_KERNEL); - if (!buf) + report_data = buf = hid_alloc_report_buf(report, GFP_KERNEL); + if (!report_data) return; - report_len = hid_report_len(field->report); - ret = hid_hw_raw_request(hdev, field->report->id, buf, + report_len = hid_report_len(report); + if (!report->id) { + /* + * hid_hw_raw_request() will stuff report ID (which will be 0) + * into the first byte of the buffer even for unnumbered + * reports, so we need to account for this to avoid getting + * -EOVERFLOW in return. + * Note that hid_alloc_report_buf() adds 7 bytes to the size + * so we can safely say that we have space for an extra byte. + */ + report_len++; + } + + ret = hid_hw_raw_request(hdev, report->id, report_data, report_len, HID_FEATURE_REPORT, HID_REQ_GET_REPORT); if (ret < 0) { @@ -103,7 +116,16 @@ static void vivaldi_feature_mapping(struct hid_device *hdev, goto out; } - ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, buf, + if (!report->id) { + /* + * Undo the damage from hid_hw_raw_request() for unnumbered + * reports. + */ + report_data++; + report_len--; + } + + ret = hid_report_raw_event(hdev, HID_FEATURE_REPORT, report_data, report_len, 0); if (ret) { dev_warn(&hdev->dev, "failed to report feature %d\n", -- cgit v1.2.3-58-ga151 From e24aeff6db738be7ce24999a41e91299b5fe14be Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Fri, 7 Jan 2022 15:23:05 -0800 Subject: HID: vivaldi: Minor cleanups Perform some minor cleanups on this driver. Include header files for struct definitions that are used, drop a forward declaration that isn't useful, and mark a sysfs attribute static as it isn't used outside this file. Cc: Sean O'Brien Cc: Ting Shen Signed-off-by: Stephen Boyd Signed-off-by: Jiri Kosina --- drivers/hid/hid-vivaldi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-vivaldi.c b/drivers/hid/hid-vivaldi.c index 576518e704ee..efa6140915f4 100644 --- a/drivers/hid/hid-vivaldi.c +++ b/drivers/hid/hid-vivaldi.c @@ -6,16 +6,17 @@ * Author: Sean O'Brien */ +#include #include +#include #include +#include #define MIN_FN_ROW_KEY 1 #define MAX_FN_ROW_KEY 24 #define HID_VD_FN_ROW_PHYSMAP 0x00000001 #define HID_USAGE_FN_ROW_PHYSMAP (HID_UP_GOOGLEVENDOR | HID_VD_FN_ROW_PHYSMAP) -static struct hid_driver hid_vivaldi; - struct vivaldi_data { u32 function_row_physmap[MAX_FN_ROW_KEY - MIN_FN_ROW_KEY + 1]; int max_function_row_key; @@ -40,7 +41,7 @@ static ssize_t function_row_physmap_show(struct device *dev, return size; } -DEVICE_ATTR_RO(function_row_physmap); +static DEVICE_ATTR_RO(function_row_physmap); static struct attribute *sysfs_attrs[] = { &dev_attr_function_row_physmap.attr, NULL -- cgit v1.2.3-58-ga151 From 791f3465c4afde02d7f16cf7424ca87070b69396 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 Jan 2022 11:59:10 +0000 Subject: io_uring: fix UAF due to missing POLLFREE handling Fixes a problem described in 50252e4b5e989 ("aio: fix use-after-free due to missing POLLFREE handling") and copies the approach used there. In short, we have to forcibly eject a poll entry when we meet POLLFREE. We can't rely on io_poll_get_ownership() as can't wait for potentially running tw handlers, so we use the fact that wqs are RCU freed. See Eric's patch and comments for more details. Reported-by: Eric Biggers Link: https://lore.kernel.org/r/20211209010455.42744-6-ebiggers@kernel.org Reported-and-tested-by: syzbot+5426c7ed6868c705ca14@syzkaller.appspotmail.com Fixes: 221c5eb233823 ("io_uring: add support for IORING_OP_POLL") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4ed56b6f548f7ea337603a82315750449412748a.1642161259.git.asml.silence@gmail.com [axboe: drop non-functional change from patch] Signed-off-by: Jens Axboe --- fs/io_uring.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index fa3277844d2e..422d6de48688 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5462,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, static inline void io_poll_remove_entry(struct io_poll_iocb *poll) { - struct wait_queue_head *head = poll->head; + struct wait_queue_head *head = smp_load_acquire(&poll->head); - spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - poll->head = NULL; - spin_unlock_irq(&head->lock); + if (head) { + spin_lock_irq(&head->lock); + list_del_init(&poll->wait.entry); + poll->head = NULL; + spin_unlock_irq(&head->lock); + } } static void io_poll_remove_entries(struct io_kiocb *req) @@ -5475,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req) struct io_poll_iocb *poll = io_poll_get_single(req); struct io_poll_iocb *poll_double = io_poll_get_double(req); - if (poll->head) - io_poll_remove_entry(poll); - if (poll_double && poll_double->head) + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us. However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + io_poll_remove_entry(poll); + if (poll_double) io_poll_remove_entry(poll_double); + rcu_read_unlock(); } /* @@ -5618,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, wait); __poll_t mask = key_to_poll(key); + if (unlikely(mask & POLLFREE)) { + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0); + + /* + * If the waitqueue is being freed early but someone is already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request. + */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&poll->head, NULL); + return 1; + } + /* for instances that support it check for an event match first */ if (mask && !(mask & poll->events)) return 0; -- cgit v1.2.3-58-ga151 From c03061e7a210b4fe37440a1940fc198744a55ca4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 13 Jan 2022 12:20:23 -0500 Subject: xprtrdma: Remove final dprintk call sites from xprtrdma Deprecated. This information is available via tracepoints. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3d3673ba9e1e..1d6e85fe3133 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -274,8 +274,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->re_connect_status = -ENETUNREACH; goto wake_connect_worker; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %pISpc rejected: %s\n", - sap, rdma_reject_msg(id, event->status)); ep->re_connect_status = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) ep->re_connect_status = -ENOTCONN; @@ -291,8 +289,6 @@ disconnected: break; } - dprintk("RPC: %s: %pISpc on %s/frwr: %s\n", __func__, sap, - ep->re_id->device->name, rdma_event_msg(event->event)); return 0; } @@ -419,14 +415,6 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) ep->re_attr.qp_type = IB_QPT_RC; ep->re_attr.port_num = ~0; - dprintk("RPC: %s: requested max: dtos: send %d recv %d; " - "iovs: send %d recv %d\n", - __func__, - ep->re_attr.cap.max_send_wr, - ep->re_attr.cap.max_recv_wr, - ep->re_attr.cap.max_send_sge, - ep->re_attr.cap.max_recv_sge); - ep->re_send_batch = ep->re_max_requests >> 3; ep->re_send_count = ep->re_send_batch; init_waitqueue_head(&ep->re_connect_wait); -- cgit v1.2.3-58-ga151 From c0f26167ddcf94fb94e80fcb20aaac7f7db13c1a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 13 Jan 2022 12:20:30 -0500 Subject: xprtrdma: Remove definitions of RPCDBG_FACILITY Deprecated. dprintk is no longer used in xprtrdma. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 4 ---- net/sunrpc/xprtrdma/frwr_ops.c | 4 ---- net/sunrpc/xprtrdma/rpc_rdma.c | 4 ---- net/sunrpc/xprtrdma/transport.c | 4 ---- net/sunrpc/xprtrdma/verbs.c | 11 ----------- 5 files changed, 27 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 17f174d6ea3b..faba7136dd9a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -13,10 +13,6 @@ #include "xprt_rdma.h" #include -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - #undef RPCRDMA_BACKCHANNEL_DEBUG /** diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ff699307e820..515dd7a66a04 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -45,10 +45,6 @@ #include "xprt_rdma.h" #include -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - static void frwr_cid_init(struct rpcrdma_ep *ep, struct rpcrdma_mr *mr) { diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 8035a983c8ce..281ddb87ac8d 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -54,10 +54,6 @@ #include "xprt_rdma.h" #include -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - /* Returns size of largest RPC-over-RDMA header in a Call message * * The largest Call header contains a full-size Read list and a diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 16e5696314a4..42e375dbdadb 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -60,10 +60,6 @@ #include "xprt_rdma.h" #include -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - /* * tunables */ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1d6e85fe3133..f172d1298013 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -63,17 +63,6 @@ #include "xprt_rdma.h" #include -/* - * Globals/Macros - */ - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - -/* - * internal functions - */ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, -- cgit v1.2.3-58-ga151 From aed28b7a2d620cb5cd0c554cb889075c02e25e8e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 13 Jan 2022 12:20:36 -0500 Subject: SUNRPC: Don't dereference xprt->snd_task if it's a cookie Fixes: e26d9972720e ("SUNRPC: Clean up scheduling of autoclose") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index a6af347c935d..7cdcbc6dc38e 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -965,7 +965,8 @@ TRACE_EVENT(rpc_socket_nospace, { BIT(XPRT_REMOVE), "REMOVE" }, \ { BIT(XPRT_CONGESTED), "CONGESTED" }, \ { BIT(XPRT_CWND_WAIT), "CWND_WAIT" }, \ - { BIT(XPRT_WRITE_SPACE), "WRITE_SPACE" }) + { BIT(XPRT_WRITE_SPACE), "WRITE_SPACE" }, \ + { BIT(XPRT_SND_IS_COOKIE), "SND_IS_COOKIE" }) DECLARE_EVENT_CLASS(rpc_xprt_lifetime_class, TP_PROTO( @@ -1162,8 +1163,11 @@ DECLARE_EVENT_CLASS(xprt_writelock_event, __entry->task_id = -1; __entry->client_id = -1; } - __entry->snd_task_id = xprt->snd_task ? - xprt->snd_task->tk_pid : -1; + if (xprt->snd_task && + !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) + __entry->snd_task_id = xprt->snd_task->tk_pid; + else + __entry->snd_task_id = -1; ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER @@ -1208,8 +1212,12 @@ DECLARE_EVENT_CLASS(xprt_cong_event, __entry->task_id = -1; __entry->client_id = -1; } - __entry->snd_task_id = xprt->snd_task ? - xprt->snd_task->tk_pid : -1; + if (xprt->snd_task && + !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) + __entry->snd_task_id = xprt->snd_task->tk_pid; + else + __entry->snd_task_id = -1; + __entry->cong = xprt->cong; __entry->cwnd = xprt->cwnd; __entry->wait = test_bit(XPRT_CWND_WAIT, &xprt->state); -- cgit v1.2.3-58-ga151 From 91502a9a0b0d5252cf3f32ebd898823c2f5aadab Mon Sep 17 00:00:00 2001 From: Alexander Sergeyev Date: Fri, 14 Jan 2022 19:50:50 +0300 Subject: ALSA: hda/realtek: fix speakers and micmute on HP 855 G8 There are several PCI ids associated with HP EliteBook 855 G8 Notebook PC. Commit 0e68c4b11f1e6 ("ALSA: hda/realtek: fix mute/micmute LEDs for HP 855 G8") covers 0x103c:0x8896, while this commit covers 0x103c:0x8895 which needs some additional work on top of the quirk from 0e68c4b11f1e6. Note that the device can boot up with working speakers and micmute LED without this patch, but the success rate would be quite low (order of 16 working boots across 709 boots) at least for the built-in drivers scenario. This also means that there are some timing issues during early boot and this patch is a workaround. With this patch applied speakers and headphones are consistenly working, as well as mute/micmute LEDs and the internal microphone. Signed-off-by: Alexander Sergeyev Link: https://lore.kernel.org/r/20220114165050.ouw2nknuspclynro@localhost.localdomain Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index eef973661b0a..668274e52674 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6948,6 +6948,7 @@ enum { ALC285_FIXUP_LEGION_Y9000X_AUTOMUTE, ALC287_FIXUP_LEGION_16ACHG6, ALC287_FIXUP_CS35L41_I2C_2, + ALC285_FIXUP_HP_SPEAKERS_MICMUTE_LED, }; static const struct hda_fixup alc269_fixups[] = { @@ -8698,6 +8699,16 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = cs35l41_fixup_i2c_two, }, + [ALC285_FIXUP_HP_SPEAKERS_MICMUTE_LED] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + { 0x20, AC_VERB_SET_COEF_INDEX, 0x19 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x8e11 }, + { } + }, + .chained = true, + .chain_id = ALC285_FIXUP_HP_MUTE_LED, + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -8911,6 +8922,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8870, "HP ZBook Fury 15.6 Inch G8 Mobile Workstation PC", ALC285_FIXUP_HP_GPIO_AMP_INIT), SND_PCI_QUIRK(0x103c, 0x8873, "HP ZBook Studio 15.6 Inch G8 Mobile Workstation PC", ALC285_FIXUP_HP_GPIO_AMP_INIT), SND_PCI_QUIRK(0x103c, 0x888d, "HP ZBook Power 15.6 inch G8 Mobile Workstation PC", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8895, "HP EliteBook 855 G8 Notebook PC", ALC285_FIXUP_HP_SPEAKERS_MICMUTE_LED), SND_PCI_QUIRK(0x103c, 0x8896, "HP EliteBook 855 G8 Notebook PC", ALC285_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x8898, "HP EliteBook 845 G8 Notebook PC", ALC285_FIXUP_HP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x103c, 0x88d0, "HP Pavilion 15-eh1xxx (mainboard 88D0)", ALC287_FIXUP_HP_GPIO_LED), -- cgit v1.2.3-58-ga151 From 4175c32be5ef0ff254d6931931ec412e8029c32a Mon Sep 17 00:00:00 2001 From: CHANDAN VURDIGERE NATARAJ Date: Tue, 11 Jan 2022 19:02:26 +0530 Subject: drm/amdgpu: Enable recovery on yellow carp Add yellow carp to devices which support recovery Signed-off-by: CHANDAN VURDIGERE NATARAJ Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index cf7fad88c138..cde34129e23a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4474,6 +4474,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_BEIGE_GOBY: case CHIP_VANGOGH: case CHIP_ALDEBARAN: + case CHIP_YELLOW_CARP: break; default: goto disabled; -- cgit v1.2.3-58-ga151 From 0ffb1fd1582a78649f22253d81515997fff88bc4 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 11 Jan 2022 17:41:44 -0500 Subject: drm/amdgpu: invert the logic in amdgpu_device_should_recover_gpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rather than opting into GPU recovery support, default to on, and opt out if it's not working on a particular GPU. This avoids the need to add new asics to this list since this is a core feature. Reviewed-by: Evan Quan Reviewed-by: Guchun Chen Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 ++++++++++++------------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index cde34129e23a..c4f3c886be55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4450,34 +4450,24 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) if (amdgpu_gpu_recovery == -1) { switch (adev->asic_type) { - case CHIP_BONAIRE: - case CHIP_HAWAII: - case CHIP_TOPAZ: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - case CHIP_VEGAM: - case CHIP_VEGA20: - case CHIP_VEGA10: - case CHIP_VEGA12: - case CHIP_RAVEN: - case CHIP_ARCTURUS: - case CHIP_RENOIR: - case CHIP_NAVI10: - case CHIP_NAVI14: - case CHIP_NAVI12: - case CHIP_SIENNA_CICHLID: - case CHIP_NAVY_FLOUNDER: - case CHIP_DIMGREY_CAVEFISH: - case CHIP_BEIGE_GOBY: - case CHIP_VANGOGH: - case CHIP_ALDEBARAN: - case CHIP_YELLOW_CARP: - break; - default: +#ifdef CONFIG_DRM_AMDGPU_SI + case CHIP_VERDE: + case CHIP_TAHITI: + case CHIP_PITCAIRN: + case CHIP_OLAND: + case CHIP_HAINAN: +#endif +#ifdef CONFIG_DRM_AMDGPU_CIK + case CHIP_KAVERI: + case CHIP_KABINI: + case CHIP_MULLINS: +#endif + case CHIP_CARRIZO: + case CHIP_STONEY: + case CHIP_CYAN_SKILLFISH: goto disabled; + default: + break; } } -- cgit v1.2.3-58-ga151 From e8309d50e97851ff135c4e33325d37b032666b94 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 12 Jan 2022 22:38:51 -0500 Subject: drm/amdgpu: don't do resets on APUs which don't support it It can cause a hang. This is normally not enabled for GPU hangs on these asics, but was recently enabled for handling aborted suspends. This causes hangs on some platforms on suspend. Fixes: daf8de0874ab5b ("drm/amdgpu: always reset the asic in suspend (v2)") Cc: stable@vger.kernel.org Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1858 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/cik.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/vi.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c b/drivers/gpu/drm/amd/amdgpu/cik.c index 54f28c075f21..f10ce740a29c 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik.c +++ b/drivers/gpu/drm/amd/amdgpu/cik.c @@ -1428,6 +1428,10 @@ static int cik_asic_reset(struct amdgpu_device *adev) { int r; + /* APUs don't have full asic reset */ + if (adev->flags & AMD_IS_APU) + return 0; + if (cik_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { dev_info(adev->dev, "BACO reset\n"); r = amdgpu_dpm_baco_reset(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c index fe9a7cc8d9eb..6645ebbd2696 100644 --- a/drivers/gpu/drm/amd/amdgpu/vi.c +++ b/drivers/gpu/drm/amd/amdgpu/vi.c @@ -956,6 +956,10 @@ static int vi_asic_reset(struct amdgpu_device *adev) { int r; + /* APUs don't have full asic reset */ + if (adev->flags & AMD_IS_APU) + return 0; + if (vi_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { dev_info(adev->dev, "BACO reset\n"); r = amdgpu_dpm_baco_reset(adev); -- cgit v1.2.3-58-ga151 From 3993a799fc971bc9b918bd969aa55864447b5dde Mon Sep 17 00:00:00 2001 From: Lukas Fink Date: Fri, 14 Jan 2022 07:51:41 +0100 Subject: drm/amdgpu: Fix rejecting Tahiti GPUs eb4fd29afd4a ("drm/amdgpu: bind to any 0x1002 PCI diplay class device") added generic bindings to amdgpu so that that it binds to all display class devices with VID 0x1002 and then rejects those in amdgpu_pci_probe. Unfortunately it reuses a driver_data value of 0 to detect those new bindings, which is already used to denote CHIP_TAHITI ASICs. The driver_data value given to those new bindings was changed in dd0761fd24ea1 ("drm/amdgpu: set CHIP_IP_DISCOVERY as the asic type by default") to CHIP_IP_DISCOVERY (=36), but it seems that the check in amdgpu_pci_probe was forgotten to be changed. Therefore, it still rejects Tahiti GPUs. Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1860 Fixes: eb4fd29afd4a ("drm/amdgpu: bind to any 0x1002 PCI diplay class device") Cc: stable@vger.kernel.org Signed-off-by: Lukas Fink Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 366e475056bd..625e00131e27 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1907,7 +1907,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, return -ENODEV; } - if (flags == 0) { + if (flags == CHIP_IP_DISCOVERY) { DRM_INFO("Unsupported asic. Remove me when IP discovery init is in place.\n"); return -ENODEV; } -- cgit v1.2.3-58-ga151 From d82ce3cd30aa28db3e94ffc36ebf0af2ff12801d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 14 Jan 2022 09:59:29 -0500 Subject: drm/amdgpu: drop flags check for CHIP_IP_DISCOVERY Support for IP based discovery is in place now so this check is no longer required. Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 625e00131e27..2a4bb032a7e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1907,11 +1907,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, return -ENODEV; } - if (flags == CHIP_IP_DISCOVERY) { - DRM_INFO("Unsupported asic. Remove me when IP discovery init is in place.\n"); - return -ENODEV; - } - if (amdgpu_virtual_display || amdgpu_device_asic_has_dc_support(flags & AMD_ASIC_MASK)) supports_atomic = true; -- cgit v1.2.3-58-ga151 From c4849f88164b13dd141885e28210f599741b304b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 7 Jan 2022 10:44:17 -0600 Subject: drm/amd/display: Revert W/A for hard hangs on DCN20/DCN21 The WA from commit 2a50edbf10c8 ("drm/amd/display: Apply w/a for hard hang on HPD") and commit 1bd3bc745e7f ("drm/amd/display: Extend w/a for hard hang on HPD to dcn20") causes a regression in s0ix where the system will fail to resume properly on many laptops. Pull the workarounds out to avoid that s0ix regression in the common case. This HPD hang happens with an external device in special circumstances and a new W/A will need to be developed for this in the future. Cc: stable@vger.kernel.org Cc: Qingqing Zhuo Reported-by: Scott Bruce Reported-by: Chris Hixon Reported-by: spasswolf@web.de Link: https://bugzilla.kernel.org/show_bug.cgi?id=215436 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1821 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1852 Fixes: 2a50edbf10c8 ("drm/amd/display: Apply w/a for hard hang on HPD") Fixes: 1bd3bc745e7f ("drm/amd/display: Extend w/a for hard hang on HPD to dcn20") Reviewed-by: Nicholas Kazlauskas Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher --- .../amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c | 11 +--------- .../drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c | 11 +--------- .../amd/display/dc/irq/dcn20/irq_service_dcn20.c | 25 ---------------------- .../amd/display/dc/irq/dcn20/irq_service_dcn20.h | 2 -- .../amd/display/dc/irq/dcn21/irq_service_dcn21.c | 25 ---------------------- .../amd/display/dc/irq/dcn21/irq_service_dcn21.h | 2 -- drivers/gpu/drm/amd/display/dc/irq/irq_service.c | 2 +- drivers/gpu/drm/amd/display/dc/irq/irq_service.h | 4 ---- 8 files changed, 3 insertions(+), 79 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c index 9f35f2e8f971..cac80ba69072 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn20/dcn20_clk_mgr.c @@ -38,7 +38,6 @@ #include "clk/clk_11_0_0_offset.h" #include "clk/clk_11_0_0_sh_mask.h" -#include "irq/dcn20/irq_service_dcn20.h" #undef FN #define FN(reg_name, field_name) \ @@ -223,8 +222,6 @@ void dcn2_update_clocks(struct clk_mgr *clk_mgr_base, bool force_reset = false; bool p_state_change_support; int total_plane_count; - int irq_src; - uint32_t hpd_state; if (dc->work_arounds.skip_clock_update) return; @@ -242,13 +239,7 @@ void dcn2_update_clocks(struct clk_mgr *clk_mgr_base, if (dc->res_pool->pp_smu) pp_smu = &dc->res_pool->pp_smu->nv_funcs; - for (irq_src = DC_IRQ_SOURCE_HPD1; irq_src <= DC_IRQ_SOURCE_HPD6; irq_src++) { - hpd_state = dc_get_hpd_state_dcn20(dc->res_pool->irqs, irq_src); - if (hpd_state) - break; - } - - if (display_count == 0 && !hpd_state) + if (display_count == 0) enter_display_off = true; if (enter_display_off == safe_to_lower) { diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c index fbda42313bfe..f4dee0e48a67 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c @@ -42,7 +42,6 @@ #include "clk/clk_10_0_2_sh_mask.h" #include "renoir_ip_offset.h" -#include "irq/dcn21/irq_service_dcn21.h" /* Constants */ @@ -129,11 +128,9 @@ static void rn_update_clocks(struct clk_mgr *clk_mgr_base, struct dc_clocks *new_clocks = &context->bw_ctx.bw.dcn.clk; struct dc *dc = clk_mgr_base->ctx->dc; int display_count; - int irq_src; bool update_dppclk = false; bool update_dispclk = false; bool dpp_clock_lowered = false; - uint32_t hpd_state; struct dmcu *dmcu = clk_mgr_base->ctx->dc->res_pool->dmcu; @@ -150,14 +147,8 @@ static void rn_update_clocks(struct clk_mgr *clk_mgr_base, display_count = rn_get_active_display_cnt_wa(dc, context); - for (irq_src = DC_IRQ_SOURCE_HPD1; irq_src <= DC_IRQ_SOURCE_HPD5; irq_src++) { - hpd_state = dc_get_hpd_state_dcn21(dc->res_pool->irqs, irq_src); - if (hpd_state) - break; - } - /* if we can go lower, go lower */ - if (display_count == 0 && !hpd_state) { + if (display_count == 0) { rn_vbios_smu_set_dcn_low_power_state(clk_mgr, DCN_PWR_STATE_LOW_POWER); /* update power state */ clk_mgr_base->clks.pwr_state = DCN_PWR_STATE_LOW_POWER; diff --git a/drivers/gpu/drm/amd/display/dc/irq/dcn20/irq_service_dcn20.c b/drivers/gpu/drm/amd/display/dc/irq/dcn20/irq_service_dcn20.c index 9ccafe007b23..c4b067d01895 100644 --- a/drivers/gpu/drm/amd/display/dc/irq/dcn20/irq_service_dcn20.c +++ b/drivers/gpu/drm/amd/display/dc/irq/dcn20/irq_service_dcn20.c @@ -132,31 +132,6 @@ enum dc_irq_source to_dal_irq_source_dcn20( } } -uint32_t dc_get_hpd_state_dcn20(struct irq_service *irq_service, enum dc_irq_source source) -{ - const struct irq_source_info *info; - uint32_t addr; - uint32_t value; - uint32_t current_status; - - info = find_irq_source_info(irq_service, source); - if (!info) - return 0; - - addr = info->status_reg; - if (!addr) - return 0; - - value = dm_read_reg(irq_service->ctx, addr); - current_status = - get_reg_field_value( - value, - HPD0_DC_HPD_INT_STATUS, - DC_HPD_SENSE); - - return current_status; -} - static bool hpd_ack( struct irq_service *irq_service, const struct irq_source_info *info) diff --git a/drivers/gpu/drm/amd/display/dc/irq/dcn20/irq_service_dcn20.h b/drivers/gpu/drm/amd/display/dc/irq/dcn20/irq_service_dcn20.h index 4d69ab24ca25..aee4b37999f1 100644 --- a/drivers/gpu/drm/amd/display/dc/irq/dcn20/irq_service_dcn20.h +++ b/drivers/gpu/drm/amd/display/dc/irq/dcn20/irq_service_dcn20.h @@ -31,6 +31,4 @@ struct irq_service *dal_irq_service_dcn20_create( struct irq_service_init_data *init_data); -uint32_t dc_get_hpd_state_dcn20(struct irq_service *irq_service, enum dc_irq_source source); - #endif diff --git a/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c b/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c index 235294534c43..0f15bcada4e9 100644 --- a/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c +++ b/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.c @@ -134,31 +134,6 @@ static enum dc_irq_source to_dal_irq_source_dcn21(struct irq_service *irq_servic return DC_IRQ_SOURCE_INVALID; } -uint32_t dc_get_hpd_state_dcn21(struct irq_service *irq_service, enum dc_irq_source source) -{ - const struct irq_source_info *info; - uint32_t addr; - uint32_t value; - uint32_t current_status; - - info = find_irq_source_info(irq_service, source); - if (!info) - return 0; - - addr = info->status_reg; - if (!addr) - return 0; - - value = dm_read_reg(irq_service->ctx, addr); - current_status = - get_reg_field_value( - value, - HPD0_DC_HPD_INT_STATUS, - DC_HPD_SENSE); - - return current_status; -} - static bool hpd_ack( struct irq_service *irq_service, const struct irq_source_info *info) diff --git a/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.h b/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.h index 616470e32380..da2bd0e93d7a 100644 --- a/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.h +++ b/drivers/gpu/drm/amd/display/dc/irq/dcn21/irq_service_dcn21.h @@ -31,6 +31,4 @@ struct irq_service *dal_irq_service_dcn21_create( struct irq_service_init_data *init_data); -uint32_t dc_get_hpd_state_dcn21(struct irq_service *irq_service, enum dc_irq_source source); - #endif diff --git a/drivers/gpu/drm/amd/display/dc/irq/irq_service.c b/drivers/gpu/drm/amd/display/dc/irq/irq_service.c index 4db1133e4466..a2a4fbeb83f8 100644 --- a/drivers/gpu/drm/amd/display/dc/irq/irq_service.c +++ b/drivers/gpu/drm/amd/display/dc/irq/irq_service.c @@ -79,7 +79,7 @@ void dal_irq_service_destroy(struct irq_service **irq_service) *irq_service = NULL; } -const struct irq_source_info *find_irq_source_info( +static const struct irq_source_info *find_irq_source_info( struct irq_service *irq_service, enum dc_irq_source source) { diff --git a/drivers/gpu/drm/amd/display/dc/irq/irq_service.h b/drivers/gpu/drm/amd/display/dc/irq/irq_service.h index e60b82480093..dbfcb096eedd 100644 --- a/drivers/gpu/drm/amd/display/dc/irq/irq_service.h +++ b/drivers/gpu/drm/amd/display/dc/irq/irq_service.h @@ -69,10 +69,6 @@ struct irq_service { const struct irq_service_funcs *funcs; }; -const struct irq_source_info *find_irq_source_info( - struct irq_service *irq_service, - enum dc_irq_source source); - void dal_irq_service_construct( struct irq_service *irq_service, struct irq_service_init_data *init_data); -- cgit v1.2.3-58-ga151 From b992f01e66150fc5e90be4a96f5eb8e634c8249e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:05 +0530 Subject: bpf: Guard against accessing NULL pt_regs in bpf_get_task_stack() task_pt_regs() can return NULL on powerpc for kernel threads. This is then used in __bpf_get_stack() to check for user mode, resulting in a kernel oops. Guard against this by checking return value of task_pt_regs() before trying to obtain the call chain. Fixes: fa28dcb82a38f8 ("bpf: Introduce helper bpf_get_task_stack()") Cc: stable@vger.kernel.org # v5.9+ Signed-off-by: Naveen N. Rao Acked-by: Daniel Borkmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d5ef83c361cc255494afd15ff1b4fb02a36e1dcf.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- kernel/bpf/stackmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 49e567209c6b..22c8ae94e4c1 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -472,13 +472,14 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { struct pt_regs *regs; - long res; + long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); - res = __bpf_get_stack(regs, task, NULL, buf, size, flags); + if (regs) + res = __bpf_get_stack(regs, task, NULL, buf, size, flags); put_task_stack(task); return res; -- cgit v1.2.3-58-ga151 From fab07611fb2e6a15fac05c4583045ca5582fd826 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:06 +0530 Subject: powerpc32/bpf: Fix codegen for bpf-to-bpf calls Pad instructions emitted for BPF_CALL so that the number of instructions generated does not change for different function addresses. This is especially important for calls to other bpf functions, whose address will only be known during extra pass. Fixes: 51c66ad849a703 ("powerpc/bpf: Implement extended BPF on PPC32") Cc: stable@vger.kernel.org # v5.13+ Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/52d8fe51f7620a6f27f377791564d79d75463576.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/net/bpf_jit_comp32.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index faaebd446cad..c20b49bf8f5b 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -191,6 +191,9 @@ void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 fun if (image && rel < 0x2000000 && rel >= -0x2000000) { PPC_BL_ABS(func); + EMIT(PPC_RAW_NOP()); + EMIT(PPC_RAW_NOP()); + EMIT(PPC_RAW_NOP()); } else { /* Load function address into r0 */ EMIT(PPC_RAW_LIS(_R0, IMM_H(func))); -- cgit v1.2.3-58-ga151 From f9320c49993ca3c0ec0f9a7026b313735306bb8b Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:07 +0530 Subject: powerpc/bpf: Update ldimm64 instructions during extra pass These instructions are updated after the initial JIT, so redo codegen during the extra pass. Rename bpf_jit_fixup_subprog_calls() to clarify that this is more than just subprog calls. Fixes: 69c087ba6225b5 ("bpf: Add bpf_for_each_map_elem() helper") Cc: stable@vger.kernel.org # v5.15 Signed-off-by: Naveen N. Rao Tested-by: Jiri Olsa Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7cc162af77ba918eb3ecd26ec9e7824bc44b1fae.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/net/bpf_jit_comp.c | 29 +++++++++++++++++++++++------ arch/powerpc/net/bpf_jit_comp32.c | 6 ++++++ arch/powerpc/net/bpf_jit_comp64.c | 7 ++++++- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index d6ffdd0f2309..56dd1f4e3e44 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -23,15 +23,15 @@ static void bpf_jit_fill_ill_insns(void *area, unsigned int size) memset32(area, BREAKPOINT_INSTRUCTION, size / 4); } -/* Fix the branch target addresses for subprog calls */ -static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image, - struct codegen_context *ctx, u32 *addrs) +/* Fix updated addresses (for subprog calls, ldimm64, et al) during extra pass */ +static int bpf_jit_fixup_addresses(struct bpf_prog *fp, u32 *image, + struct codegen_context *ctx, u32 *addrs) { const struct bpf_insn *insn = fp->insnsi; bool func_addr_fixed; u64 func_addr; u32 tmp_idx; - int i, ret; + int i, j, ret; for (i = 0; i < fp->len; i++) { /* @@ -66,6 +66,23 @@ static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image, * of the JITed sequence remains unchanged. */ ctx->idx = tmp_idx; + } else if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW)) { + tmp_idx = ctx->idx; + ctx->idx = addrs[i] / 4; +#ifdef CONFIG_PPC32 + PPC_LI32(ctx->b2p[insn[i].dst_reg] - 1, (u32)insn[i + 1].imm); + PPC_LI32(ctx->b2p[insn[i].dst_reg], (u32)insn[i].imm); + for (j = ctx->idx - addrs[i] / 4; j < 4; j++) + EMIT(PPC_RAW_NOP()); +#else + func_addr = ((u64)(u32)insn[i].imm) | (((u64)(u32)insn[i + 1].imm) << 32); + PPC_LI64(b2p[insn[i].dst_reg], func_addr); + /* overwrite rest with nops */ + for (j = ctx->idx - addrs[i] / 4; j < 5; j++) + EMIT(PPC_RAW_NOP()); +#endif + ctx->idx = tmp_idx; + i++; } } @@ -200,13 +217,13 @@ skip_init_ctx: /* * Do not touch the prologue and epilogue as they will remain * unchanged. Only fix the branch target address for subprog - * calls in the body. + * calls in the body, and ldimm64 instructions. * * This does not change the offsets and lengths of the subprog * call instruction sequences and hence, the size of the JITed * image as well. */ - bpf_jit_fixup_subprog_calls(fp, code_base, &cgctx, addrs); + bpf_jit_fixup_addresses(fp, code_base, &cgctx, addrs); /* There is no need to perform the usual passes. */ goto skip_codegen_passes; diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index c20b49bf8f5b..cf8dd8aea386 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -293,6 +293,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * bool func_addr_fixed; u64 func_addr; u32 true_cond; + u32 tmp_idx; + int j; /* * addrs[] maps a BPF bytecode address into a real offset from @@ -908,8 +910,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * * 16 byte instruction that uses two 'struct bpf_insn' */ case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */ + tmp_idx = ctx->idx; PPC_LI32(dst_reg_h, (u32)insn[i + 1].imm); PPC_LI32(dst_reg, (u32)insn[i].imm); + /* padding to allow full 4 instructions for later patching */ + for (j = ctx->idx - tmp_idx; j < 4; j++) + EMIT(PPC_RAW_NOP()); /* Adjust for two bpf instructions */ addrs[++i] = ctx->idx * 4; break; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 9eae8d8ed340..bfdd417c6b81 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -319,6 +319,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * u64 imm64; u32 true_cond; u32 tmp_idx; + int j; /* * addrs[] maps a BPF bytecode address into a real offset from @@ -848,9 +849,13 @@ emit_clear: case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */ imm64 = ((u64)(u32) insn[i].imm) | (((u64)(u32) insn[i+1].imm) << 32); + tmp_idx = ctx->idx; + PPC_LI64(dst_reg, imm64); + /* padding to allow full 5 instructions for later patching */ + for (j = ctx->idx - tmp_idx; j < 5; j++) + EMIT(PPC_RAW_NOP()); /* Adjust for two bpf instructions */ addrs[++i] = ctx->idx * 4; - PPC_LI64(dst_reg, imm64); break; /* -- cgit v1.2.3-58-ga151 From 88a71086c48ae98e93c0208044827621e9717f7e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:08 +0530 Subject: tools/bpf: Rename 'struct event' to avoid naming conflict On ppc64le, trying to build bpf seltests throws the below warning: In file included from runqslower.bpf.c:5: ./runqslower.h:7:8: error: redefinition of 'event' struct event { ^ /home/naveen/linux/tools/testing/selftests/bpf/tools/build/runqslower/vmlinux.h:156602:8: note: previous definition is here struct event { ^ This happens since 'struct event' is defined in drivers/net/ethernet/alteon/acenic.h . Rename the one in runqslower to a more appropriate 'runq_event' to avoid the naming conflict. Signed-off-by: Naveen N. Rao Acked-by: Daniel Borkmann Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c13cb3767d26257ca4387b8296b632b433a58db6.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- tools/bpf/runqslower/runqslower.bpf.c | 2 +- tools/bpf/runqslower/runqslower.c | 2 +- tools/bpf/runqslower/runqslower.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c index ab9353f2fd46..9a5c1f008fe6 100644 --- a/tools/bpf/runqslower/runqslower.bpf.c +++ b/tools/bpf/runqslower/runqslower.bpf.c @@ -68,7 +68,7 @@ int handle__sched_switch(u64 *ctx) */ struct task_struct *prev = (struct task_struct *)ctx[1]; struct task_struct *next = (struct task_struct *)ctx[2]; - struct event event = {}; + struct runq_event event = {}; u64 *tsp, delta_us; long state; u32 pid; diff --git a/tools/bpf/runqslower/runqslower.c b/tools/bpf/runqslower/runqslower.c index 2414cc764461..d78f4148597f 100644 --- a/tools/bpf/runqslower/runqslower.c +++ b/tools/bpf/runqslower/runqslower.c @@ -100,7 +100,7 @@ static int bump_memlock_rlimit(void) void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) { - const struct event *e = data; + const struct runq_event *e = data; struct tm *tm; char ts[32]; time_t t; diff --git a/tools/bpf/runqslower/runqslower.h b/tools/bpf/runqslower/runqslower.h index 9db225425e5f..4f70f07200c2 100644 --- a/tools/bpf/runqslower/runqslower.h +++ b/tools/bpf/runqslower/runqslower.h @@ -4,7 +4,7 @@ #define TASK_COMM_LEN 16 -struct event { +struct runq_event { char task[TASK_COMM_LEN]; __u64 delta_us; pid_t pid; -- cgit v1.2.3-58-ga151 From 3f5f766d5f7f95a69a630da3544a1a0cee1cdddf Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 6 Jan 2022 17:15:12 +0530 Subject: powerpc64/bpf: Limit 'ldbrx' to processors compliant with ISA v2.06 Johan reported the below crash with test_bpf on ppc64 e5500: test_bpf: #296 ALU_END_FROM_LE 64: 0x0123456789abcdef -> 0x67452301 jited:1 Oops: Exception in kernel mode, sig: 4 [#1] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 Modules linked in: test_bpf(+) CPU: 0 PID: 76 Comm: insmod Not tainted 5.14.0-03771-g98c2059e008a-dirty #1 NIP: 8000000000061c3c LR: 80000000006dea64 CTR: 8000000000061c18 REGS: c0000000032d3420 TRAP: 0700 Not tainted (5.14.0-03771-g98c2059e008a-dirty) MSR: 0000000080089000 CR: 88002822 XER: 20000000 IRQMASK: 0 <...> NIP [8000000000061c3c] 0x8000000000061c3c LR [80000000006dea64] .__run_one+0x104/0x17c [test_bpf] Call Trace: .__run_one+0x60/0x17c [test_bpf] (unreliable) .test_bpf_init+0x6a8/0xdc8 [test_bpf] .do_one_initcall+0x6c/0x28c .do_init_module+0x68/0x28c .load_module+0x2460/0x2abc .__do_sys_init_module+0x120/0x18c .system_call_exception+0x110/0x1b8 system_call_common+0xf0/0x210 --- interrupt: c00 at 0x101d0acc <...> ---[ end trace 47b2bf19090bb3d0 ]--- Illegal instruction The illegal instruction turned out to be 'ldbrx' emitted for BPF_FROM_[L|B]E, which was only introduced in ISA v2.06. Guard use of the same and implement an alternative approach for older processors. Fixes: 156d0e290e969c ("powerpc/ebpf/jit: Implement JIT compiler for extended BPF") Reported-by: Johan Almbladh Signed-off-by: Naveen N. Rao Tested-by: Johan Almbladh Acked-by: Johan Almbladh Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d1e51c6fdf572062cf3009a751c3406bda01b832.1641468127.git.naveen.n.rao@linux.vnet.ibm.com --- arch/powerpc/include/asm/ppc-opcode.h | 1 + arch/powerpc/net/bpf_jit_comp64.c | 22 +++++++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index efad07081cc0..9675303b724e 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -500,6 +500,7 @@ #define PPC_RAW_LDX(r, base, b) (0x7c00002a | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) #define PPC_RAW_LHZ(r, base, i) (0xa0000000 | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) #define PPC_RAW_LHBRX(r, base, b) (0x7c00062c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) +#define PPC_RAW_LWBRX(r, base, b) (0x7c00042c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) #define PPC_RAW_LDBRX(r, base, b) (0x7c000428 | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) #define PPC_RAW_STWCX(s, a, b) (0x7c00012d | ___PPC_RS(s) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_CMPWI(a, i) (0x2c000000 | ___PPC_RA(a) | IMM_L(i)) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index bfdd417c6b81..e1e8c934308a 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -634,17 +634,21 @@ bpf_alu32_trunc: EMIT(PPC_RAW_MR(dst_reg, b2p[TMP_REG_1])); break; case 64: - /* - * Way easier and faster(?) to store the value - * into stack and then use ldbrx - * - * ctx->seen will be reliable in pass2, but - * the instructions generated will remain the - * same across all passes - */ + /* Store the value to stack and then use byte-reverse loads */ PPC_BPF_STL(dst_reg, 1, bpf_jit_stack_local(ctx)); EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], 1, bpf_jit_stack_local(ctx))); - EMIT(PPC_RAW_LDBRX(dst_reg, 0, b2p[TMP_REG_1])); + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + EMIT(PPC_RAW_LDBRX(dst_reg, 0, b2p[TMP_REG_1])); + } else { + EMIT(PPC_RAW_LWBRX(dst_reg, 0, b2p[TMP_REG_1])); + if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + EMIT(PPC_RAW_SLDI(dst_reg, dst_reg, 32)); + EMIT(PPC_RAW_LI(b2p[TMP_REG_2], 4)); + EMIT(PPC_RAW_LWBRX(b2p[TMP_REG_2], b2p[TMP_REG_2], b2p[TMP_REG_1])); + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) + EMIT(PPC_RAW_SLDI(b2p[TMP_REG_2], b2p[TMP_REG_2], 32)); + EMIT(PPC_RAW_OR(dst_reg, dst_reg, b2p[TMP_REG_2])); + } break; } break; -- cgit v1.2.3-58-ga151 From 252745240ba0ae774d2f80c5e185ed59fbc4fb41 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 14 Jan 2022 11:26:25 +0000 Subject: powerpc/audit: Fix syscall_get_arch() Commit 770cec16cdc9 ("powerpc/audit: Simplify syscall_get_arch()") and commit 898a1ef06ad4 ("powerpc/audit: Avoid unneccessary #ifdef in syscall_get_arguments()") replaced test_tsk_thread_flag(task, TIF_32BIT)) by is_32bit_task(). But is_32bit_task() applies on current task while be want the test done on task 'task' So add a new macro is_tsk_32bit_task() to check any task. Fixes: 770cec16cdc9 ("powerpc/audit: Simplify syscall_get_arch()") Fixes: 898a1ef06ad4 ("powerpc/audit: Avoid unneccessary #ifdef in syscall_get_arguments()") Cc: stable@vger.kernel.org Reported-by: Dmitry V. Levin Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c55cddb8f65713bf5859ed675d75a50cb37d5995.1642159570.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/syscall.h | 4 ++-- arch/powerpc/include/asm/thread_info.h | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 52d05b465e3e..25fc8ad9a27a 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -90,7 +90,7 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long val, mask = -1UL; unsigned int n = 6; - if (is_32bit_task()) + if (is_tsk_32bit_task(task)) mask = 0xffffffff; while (n--) { @@ -105,7 +105,7 @@ static inline void syscall_get_arguments(struct task_struct *task, static inline int syscall_get_arch(struct task_struct *task) { - if (is_32bit_task()) + if (is_tsk_32bit_task(task)) return AUDIT_ARCH_PPC; else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) return AUDIT_ARCH_PPC64LE; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 5725029aaa29..d6e649b3c70b 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -168,8 +168,10 @@ static inline bool test_thread_local_flags(unsigned int flags) #ifdef CONFIG_COMPAT #define is_32bit_task() (test_thread_flag(TIF_32BIT)) +#define is_tsk_32bit_task(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT)) #else #define is_32bit_task() (IS_ENABLED(CONFIG_PPC32)) +#define is_tsk_32bit_task(tsk) (IS_ENABLED(CONFIG_PPC32)) #endif #if defined(CONFIG_PPC64) -- cgit v1.2.3-58-ga151 From b7ec62d7ee0f0b8af6ba190501dff7f9ee6545ca Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:16:57 -0700 Subject: bitops: protect find_first_{,zero}_bit properly find_first_bit() and find_first_zero_bit() are not protected with ifdefs as other functions in find.h. It causes build errors on some platforms if CONFIG_GENERIC_FIND_FIRST_BIT is enabled. Signed-off-by: Yury Norov Fixes: 2cc7b6a44ac2 ("lib: add fast path for find_first_*_bit() and find_last_bit()") Reported-by: kernel test robot Tested-by: Wolfram Sang --- include/asm-generic/bitops/find.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 0d132ee2a291..835f959a25f2 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -97,6 +97,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, #ifdef CONFIG_GENERIC_FIND_FIRST_BIT +#ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region * @addr: The address to start the search at @@ -116,7 +117,9 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) return _find_first_bit(addr, size); } +#endif +#ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region * @addr: The address to start the search at @@ -136,6 +139,8 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) return _find_first_zero_bit(addr, size); } +#endif + #else /* CONFIG_GENERIC_FIND_FIRST_BIT */ #ifndef find_first_bit -- cgit v1.2.3-58-ga151 From 6b8ecb84f8f64017ae6e56cd745ad88e48f68779 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:16:58 -0700 Subject: bitops: move find_bit_*_le functions from le.h to find.h It's convenient to have all find_bit declarations in one place. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/asm-generic/bitops/find.h | 69 +++++++++++++++++++++++++++++++++++++++ include/asm-generic/bitops/le.h | 64 ------------------------------------ 2 files changed, 69 insertions(+), 64 deletions(-) diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h index 835f959a25f2..91b1b23f2b0c 100644 --- a/include/asm-generic/bitops/find.h +++ b/include/asm-generic/bitops/find.h @@ -190,4 +190,73 @@ extern unsigned long find_next_clump8(unsigned long *clump, #define find_first_clump8(clump, bits, size) \ find_next_clump8((clump), (bits), (size), 0) +#if defined(__LITTLE_ENDIAN) + +static inline unsigned long find_next_zero_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_zero_bit(addr, size, offset); +} + +static inline unsigned long find_next_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static inline unsigned long find_first_zero_bit_le(const void *addr, + unsigned long size) +{ + return find_first_zero_bit(addr, size); +} + +#elif defined(__BIG_ENDIAN) + +#ifndef find_next_zero_bit_le +static inline +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); +} +#endif + +#ifndef find_next_bit_le +static inline +unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); +} +#endif + +#ifndef find_first_zero_bit_le +#define find_first_zero_bit_le(addr, size) \ + find_next_zero_bit_le((addr), (size), 0) +#endif + +#else +#error "Please fix " +#endif + #endif /*_ASM_GENERIC_BITOPS_FIND_H_ */ diff --git a/include/asm-generic/bitops/le.h b/include/asm-generic/bitops/le.h index 5a28629cbf4d..d51beff60375 100644 --- a/include/asm-generic/bitops/le.h +++ b/include/asm-generic/bitops/le.h @@ -2,83 +2,19 @@ #ifndef _ASM_GENERIC_BITOPS_LE_H_ #define _ASM_GENERIC_BITOPS_LE_H_ -#include #include #include -#include #if defined(__LITTLE_ENDIAN) #define BITOP_LE_SWIZZLE 0 -static inline unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset) -{ - return find_next_zero_bit(addr, size, offset); -} - -static inline unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset) -{ - return find_next_bit(addr, size, offset); -} - -static inline unsigned long find_first_zero_bit_le(const void *addr, - unsigned long size) -{ - return find_first_zero_bit(addr, size); -} - #elif defined(__BIG_ENDIAN) #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) -#ifndef find_next_zero_bit_le -static inline -unsigned long find_next_zero_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val = *(const unsigned long *)addr; - - if (unlikely(offset >= size)) - return size; - - val = swab(val) | ~GENMASK(size - 1, offset); - return val == ~0UL ? size : ffz(val); - } - - return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); -} -#endif - -#ifndef find_next_bit_le -static inline -unsigned long find_next_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val = *(const unsigned long *)addr; - - if (unlikely(offset >= size)) - return size; - - val = swab(val) & GENMASK(size - 1, offset); - return val ? __ffs(val) : size; - } - - return _find_next_bit(addr, NULL, size, offset, 0UL, 1); -} #endif -#ifndef find_first_zero_bit_le -#define find_first_zero_bit_le(addr, size) \ - find_next_zero_bit_le((addr), (size), 0) -#endif - -#else -#error "Please fix " -#endif static inline int test_bit_le(int nr, const void *addr) { -- cgit v1.2.3-58-ga151 From 47d8c15615c0a2046d2d90b04cb80b81ddf31fb1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:16:59 -0700 Subject: include: move find.h from asm_generic to linux find_bit API and bitmap API are closely related, but inclusion paths are different - include/asm-generic and include/linux, correspondingly. In the past it made a lot of troubles due to circular dependencies and/or undefined symbols. Fix this by moving find.h under include/linux. Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Geert Uytterhoeven --- MAINTAINERS | 2 +- arch/alpha/include/asm/bitops.h | 2 - arch/arc/include/asm/bitops.h | 1 - arch/arm/include/asm/bitops.h | 1 - arch/arm64/include/asm/bitops.h | 1 - arch/csky/include/asm/bitops.h | 1 - arch/h8300/include/asm/bitops.h | 1 - arch/hexagon/include/asm/bitops.h | 1 - arch/ia64/include/asm/bitops.h | 2 - arch/m68k/include/asm/bitops.h | 2 - arch/mips/include/asm/bitops.h | 1 - arch/openrisc/include/asm/bitops.h | 1 - arch/parisc/include/asm/bitops.h | 1 - arch/powerpc/include/asm/bitops.h | 2 - arch/riscv/include/asm/bitops.h | 1 - arch/s390/include/asm/bitops.h | 1 - arch/sh/include/asm/bitops.h | 1 - arch/sparc/include/asm/bitops_32.h | 1 - arch/sparc/include/asm/bitops_64.h | 2 - arch/x86/include/asm/bitops.h | 2 - arch/xtensa/include/asm/bitops.h | 1 - include/asm-generic/bitops.h | 1 - include/asm-generic/bitops/find.h | 262 ------------------------------------ include/linux/bitmap.h | 1 + include/linux/find.h | 268 +++++++++++++++++++++++++++++++++++++ 25 files changed, 270 insertions(+), 290 deletions(-) delete mode 100644 include/asm-generic/bitops/find.h create mode 100644 include/linux/find.h diff --git a/MAINTAINERS b/MAINTAINERS index dd36acc87ce6..4646786f5302 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3359,8 +3359,8 @@ M: Yury Norov R: Andy Shevchenko R: Rasmus Villemoes S: Maintained -F: include/asm-generic/bitops/find.h F: include/linux/bitmap.h +F: include/linux/find.h F: lib/bitmap.c F: lib/find_bit.c F: lib/find_bit_benchmark.c diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index 5adca78830b5..e1d8483a45f2 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -430,8 +430,6 @@ static inline unsigned int __arch_hweight8(unsigned int w) #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ /* diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index a7daaf64ae34..bdb7e190a294 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -189,7 +189,6 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x) #include #include -#include #include #include diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h index c92e42a5c8f7..8e94fe7ab5eb 100644 --- a/arch/arm/include/asm/bitops.h +++ b/arch/arm/include/asm/bitops.h @@ -264,7 +264,6 @@ static inline int find_next_bit_le(const void *p, int size, int offset) #endif -#include #include /* diff --git a/arch/arm64/include/asm/bitops.h b/arch/arm64/include/asm/bitops.h index 81a3e519b07d..9b3c787132d2 100644 --- a/arch/arm64/include/asm/bitops.h +++ b/arch/arm64/include/asm/bitops.h @@ -18,7 +18,6 @@ #include #include -#include #include #include diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h index 02b72a000767..72e1b2aa29a0 100644 --- a/arch/csky/include/asm/bitops.h +++ b/arch/csky/include/asm/bitops.h @@ -59,7 +59,6 @@ static __always_inline unsigned long __fls(unsigned long x) #include #include -#include #ifndef _LINUX_BITOPS_H #error only can be included directly diff --git a/arch/h8300/include/asm/bitops.h b/arch/h8300/include/asm/bitops.h index c867a80cab5b..4489e3d6edd3 100644 --- a/arch/h8300/include/asm/bitops.h +++ b/arch/h8300/include/asm/bitops.h @@ -168,7 +168,6 @@ static inline unsigned long __ffs(unsigned long word) return result; } -#include #include #include #include diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h index 71429f756af0..75d6ba3643b8 100644 --- a/arch/hexagon/include/asm/bitops.h +++ b/arch/hexagon/include/asm/bitops.h @@ -271,7 +271,6 @@ static inline unsigned long __fls(unsigned long word) } #include -#include #include #include diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h index 2f24ee6459d2..577be93c0818 100644 --- a/arch/ia64/include/asm/bitops.h +++ b/arch/ia64/include/asm/bitops.h @@ -441,8 +441,6 @@ static __inline__ unsigned long __arch_hweight64(unsigned long x) #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ #include diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 7b93e1fd8ffa..51283db53667 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -529,6 +529,4 @@ static inline int __fls(int x) #include #endif /* __KERNEL__ */ -#include - #endif /* _M68K_BITOPS_H */ diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index dc2a6234dd3c..c09d57f907f7 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -446,7 +446,6 @@ static inline int ffs(int word) } #include -#include #ifdef __KERNEL__ diff --git a/arch/openrisc/include/asm/bitops.h b/arch/openrisc/include/asm/bitops.h index 7f1ca35213d8..d773ed938acb 100644 --- a/arch/openrisc/include/asm/bitops.h +++ b/arch/openrisc/include/asm/bitops.h @@ -30,7 +30,6 @@ #include #include #include -#include #ifndef _LINUX_BITOPS_H #error only can be included directly diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index daa2afd974fb..0ec9cfc5131f 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -203,7 +203,6 @@ static __inline__ int fls(unsigned int x) #include #include #include -#include #include #include diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 11847b6a244e..abc8f2c7c570 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -255,8 +255,6 @@ unsigned long __arch_hweight64(__u64 w); #include #endif -#include - /* wrappers that deal with KASAN instrumentation */ #include #include diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 396a3303c537..3540b690944b 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 5a530c552c23..1d40630128a5 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -387,7 +387,6 @@ static inline int fls(unsigned int word) #endif /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */ #include -#include #include #include #include diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h index 3b6c7b5b7ec9..10ceb0d6b5a9 100644 --- a/arch/sh/include/asm/bitops.h +++ b/arch/sh/include/asm/bitops.h @@ -68,6 +68,5 @@ static inline unsigned long __ffs(unsigned long word) #include #include -#include #endif /* __ASM_SH_BITOPS_H */ diff --git a/arch/sparc/include/asm/bitops_32.h b/arch/sparc/include/asm/bitops_32.h index 0ceff3b915a8..889afa9f990f 100644 --- a/arch/sparc/include/asm/bitops_32.h +++ b/arch/sparc/include/asm/bitops_32.h @@ -100,7 +100,6 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr) #include #include #include -#include #include #include diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h index ca7ea5913494..005a8ae858f1 100644 --- a/arch/sparc/include/asm/bitops_64.h +++ b/arch/sparc/include/asm/bitops_64.h @@ -52,8 +52,6 @@ unsigned int __arch_hweight8(unsigned int w); #include #endif /* __KERNEL__ */ -#include - #ifdef __KERNEL__ #include diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 0367efdc5b7a..a288ecd230ab 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -380,8 +380,6 @@ static __always_inline int fls64(__u64 x) #include #endif -#include - #include #include diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h index 3f71d364ba90..cd225896c40f 100644 --- a/arch/xtensa/include/asm/bitops.h +++ b/arch/xtensa/include/asm/bitops.h @@ -205,7 +205,6 @@ BIT_OPS(change, "xor", ) #undef BIT_OP #undef TEST_AND_BIT_OP -#include #include #include diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h index df9b5bc3d282..a47b8a71d6fe 100644 --- a/include/asm-generic/bitops.h +++ b/include/asm-generic/bitops.h @@ -20,7 +20,6 @@ #include #include #include -#include #ifndef _LINUX_BITOPS_H #error only can be included directly diff --git a/include/asm-generic/bitops/find.h b/include/asm-generic/bitops/find.h deleted file mode 100644 index 91b1b23f2b0c..000000000000 --- a/include/asm-generic/bitops/find.h +++ /dev/null @@ -1,262 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_GENERIC_BITOPS_FIND_H_ -#define _ASM_GENERIC_BITOPS_FIND_H_ - -extern unsigned long _find_next_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long nbits, - unsigned long start, unsigned long invert, unsigned long le); -extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); -extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); -extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); - -#ifndef find_next_bit -/** - * find_next_bit - find the next set bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The bitmap size in bits - * - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ -static inline -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val; - - if (unlikely(offset >= size)) - return size; - - val = *addr & GENMASK(size - 1, offset); - return val ? __ffs(val) : size; - } - - return _find_next_bit(addr, NULL, size, offset, 0UL, 0); -} -#endif - -#ifndef find_next_and_bit -/** - * find_next_and_bit - find the next set bit in both memory regions - * @addr1: The first address to base the search on - * @addr2: The second address to base the search on - * @offset: The bitnumber to start searching at - * @size: The bitmap size in bits - * - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ -static inline -unsigned long find_next_and_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long size, - unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val; - - if (unlikely(offset >= size)) - return size; - - val = *addr1 & *addr2 & GENMASK(size - 1, offset); - return val ? __ffs(val) : size; - } - - return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); -} -#endif - -#ifndef find_next_zero_bit -/** - * find_next_zero_bit - find the next cleared bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The bitmap size in bits - * - * Returns the bit number of the next zero bit - * If no bits are zero, returns @size. - */ -static inline -unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val; - - if (unlikely(offset >= size)) - return size; - - val = *addr | ~GENMASK(size - 1, offset); - return val == ~0UL ? size : ffz(val); - } - - return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); -} -#endif - -#ifdef CONFIG_GENERIC_FIND_FIRST_BIT - -#ifndef find_first_bit -/** - * find_first_bit - find the first set bit in a memory region - * @addr: The address to start the search at - * @size: The maximum number of bits to search - * - * Returns the bit number of the first set bit. - * If no bits are set, returns @size. - */ -static inline -unsigned long find_first_bit(const unsigned long *addr, unsigned long size) -{ - if (small_const_nbits(size)) { - unsigned long val = *addr & GENMASK(size - 1, 0); - - return val ? __ffs(val) : size; - } - - return _find_first_bit(addr, size); -} -#endif - -#ifndef find_first_zero_bit -/** - * find_first_zero_bit - find the first cleared bit in a memory region - * @addr: The address to start the search at - * @size: The maximum number of bits to search - * - * Returns the bit number of the first cleared bit. - * If no bits are zero, returns @size. - */ -static inline -unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) -{ - if (small_const_nbits(size)) { - unsigned long val = *addr | ~GENMASK(size - 1, 0); - - return val == ~0UL ? size : ffz(val); - } - - return _find_first_zero_bit(addr, size); -} -#endif - -#else /* CONFIG_GENERIC_FIND_FIRST_BIT */ - -#ifndef find_first_bit -#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) -#endif -#ifndef find_first_zero_bit -#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) -#endif - -#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ - -#ifndef find_last_bit -/** - * find_last_bit - find the last set bit in a memory region - * @addr: The address to start the search at - * @size: The number of bits to search - * - * Returns the bit number of the last set bit, or size. - */ -static inline -unsigned long find_last_bit(const unsigned long *addr, unsigned long size) -{ - if (small_const_nbits(size)) { - unsigned long val = *addr & GENMASK(size - 1, 0); - - return val ? __fls(val) : size; - } - - return _find_last_bit(addr, size); -} -#endif - -/** - * find_next_clump8 - find next 8-bit clump with set bits in a memory region - * @clump: location to store copy of found clump - * @addr: address to base the search on - * @size: bitmap size in number of bits - * @offset: bit offset at which to start searching - * - * Returns the bit offset for the next set clump; the found clump value is - * copied to the location pointed by @clump. If no bits are set, returns @size. - */ -extern unsigned long find_next_clump8(unsigned long *clump, - const unsigned long *addr, - unsigned long size, unsigned long offset); - -#define find_first_clump8(clump, bits, size) \ - find_next_clump8((clump), (bits), (size), 0) - -#if defined(__LITTLE_ENDIAN) - -static inline unsigned long find_next_zero_bit_le(const void *addr, - unsigned long size, unsigned long offset) -{ - return find_next_zero_bit(addr, size, offset); -} - -static inline unsigned long find_next_bit_le(const void *addr, - unsigned long size, unsigned long offset) -{ - return find_next_bit(addr, size, offset); -} - -static inline unsigned long find_first_zero_bit_le(const void *addr, - unsigned long size) -{ - return find_first_zero_bit(addr, size); -} - -#elif defined(__BIG_ENDIAN) - -#ifndef find_next_zero_bit_le -static inline -unsigned long find_next_zero_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val = *(const unsigned long *)addr; - - if (unlikely(offset >= size)) - return size; - - val = swab(val) | ~GENMASK(size - 1, offset); - return val == ~0UL ? size : ffz(val); - } - - return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); -} -#endif - -#ifndef find_next_bit_le -static inline -unsigned long find_next_bit_le(const void *addr, unsigned - long size, unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val = *(const unsigned long *)addr; - - if (unlikely(offset >= size)) - return size; - - val = swab(val) & GENMASK(size - 1, offset); - return val ? __ffs(val) : size; - } - - return _find_next_bit(addr, NULL, size, offset, 0UL, 1); -} -#endif - -#ifndef find_first_zero_bit_le -#define find_first_zero_bit_le(addr, size) \ - find_next_zero_bit_le((addr), (size), 0) -#endif - -#else -#error "Please fix " -#endif - -#endif /*_ASM_GENERIC_BITOPS_FIND_H_ */ diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index a241dcf50f39..ead4a150bd7f 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -6,6 +6,7 @@ #include #include +#include #include #include #include diff --git a/include/linux/find.h b/include/linux/find.h new file mode 100644 index 000000000000..c5410c243e04 --- /dev/null +++ b/include/linux/find.h @@ -0,0 +1,268 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_FIND_H_ +#define __LINUX_FIND_H_ + +#ifndef __LINUX_BITMAP_H +#error only can be included directly +#endif + +#include + +extern unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert, unsigned long le); +extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); + +#ifndef find_next_bit +/** + * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); +} +#endif + +#ifndef find_next_and_bit +/** + * find_next_and_bit - find the next set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & *addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); +} +#endif + +#ifndef find_next_zero_bit +/** + * find_next_zero_bit - find the next cleared bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number of the next zero bit + * If no bits are zero, returns @size. + */ +static inline +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); +} +#endif + +#ifdef CONFIG_GENERIC_FIND_FIRST_BIT + +#ifndef find_first_bit +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum number of bits to search + * + * Returns the bit number of the first set bit. + * If no bits are set, returns @size. + */ +static inline +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_bit(addr, size); +} +#endif + +#ifndef find_first_zero_bit +/** + * find_first_zero_bit - find the first cleared bit in a memory region + * @addr: The address to start the search at + * @size: The maximum number of bits to search + * + * Returns the bit number of the first cleared bit. + * If no bits are zero, returns @size. + */ +static inline +unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr | ~GENMASK(size - 1, 0); + + return val == ~0UL ? size : ffz(val); + } + + return _find_first_zero_bit(addr, size); +} +#endif + +#else /* CONFIG_GENERIC_FIND_FIRST_BIT */ + +#ifndef find_first_bit +#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) +#endif +#ifndef find_first_zero_bit +#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) +#endif + +#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ + +#ifndef find_last_bit +/** + * find_last_bit - find the last set bit in a memory region + * @addr: The address to start the search at + * @size: The number of bits to search + * + * Returns the bit number of the last set bit, or size. + */ +static inline +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __fls(val) : size; + } + + return _find_last_bit(addr, size); +} +#endif + +/** + * find_next_clump8 - find next 8-bit clump with set bits in a memory region + * @clump: location to store copy of found clump + * @addr: address to base the search on + * @size: bitmap size in number of bits + * @offset: bit offset at which to start searching + * + * Returns the bit offset for the next set clump; the found clump value is + * copied to the location pointed by @clump. If no bits are set, returns @size. + */ +extern unsigned long find_next_clump8(unsigned long *clump, + const unsigned long *addr, + unsigned long size, unsigned long offset); + +#define find_first_clump8(clump, bits, size) \ + find_next_clump8((clump), (bits), (size), 0) + +#if defined(__LITTLE_ENDIAN) + +static inline unsigned long find_next_zero_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_zero_bit(addr, size, offset); +} + +static inline unsigned long find_next_bit_le(const void *addr, + unsigned long size, unsigned long offset) +{ + return find_next_bit(addr, size, offset); +} + +static inline unsigned long find_first_zero_bit_le(const void *addr, + unsigned long size) +{ + return find_first_zero_bit(addr, size); +} + +#elif defined(__BIG_ENDIAN) + +#ifndef find_next_zero_bit_le +static inline +unsigned long find_next_zero_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 1); +} +#endif + +#ifndef find_next_bit_le +static inline +unsigned long find_next_bit_le(const void *addr, unsigned + long size, unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val = *(const unsigned long *)addr; + + if (unlikely(offset >= size)) + return size; + + val = swab(val) & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 1); +} +#endif + +#ifndef find_first_zero_bit_le +#define find_first_zero_bit_le(addr, size) \ + find_next_zero_bit_le((addr), (size), 0) +#endif + +#else +#error "Please fix " +#endif + +#endif /*__LINUX_FIND_H_ */ -- cgit v1.2.3-58-ga151 From c126a53c276048125b4a950072bab37ad0fea120 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:00 -0700 Subject: arch: remove GENERIC_FIND_FIRST_BIT entirely In 5.12 cycle we enabled GENERIC_FIND_FIRST_BIT config option for ARM64 and MIPS. It increased performance and shrunk .text size; and so far I didn't receive any negative feedback on the change. https://lore.kernel.org/linux-arch/20210225135700.1381396-1-yury.norov@gmail.com/ Now I think it's a good time to switch all architectures to use find_{first,last}_bit() unconditionally, and so remove corresponding config option. The patch does't introduce functioal changes for arc, arm, arm64, mips, m68k, s390 and x86, for other architectures I expect improvement both in performance and .text size. Signed-off-by: Yury Norov Tested-by: Alexander Lobakin (mips) Reviewed-by: Alexander Lobakin (mips) Reviewed-by: Andy Shevchenko Acked-by: Will Deacon Tested-by: Wolfram Sang --- arch/arc/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/um/Kconfig | 1 - include/linux/find.h | 13 ------------- lib/Kconfig | 3 --- 8 files changed, 22 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index b4ae6058902a..4bec4b0b6ce1 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -20,7 +20,6 @@ config ARC select COMMON_CLK select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC) - select GENERIC_FIND_FIRST_BIT # for now, we don't need GENERIC_IRQ_PROBE, CONFIG_GENERIC_IRQ_CHIP select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c4207cf9bb17..517d26c8002d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -120,7 +120,6 @@ config ARM64 select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP - select GENERIC_FIND_FIRST_BIT select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_IPI select GENERIC_IRQ_PROBE diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 0215dc1529e9..00951bfdbab0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -32,7 +32,6 @@ config MIPS select GENERIC_ATOMIC64 if !64BIT select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE - select GENERIC_FIND_FIRST_BIT select GENERIC_GETTIMEOFDAY select GENERIC_IOMAP select GENERIC_IRQ_PROBE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2a5bb4f29cfe..4f80f1c95468 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -127,7 +127,6 @@ config S390 select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_ENTRY - select GENERIC_FIND_FIRST_BIT select GENERIC_GETTIMEOFDAY select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5c2ccb85f2ef..60484b39257c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -136,7 +136,6 @@ config X86 select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_ENTRY - select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 95d26a69088b..40d6a06e41c8 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -8,7 +8,6 @@ endmenu config UML_X86 def_bool y - select GENERIC_FIND_FIRST_BIT config 64BIT bool "64-bit kernel" if "$(SUBARCH)" = "x86" diff --git a/include/linux/find.h b/include/linux/find.h index c5410c243e04..ea57f7f38c49 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -101,8 +101,6 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, } #endif -#ifdef CONFIG_GENERIC_FIND_FIRST_BIT - #ifndef find_first_bit /** * find_first_bit - find the first set bit in a memory region @@ -147,17 +145,6 @@ unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) } #endif -#else /* CONFIG_GENERIC_FIND_FIRST_BIT */ - -#ifndef find_first_bit -#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) -#endif -#ifndef find_first_zero_bit -#define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) -#endif - -#endif /* CONFIG_GENERIC_FIND_FIRST_BIT */ - #ifndef find_last_bit /** * find_last_bit - find the last set bit in a memory region diff --git a/lib/Kconfig b/lib/Kconfig index 5e7165e6a346..6a6ae5312fa0 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -65,9 +65,6 @@ config GENERIC_STRNLEN_USER config GENERIC_NET_UTILS bool -config GENERIC_FIND_FIRST_BIT - bool - source "lib/math/Kconfig" config NO_GENERIC_PCI_IOPORT_MAP -- cgit v1.2.3-58-ga151 From f68edc9297bf3f7c94abb54b9b0b053607f7587b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:01 -0700 Subject: lib: add find_first_and_bit() Currently find_first_and_bit() is an alias to find_next_and_bit(). However, it is widely used in cpumask, so it worth to optimize it. This patch adds its own implementation for find_first_and_bit(). On x86_64 find_bit_benchmark says: Before (#define find_first_and_bit(...) find_next_and_bit(..., 0): Start testing find_bit() with random-filled bitmap [ 140.291468] find_first_and_bit: 46890919 ns, 32671 iterations Start testing find_bit() with sparse bitmap [ 140.295028] find_first_and_bit: 7103 ns, 1 iterations After: Start testing find_bit() with random-filled bitmap [ 162.574907] find_first_and_bit: 25045813 ns, 32846 iterations Start testing find_bit() with sparse bitmap [ 162.578458] find_first_and_bit: 4900 ns, 1 iterations (Thanks to Alexey Klimov for thorough testing.) Signed-off-by: Yury Norov Tested-by: Wolfram Sang Tested-by: Alexey Klimov --- include/linux/find.h | 27 +++++++++++++++++++++++++++ lib/find_bit.c | 21 +++++++++++++++++++++ lib/find_bit_benchmark.c | 21 +++++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/include/linux/find.h b/include/linux/find.h index ea57f7f38c49..6048f8c97418 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -12,6 +12,8 @@ extern unsigned long _find_next_bit(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, unsigned long start, unsigned long invert, unsigned long le); extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size); extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); @@ -123,6 +125,31 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size) } #endif +#ifndef find_first_and_bit +/** + * find_first_and_bit - find the first set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_and_bit(addr1, addr2, size); +} +#endif + #ifndef find_first_zero_bit /** * find_first_zero_bit - find the first cleared bit in a memory region diff --git a/lib/find_bit.c b/lib/find_bit.c index 0f8e2e369b1d..1b8e4b2a9cba 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -89,6 +89,27 @@ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) EXPORT_SYMBOL(_find_first_bit); #endif +#ifndef find_first_and_bit +/* + * Find the first set bit in two memory regions. + */ +unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + unsigned long idx, val; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + val = addr1[idx] & addr2[idx]; + if (val) + return min(idx * BITS_PER_LONG + __ffs(val), size); + } + + return size; +} +EXPORT_SYMBOL(_find_first_and_bit); +#endif + #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. diff --git a/lib/find_bit_benchmark.c b/lib/find_bit_benchmark.c index 5637c5711db9..db904b57d4b8 100644 --- a/lib/find_bit_benchmark.c +++ b/lib/find_bit_benchmark.c @@ -49,6 +49,25 @@ static int __init test_find_first_bit(void *bitmap, unsigned long len) return 0; } +static int __init test_find_first_and_bit(void *bitmap, const void *bitmap2, unsigned long len) +{ + static DECLARE_BITMAP(cp, BITMAP_LEN) __initdata; + unsigned long i, cnt; + ktime_t time; + + bitmap_copy(cp, bitmap, BITMAP_LEN); + + time = ktime_get(); + for (cnt = i = 0; i < len; cnt++) { + i = find_first_and_bit(cp, bitmap2, len); + __clear_bit(i, cp); + } + time = ktime_get() - time; + pr_err("find_first_and_bit: %18llu ns, %6ld iterations\n", time, cnt); + + return 0; +} + static int __init test_find_next_bit(const void *bitmap, unsigned long len) { unsigned long i, cnt; @@ -129,6 +148,7 @@ static int __init find_bit_test(void) * traverse only part of bitmap to avoid soft lockup. */ test_find_first_bit(bitmap, BITMAP_LEN / 10); + test_find_first_and_bit(bitmap, bitmap2, BITMAP_LEN / 2); test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); pr_err("\nStart testing find_bit() with sparse bitmap\n"); @@ -145,6 +165,7 @@ static int __init find_bit_test(void) test_find_next_zero_bit(bitmap, BITMAP_LEN); test_find_last_bit(bitmap, BITMAP_LEN); test_find_first_bit(bitmap, BITMAP_LEN); + test_find_first_and_bit(bitmap, bitmap2, BITMAP_LEN); test_find_next_and_bit(bitmap, bitmap2, BITMAP_LEN); /* -- cgit v1.2.3-58-ga151 From 93ba139ba8190c33009c5353ca43c8519443f467 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:02 -0700 Subject: cpumask: use find_first_and_bit() Now we have an efficient implementation for find_first_and_bit(), so switch cpumask to use it where appropriate. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/cpumask.h | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1e7399fc69c0..c4e1b9ea0ba4 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -123,6 +123,12 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, + const struct cpumask *srcp2) +{ + return 0; +} + static inline unsigned int cpumask_last(const struct cpumask *srcp) { return 0; @@ -167,7 +173,7 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node) static inline int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { - return cpumask_next_and(-1, src1p, src2p); + return cpumask_first_and(src1p, src2p); } static inline int cpumask_any_distribute(const struct cpumask *srcp) @@ -195,6 +201,19 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 + * @src1p: the first input + * @src2p: the second input + * + * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). + */ +static inline +unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) +{ + return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), nr_cpumask_bits); +} + /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer @@ -585,15 +604,6 @@ static inline void cpumask_copy(struct cpumask *dstp, */ #define cpumask_any(srcp) cpumask_first(srcp) -/** - * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 - * @src1p: the first input - * @src2p: the second input - * - * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). - */ -#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p)) - /** * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 * @mask1: the first input cpumask -- cgit v1.2.3-58-ga151 From b5c7e7ec7d3418af2544452b45cc67297c857a86 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:03 -0700 Subject: all: replace find_next{,_zero}_bit with find_first{,_zero}_bit where appropriate find_first{,_zero}_bit is a more effective analogue of 'next' version if start == 0. This patch replaces 'next' with 'first' where things look trivial. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- arch/powerpc/platforms/pasemi/dma_lib.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/dma/ti/edma.c | 2 +- drivers/iio/adc/ad7124.c | 2 +- drivers/infiniband/hw/irdma/hw.c | 16 ++++++++-------- drivers/media/cec/core/cec-core.c | 2 +- drivers/media/mc/mc-devnode.c | 2 +- drivers/pci/controller/dwc/pci-dra7xx.c | 2 +- drivers/scsi/lpfc/lpfc_sli.c | 10 +++++----- drivers/soc/ti/k3-ringacc.c | 4 ++-- drivers/tty/n_tty.c | 2 +- drivers/virt/acrn/ioreq.c | 3 +-- fs/f2fs/segment.c | 8 ++++---- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/dlm/dlmdomain.c | 4 ++-- fs/ocfs2/dlm/dlmmaster.c | 18 +++++++++--------- fs/ocfs2/dlm/dlmrecovery.c | 2 +- fs/ocfs2/dlm/dlmthread.c | 2 +- lib/genalloc.c | 2 +- net/ncsi/ncsi-manage.c | 4 ++-- 21 files changed, 47 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c index 270fa3c0d372..26427311fc72 100644 --- a/arch/powerpc/platforms/pasemi/dma_lib.c +++ b/arch/powerpc/platforms/pasemi/dma_lib.c @@ -375,7 +375,7 @@ int pasemi_dma_alloc_flag(void) int bit; retry: - bit = find_next_bit(flags_free, MAX_FLAGS, 0); + bit = find_first_bit(flags_free, MAX_FLAGS); if (bit >= MAX_FLAGS) return -ENOSPC; if (!test_and_clear_bit(bit, flags_free)) @@ -440,7 +440,7 @@ int pasemi_dma_alloc_fun(void) int bit; retry: - bit = find_next_bit(fun_free, MAX_FLAGS, 0); + bit = find_first_bit(fun_free, MAX_FLAGS); if (bit >= MAX_FLAGS) return -ENOSPC; if (!test_and_clear_bit(bit, fun_free)) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 14a18ba5ff2c..9aba96d621b9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2021,7 +2021,7 @@ static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, while ((slotidx > 0) && (ofs >= ms->npages)) { slotidx--; ms = slots->memslots + slotidx; - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0); + ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); } return ms->base_gfn + ofs; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 2df0657cdf00..cef1058ec2fd 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -196,7 +196,7 @@ rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu) return per_cpu_ptr(sess->cpu_queues, bit); } else if (cpu != 0) { /* Search from 0 to cpu */ - bit = find_next_bit(sess->cpu_queues_bm, cpu, 0); + bit = find_first_bit(sess->cpu_queues_bm, cpu); if (bit < cpu) return per_cpu_ptr(sess->cpu_queues, bit); } diff --git a/drivers/dma/ti/edma.c b/drivers/dma/ti/edma.c index 35d81bd857f1..caa4050ecc02 100644 --- a/drivers/dma/ti/edma.c +++ b/drivers/dma/ti/edma.c @@ -1681,7 +1681,7 @@ static irqreturn_t dma_ccerr_handler(int irq, void *data) dev_dbg(ecc->dev, "EMR%d 0x%08x\n", j, val); emr = val; - for (i = find_next_bit(&emr, 32, 0); i < 32; + for (i = find_first_bit(&emr, 32); i < 32; i = find_next_bit(&emr, 32, i + 1)) { int k = (j << 5) + i; diff --git a/drivers/iio/adc/ad7124.c b/drivers/iio/adc/ad7124.c index e45c600fccc0..bc2cfa5f9592 100644 --- a/drivers/iio/adc/ad7124.c +++ b/drivers/iio/adc/ad7124.c @@ -347,7 +347,7 @@ static int ad7124_find_free_config_slot(struct ad7124_state *st) { unsigned int free_cfg_slot; - free_cfg_slot = find_next_zero_bit(&st->cfg_slots_status, AD7124_MAX_CONFIGS, 0); + free_cfg_slot = find_first_zero_bit(&st->cfg_slots_status, AD7124_MAX_CONFIGS); if (free_cfg_slot == AD7124_MAX_CONFIGS) return -1; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index b4c657f5f2f9..e2dbd6b1ffcd 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -1709,14 +1709,14 @@ clean_msixtbl: */ static void irdma_get_used_rsrc(struct irdma_device *iwdev) { - iwdev->rf->used_pds = find_next_zero_bit(iwdev->rf->allocated_pds, - iwdev->rf->max_pd, 0); - iwdev->rf->used_qps = find_next_zero_bit(iwdev->rf->allocated_qps, - iwdev->rf->max_qp, 0); - iwdev->rf->used_cqs = find_next_zero_bit(iwdev->rf->allocated_cqs, - iwdev->rf->max_cq, 0); - iwdev->rf->used_mrs = find_next_zero_bit(iwdev->rf->allocated_mrs, - iwdev->rf->max_mr, 0); + iwdev->rf->used_pds = find_first_zero_bit(iwdev->rf->allocated_pds, + iwdev->rf->max_pd); + iwdev->rf->used_qps = find_first_zero_bit(iwdev->rf->allocated_qps, + iwdev->rf->max_qp); + iwdev->rf->used_cqs = find_first_zero_bit(iwdev->rf->allocated_cqs, + iwdev->rf->max_cq); + iwdev->rf->used_mrs = find_first_zero_bit(iwdev->rf->allocated_mrs, + iwdev->rf->max_mr); } void irdma_ctrl_deinit_hw(struct irdma_pci_f *rf) diff --git a/drivers/media/cec/core/cec-core.c b/drivers/media/cec/core/cec-core.c index 551689d371a7..7322e7cd9753 100644 --- a/drivers/media/cec/core/cec-core.c +++ b/drivers/media/cec/core/cec-core.c @@ -106,7 +106,7 @@ static int __must_check cec_devnode_register(struct cec_devnode *devnode, /* Part 1: Find a free minor number */ mutex_lock(&cec_devnode_lock); - minor = find_next_zero_bit(cec_devnode_nums, CEC_NUM_DEVICES, 0); + minor = find_first_zero_bit(cec_devnode_nums, CEC_NUM_DEVICES); if (minor == CEC_NUM_DEVICES) { mutex_unlock(&cec_devnode_lock); pr_err("could not get a free minor\n"); diff --git a/drivers/media/mc/mc-devnode.c b/drivers/media/mc/mc-devnode.c index f11382afe23b..680fbb3a9340 100644 --- a/drivers/media/mc/mc-devnode.c +++ b/drivers/media/mc/mc-devnode.c @@ -217,7 +217,7 @@ int __must_check media_devnode_register(struct media_device *mdev, /* Part 1: Find a free minor number */ mutex_lock(&media_devnode_lock); - minor = find_next_zero_bit(media_devnode_nums, MEDIA_NUM_DEVICES, 0); + minor = find_first_zero_bit(media_devnode_nums, MEDIA_NUM_DEVICES); if (minor == MEDIA_NUM_DEVICES) { mutex_unlock(&media_devnode_lock); pr_err("could not get a free minor\n"); diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c index a4221f6f3629..279a6fa56584 100644 --- a/drivers/pci/controller/dwc/pci-dra7xx.c +++ b/drivers/pci/controller/dwc/pci-dra7xx.c @@ -213,7 +213,7 @@ static int dra7xx_pcie_handle_msi(struct pcie_port *pp, int index) if (!val) return 0; - pos = find_next_bit(&val, MAX_MSI_IRQS_PER_CTRL, 0); + pos = find_first_bit(&val, MAX_MSI_IRQS_PER_CTRL); while (pos != MAX_MSI_IRQS_PER_CTRL) { generic_handle_domain_irq(pp->irq_domain, (index * MAX_MSI_IRQS_PER_CTRL) + pos); diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c index 5dedb3de271d..77dfe293bf23 100644 --- a/drivers/scsi/lpfc/lpfc_sli.c +++ b/drivers/scsi/lpfc/lpfc_sli.c @@ -17990,8 +17990,8 @@ lpfc_sli4_alloc_xri(struct lpfc_hba *phba) * the driver starts at 0 each time. */ spin_lock_irq(&phba->hbalock); - xri = find_next_zero_bit(phba->sli4_hba.xri_bmask, - phba->sli4_hba.max_cfg_param.max_xri, 0); + xri = find_first_zero_bit(phba->sli4_hba.xri_bmask, + phba->sli4_hba.max_cfg_param.max_xri); if (xri >= phba->sli4_hba.max_cfg_param.max_xri) { spin_unlock_irq(&phba->hbalock); return NO_XRI; @@ -19668,7 +19668,7 @@ lpfc_sli4_alloc_rpi(struct lpfc_hba *phba) max_rpi = phba->sli4_hba.max_cfg_param.max_rpi; rpi_limit = phba->sli4_hba.next_rpi; - rpi = find_next_zero_bit(phba->sli4_hba.rpi_bmask, rpi_limit, 0); + rpi = find_first_zero_bit(phba->sli4_hba.rpi_bmask, rpi_limit); if (rpi >= rpi_limit) rpi = LPFC_RPI_ALLOC_ERROR; else { @@ -20311,8 +20311,8 @@ next_priority: * have been tested so that we can detect when we should * change the priority level. */ - next_fcf_index = find_next_bit(phba->fcf.fcf_rr_bmask, - LPFC_SLI4_FCF_TBL_INDX_MAX, 0); + next_fcf_index = find_first_bit(phba->fcf.fcf_rr_bmask, + LPFC_SLI4_FCF_TBL_INDX_MAX); } diff --git a/drivers/soc/ti/k3-ringacc.c b/drivers/soc/ti/k3-ringacc.c index 312ba0f98ad7..573be88f8191 100644 --- a/drivers/soc/ti/k3-ringacc.c +++ b/drivers/soc/ti/k3-ringacc.c @@ -358,8 +358,8 @@ struct k3_ring *k3_ringacc_request_ring(struct k3_ringacc *ringacc, goto out; if (flags & K3_RINGACC_RING_USE_PROXY) { - proxy_id = find_next_zero_bit(ringacc->proxy_inuse, - ringacc->num_proxies, 0); + proxy_id = find_first_zero_bit(ringacc->proxy_inuse, + ringacc->num_proxies); if (proxy_id == ringacc->num_proxies) goto error; } diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 5be6d02dc690..9fc2319a394d 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -1975,7 +1975,7 @@ static bool canon_copy_from_read_buf(struct tty_struct *tty, more = n - (size - tail); if (eol == N_TTY_BUF_SIZE && more) { /* scan wrapped without finding set bit */ - eol = find_next_bit(ldata->read_flags, more, 0); + eol = find_first_bit(ldata->read_flags, more); found = eol != more; } else found = eol != size; diff --git a/drivers/virt/acrn/ioreq.c b/drivers/virt/acrn/ioreq.c index 80b2e3f0e276..5ff1c53740c0 100644 --- a/drivers/virt/acrn/ioreq.c +++ b/drivers/virt/acrn/ioreq.c @@ -246,8 +246,7 @@ void acrn_ioreq_request_clear(struct acrn_vm *vm) spin_lock_bh(&vm->ioreq_clients_lock); client = vm->default_client; if (client) { - vcpu = find_next_bit(client->ioreqs_map, - ACRN_IO_REQUEST_MAX, 0); + vcpu = find_first_bit(client->ioreqs_map, ACRN_IO_REQUEST_MAX); while (vcpu < ACRN_IO_REQUEST_MAX) { acrn_ioreq_complete_request(client, vcpu, NULL); vcpu = find_next_bit(client->ioreqs_map, diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index df9ed75f0b7a..913552c98171 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2558,8 +2558,8 @@ find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); if (secno >= MAIN_SECS(sbi)) { if (dir == ALLOC_RIGHT) { - secno = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + secno = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); } else { go_left = 1; @@ -2574,8 +2574,8 @@ find_other_zone: left_start--; continue; } - left_start = find_next_zero_bit(free_i->free_secmap, - MAIN_SECS(sbi), 0); + left_start = find_first_zero_bit(free_i->free_secmap, + MAIN_SECS(sbi)); f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); break; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f89ffcbd585f..a17be1618bf7 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -379,7 +379,7 @@ static void o2hb_nego_timeout(struct work_struct *work) o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); /* lowest node as master node to make negotiate decision. */ - master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0); + master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); if (master_node == o2nm_this_node()) { if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9f90fc9551e1..c4eccd499db8 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1045,7 +1045,7 @@ static int dlm_send_regions(struct dlm_ctxt *dlm, unsigned long *node_map) int status, ret = 0, i; char *p; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qr = kzalloc(sizeof(struct dlm_query_region), GFP_KERNEL); @@ -1217,7 +1217,7 @@ static int dlm_send_nodeinfo(struct dlm_ctxt *dlm, unsigned long *node_map) struct o2nm_node *node; int ret = 0, status, count, i; - if (find_next_bit(node_map, O2NM_MAX_NODES, 0) >= O2NM_MAX_NODES) + if (find_first_bit(node_map, O2NM_MAX_NODES) >= O2NM_MAX_NODES) goto bail; qn = kzalloc(sizeof(struct dlm_query_nodeinfo), GFP_KERNEL); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9b88219febb5..227da5b1b6ab 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -861,7 +861,7 @@ lookup: * to see if there are any nodes that still need to be * considered. these will not appear in the mle nodemap * but they might own this lockres. wait on them. */ - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -912,7 +912,7 @@ redo_request: dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); - bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) { mlog(0, "%s: res %.*s, At least one node (%d) " "to recover before lock mastery can begin\n", @@ -1079,7 +1079,7 @@ recheck: sleep = 1; /* have all nodes responded? */ if (voting_done && !*blocked) { - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (dlm->node_num <= bit) { /* my node number is lowest. * now tell other nodes that I am @@ -1234,8 +1234,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, } else { mlog(ML_ERROR, "node down! %d\n", node); if (blocked) { - int lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, 0); + int lowest = find_first_bit(mle->maybe_map, + O2NM_MAX_NODES); /* act like it was never there */ clear_bit(node, mle->maybe_map); @@ -1795,7 +1795,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, "MLE for it! (%.*s)\n", assert->node_idx, namelen, name); } else { - int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + int bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ @@ -2521,7 +2521,7 @@ static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm, } if (!nonlocal) { - node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + node_ref = find_first_bit(res->refmap, O2NM_MAX_NODES); if (node_ref >= O2NM_MAX_NODES) return 0; } @@ -3303,7 +3303,7 @@ static void dlm_clean_block_mle(struct dlm_ctxt *dlm, BUG_ON(mle->type != DLM_MLE_BLOCK); spin_lock(&mle->spinlock); - bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(mle->maybe_map, O2NM_MAX_NODES); if (bit != dead_node) { mlog(0, "mle found, but dead node %u would not have been " "master\n", dead_node); @@ -3542,7 +3542,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) spin_lock(&dlm->master_lock); BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING); - BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES)); + BUG_ON((find_first_bit(dlm->domain_map, O2NM_MAX_NODES) < O2NM_MAX_NODES)); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 5cd5f7511dac..52ad342fec3e 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -451,7 +451,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { int bit; - bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); + bit = find_first_bit(dlm->recovery_map, O2NM_MAX_NODES); if (bit >= O2NM_MAX_NODES || bit < 0) dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index c350bd4df770..eedf07ca23ca 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -92,7 +92,7 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res) return 0; /* Another node has this resource with this node as the master */ - bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + bit = find_first_bit(res->refmap, O2NM_MAX_NODES); if (bit < O2NM_MAX_NODES) return 0; diff --git a/lib/genalloc.c b/lib/genalloc.c index 9a57257988c7..00fc50d0a640 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -251,7 +251,7 @@ void gen_pool_destroy(struct gen_pool *pool) list_del(&chunk->next_chunk); end_bit = chunk_size(chunk) >> order; - bit = find_next_bit(chunk->bits, end_bit, 0); + bit = find_first_bit(chunk->bits, end_bit); BUG_ON(bit < end_bit); vfree(chunk); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 7121ce2a47c0..78814417d753 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -608,7 +608,7 @@ static int clear_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, bitmap = &ncf->bitmap; spin_lock_irqsave(&nc->lock, flags); - index = find_next_bit(bitmap, ncf->n_vids, 0); + index = find_first_bit(bitmap, ncf->n_vids); if (index >= ncf->n_vids) { spin_unlock_irqrestore(&nc->lock, flags); return -1; @@ -667,7 +667,7 @@ static int set_one_vid(struct ncsi_dev_priv *ndp, struct ncsi_channel *nc, return -1; } - index = find_next_zero_bit(bitmap, ncf->n_vids, 0); + index = find_first_zero_bit(bitmap, ncf->n_vids); if (index < 0 || index >= ncf->n_vids) { netdev_err(ndp->ndev.dev, "Channel %u already has all VLAN filters set\n", -- cgit v1.2.3-58-ga151 From 4ade0818cf048bb166e875ed4f8b456e6c2c7b3c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:04 -0700 Subject: tools: sync tools/bitmap with mother linux Remove tools/include/asm-generic/bitops/find.h and copy include/linux/bitmap.h to tools. find_*_le() functions are not copied because not needed in tools. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- MAINTAINERS | 2 +- tools/include/asm-generic/bitops.h | 1 - tools/include/asm-generic/bitops/find.h | 145 ---------------------- tools/include/linux/bitmap.h | 7 +- tools/include/linux/find.h | 214 ++++++++++++++++++++++++++++++++ tools/lib/find_bit.c | 20 +++ 6 files changed, 239 insertions(+), 150 deletions(-) delete mode 100644 tools/include/asm-generic/bitops/find.h create mode 100644 tools/include/linux/find.h diff --git a/MAINTAINERS b/MAINTAINERS index 4646786f5302..092b1c6fd2f1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3365,8 +3365,8 @@ F: lib/bitmap.c F: lib/find_bit.c F: lib/find_bit_benchmark.c F: lib/test_bitmap.c -F: tools/include/asm-generic/bitops/find.h F: tools/include/linux/bitmap.h +F: tools/include/linux/find.h F: tools/lib/bitmap.c F: tools/lib/find_bit.c diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h index 5d2ab38965cc..9ab313e93555 100644 --- a/tools/include/asm-generic/bitops.h +++ b/tools/include/asm-generic/bitops.h @@ -18,7 +18,6 @@ #include #include #include -#include #ifndef _TOOLS_LINUX_BITOPS_H_ #error only can be included directly diff --git a/tools/include/asm-generic/bitops/find.h b/tools/include/asm-generic/bitops/find.h deleted file mode 100644 index 6481fd11012a..000000000000 --- a/tools/include/asm-generic/bitops/find.h +++ /dev/null @@ -1,145 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ -#define _TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ - -extern unsigned long _find_next_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long nbits, - unsigned long start, unsigned long invert, unsigned long le); -extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); -extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); -extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); - -#ifndef find_next_bit -/** - * find_next_bit - find the next set bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The bitmap size in bits - * - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ -static inline -unsigned long find_next_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val; - - if (unlikely(offset >= size)) - return size; - - val = *addr & GENMASK(size - 1, offset); - return val ? __ffs(val) : size; - } - - return _find_next_bit(addr, NULL, size, offset, 0UL, 0); -} -#endif - -#ifndef find_next_and_bit -/** - * find_next_and_bit - find the next set bit in both memory regions - * @addr1: The first address to base the search on - * @addr2: The second address to base the search on - * @offset: The bitnumber to start searching at - * @size: The bitmap size in bits - * - * Returns the bit number for the next set bit - * If no bits are set, returns @size. - */ -static inline -unsigned long find_next_and_bit(const unsigned long *addr1, - const unsigned long *addr2, unsigned long size, - unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val; - - if (unlikely(offset >= size)) - return size; - - val = *addr1 & *addr2 & GENMASK(size - 1, offset); - return val ? __ffs(val) : size; - } - - return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); -} -#endif - -#ifndef find_next_zero_bit -/** - * find_next_zero_bit - find the next cleared bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The bitmap size in bits - * - * Returns the bit number of the next zero bit - * If no bits are zero, returns @size. - */ -static inline -unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, - unsigned long offset) -{ - if (small_const_nbits(size)) { - unsigned long val; - - if (unlikely(offset >= size)) - return size; - - val = *addr | ~GENMASK(size - 1, offset); - return val == ~0UL ? size : ffz(val); - } - - return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); -} -#endif - -#ifndef find_first_bit - -/** - * find_first_bit - find the first set bit in a memory region - * @addr: The address to start the search at - * @size: The maximum number of bits to search - * - * Returns the bit number of the first set bit. - * If no bits are set, returns @size. - */ -static inline -unsigned long find_first_bit(const unsigned long *addr, unsigned long size) -{ - if (small_const_nbits(size)) { - unsigned long val = *addr & GENMASK(size - 1, 0); - - return val ? __ffs(val) : size; - } - - return _find_first_bit(addr, size); -} - -#endif /* find_first_bit */ - -#ifndef find_first_zero_bit - -/** - * find_first_zero_bit - find the first cleared bit in a memory region - * @addr: The address to start the search at - * @size: The maximum number of bits to search - * - * Returns the bit number of the first cleared bit. - * If no bits are zero, returns @size. - */ -static inline -unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) -{ - if (small_const_nbits(size)) { - unsigned long val = *addr | ~GENMASK(size - 1, 0); - - return val == ~0UL ? size : ffz(val); - } - - return _find_first_zero_bit(addr, size); -} -#endif - -#endif /*_TOOLS_LINUX_ASM_GENERIC_BITOPS_FIND_H_ */ diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 95611df1d26e..ea97804d04d4 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -1,9 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PERF_BITOPS_H -#define _PERF_BITOPS_H +#ifndef _TOOLS_LINUX_BITMAP_H +#define _TOOLS_LINUX_BITMAP_H #include #include +#include #include #include @@ -181,4 +182,4 @@ static inline int bitmap_intersects(const unsigned long *src1, return __bitmap_intersects(src1, src2, nbits); } -#endif /* _PERF_BITOPS_H */ +#endif /* _TOOLS_LINUX_BITMAP_H */ diff --git a/tools/include/linux/find.h b/tools/include/linux/find.h new file mode 100644 index 000000000000..47e2bd6c5174 --- /dev/null +++ b/tools/include/linux/find.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_LINUX_FIND_H_ +#define _TOOLS_LINUX_FIND_H_ + +#ifndef _TOOLS_LINUX_BITMAP_H +#error tools: only can be included directly +#endif + +#include + +extern unsigned long _find_next_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long nbits, + unsigned long start, unsigned long invert, unsigned long le); +extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size); +extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size); +extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size); + +#ifndef find_next_bit +/** + * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr, NULL, size, offset, 0UL, 0); +} +#endif + +#ifndef find_next_and_bit +/** + * find_next_and_bit - find the next set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_next_and_bit(const unsigned long *addr1, + const unsigned long *addr2, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr1 & *addr2 & GENMASK(size - 1, offset); + return val ? __ffs(val) : size; + } + + return _find_next_bit(addr1, addr2, size, offset, 0UL, 0); +} +#endif + +#ifndef find_next_zero_bit +/** + * find_next_zero_bit - find the next cleared bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The bitmap size in bits + * + * Returns the bit number of the next zero bit + * If no bits are zero, returns @size. + */ +static inline +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + if (small_const_nbits(size)) { + unsigned long val; + + if (unlikely(offset >= size)) + return size; + + val = *addr | ~GENMASK(size - 1, offset); + return val == ~0UL ? size : ffz(val); + } + + return _find_next_bit(addr, NULL, size, offset, ~0UL, 0); +} +#endif + +#ifndef find_first_bit +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum number of bits to search + * + * Returns the bit number of the first set bit. + * If no bits are set, returns @size. + */ +static inline +unsigned long find_first_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_bit(addr, size); +} +#endif + +#ifndef find_first_and_bit +/** + * find_first_and_bit - find the first set bit in both memory regions + * @addr1: The first address to base the search on + * @addr2: The second address to base the search on + * @size: The bitmap size in bits + * + * Returns the bit number for the next set bit + * If no bits are set, returns @size. + */ +static inline +unsigned long find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr1 & *addr2 & GENMASK(size - 1, 0); + + return val ? __ffs(val) : size; + } + + return _find_first_and_bit(addr1, addr2, size); +} +#endif + +#ifndef find_first_zero_bit +/** + * find_first_zero_bit - find the first cleared bit in a memory region + * @addr: The address to start the search at + * @size: The maximum number of bits to search + * + * Returns the bit number of the first cleared bit. + * If no bits are zero, returns @size. + */ +static inline +unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr | ~GENMASK(size - 1, 0); + + return val == ~0UL ? size : ffz(val); + } + + return _find_first_zero_bit(addr, size); +} +#endif + +#ifndef find_last_bit +/** + * find_last_bit - find the last set bit in a memory region + * @addr: The address to start the search at + * @size: The number of bits to search + * + * Returns the bit number of the last set bit, or size. + */ +static inline +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + if (small_const_nbits(size)) { + unsigned long val = *addr & GENMASK(size - 1, 0); + + return val ? __fls(val) : size; + } + + return _find_last_bit(addr, size); +} +#endif + +/** + * find_next_clump8 - find next 8-bit clump with set bits in a memory region + * @clump: location to store copy of found clump + * @addr: address to base the search on + * @size: bitmap size in number of bits + * @offset: bit offset at which to start searching + * + * Returns the bit offset for the next set clump; the found clump value is + * copied to the location pointed by @clump. If no bits are set, returns @size. + */ +extern unsigned long find_next_clump8(unsigned long *clump, + const unsigned long *addr, + unsigned long size, unsigned long offset); + +#define find_first_clump8(clump, bits, size) \ + find_next_clump8((clump), (bits), (size), 0) + + +#endif /*__LINUX_FIND_H_ */ diff --git a/tools/lib/find_bit.c b/tools/lib/find_bit.c index 109aa7ffcf97..ba4b8d94e004 100644 --- a/tools/lib/find_bit.c +++ b/tools/lib/find_bit.c @@ -96,6 +96,26 @@ unsigned long _find_first_bit(const unsigned long *addr, unsigned long size) } #endif +#ifndef find_first_and_bit +/* + * Find the first set bit in two memory regions. + */ +unsigned long _find_first_and_bit(const unsigned long *addr1, + const unsigned long *addr2, + unsigned long size) +{ + unsigned long idx, val; + + for (idx = 0; idx * BITS_PER_LONG < size; idx++) { + val = addr1[idx] & addr2[idx]; + if (val) + return min(idx * BITS_PER_LONG + __ffs(val), size); + } + + return size; +} +#endif + #ifndef find_first_zero_bit /* * Find the first cleared bit in a memory region. -- cgit v1.2.3-58-ga151 From 9b51d9d866482a703646fd4c07e433c3d9d88efd Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:05 -0700 Subject: cpumask: replace cpumask_next_* with cpumask_first_* where appropriate cpumask_first() is a more effective analogue of 'next' version if n == -1 (which means start == 0). This patch replaces 'next' with 'first' where things look trivial. There's no cpumask_first_zero() function, so create it. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- block/blk-mq.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/soc/fsl/qbman/bman_portal.c | 2 +- drivers/soc/fsl/qbman/qman_portal.c | 2 +- include/linux/cpumask.h | 16 ++++++++++++++++ kernel/time/clocksource.c | 4 ++-- 6 files changed, 22 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8874a63ae952..ef56e753ab38 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2967,7 +2967,7 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, struct blk_mq_hw_ctx *hctx) { - if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) return false; if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) return false; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b107835242ad..8c70ab3c436a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2101,7 +2101,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi) stragglers = num_cpu >= vi->curr_queue_pairs ? num_cpu % vi->curr_queue_pairs : 0; - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); for (i = 0; i < vi->curr_queue_pairs; i++) { group_size = stride + (i < stragglers ? 1 : 0); diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c index acda8a5637c5..4d7b9caee1c4 100644 --- a/drivers/soc/fsl/qbman/bman_portal.c +++ b/drivers/soc/fsl/qbman/bman_portal.c @@ -155,7 +155,7 @@ static int bman_portal_probe(struct platform_device *pdev) } spin_lock(&bman_lock); - cpu = cpumask_next_zero(-1, &portal_cpus); + cpu = cpumask_first_zero(&portal_cpus); if (cpu >= nr_cpu_ids) { __bman_portals_probed = 1; /* unassigned portal, skip init */ diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c index 96f74a1dc603..e23b60618c1a 100644 --- a/drivers/soc/fsl/qbman/qman_portal.c +++ b/drivers/soc/fsl/qbman/qman_portal.c @@ -248,7 +248,7 @@ static int qman_portal_probe(struct platform_device *pdev) pcfg->pools = qm_get_pools_sdqcr(); spin_lock(&qman_lock); - cpu = cpumask_next_zero(-1, &portal_cpus); + cpu = cpumask_first_zero(&portal_cpus); if (cpu >= nr_cpu_ids) { __qman_portals_probed = 1; /* unassigned portal, skip init */ diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c4e1b9ea0ba4..64dae70d31f5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -123,6 +123,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return 0; +} + static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -201,6 +206,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_first_zero - get the first unset cpu in a cpumask + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if all cpus are set. + */ +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); +} + /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @src1p: the first input diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b8a14d2fb5ba..f29d1a524fa5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -257,7 +257,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (cpu == smp_processor_id()) cpu = cpumask_next(cpu, cpu_online_mask); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) @@ -279,7 +279,7 @@ static void clocksource_verify_choose_cpus(void) cpu = prandom_u32() % nr_cpu_ids; cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } -- cgit v1.2.3-58-ga151 From bc9d6635c293a2ac30c6319f7cfd08860ab7948a Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:06 -0700 Subject: include/linux: move for_each_bit() macros from bitops.h to find.h for_each_bit() macros depend on find_bit() machinery, and so the proper place for them is the find.h header. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/bitops.h | 34 ---------------------------------- include/linux/find.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 5e62e2383b7f..7aaed501f768 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -32,40 +32,6 @@ extern unsigned long __sw_hweight64(__u64 w); */ #include -#define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -/* same as for_each_set_bit() but use bit as value to start with */ -#define for_each_set_bit_from(bit, addr, size) \ - for ((bit) = find_next_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_bit((addr), (size), (bit) + 1)) - -#define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/* same as for_each_clear_bit() but use bit as value to start with */ -#define for_each_clear_bit_from(bit, addr, size) \ - for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ - (bit) < (size); \ - (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) - -/** - * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits - * @start: bit offset to start search and to store the current iteration offset - * @clump: location to store copy of current 8-bit clump - * @bits: bitmap address to base the search on - * @size: bitmap size in number of bits - */ -#define for_each_set_clump8(start, clump, bits, size) \ - for ((start) = find_first_clump8(&(clump), (bits), (size)); \ - (start) < (size); \ - (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) - static inline int get_bitmask_order(unsigned int count) { int order; diff --git a/include/linux/find.h b/include/linux/find.h index 6048f8c97418..4500e8ab93e2 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -279,4 +279,38 @@ unsigned long find_next_bit_le(const void *addr, unsigned #error "Please fix " #endif +#define for_each_set_bit(bit, addr, size) \ + for ((bit) = find_first_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_from(bit, addr, size) \ + for ((bit) = find_next_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +#define for_each_clear_bit(bit, addr, size) \ + for ((bit) = find_first_zero_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/* same as for_each_clear_bit() but use bit as value to start with */ +#define for_each_clear_bit_from(bit, addr, size) \ + for ((bit) = find_next_zero_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) + +/** + * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits + * @start: bit offset to start search and to store the current iteration offset + * @clump: location to store copy of current 8-bit clump + * @bits: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_clump8(start, clump, bits, size) \ + for ((start) = find_first_clump8(&(clump), (bits), (size)); \ + (start) < (size); \ + (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8)) + #endif /*__LINUX_FIND_H_ */ -- cgit v1.2.3-58-ga151 From 7516be9931b8bc8bcaac8531f490b42ab11ded1e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:07 -0700 Subject: find: micro-optimize for_each_{set,clear}_bit() The macros iterate thru all set/clear bits in a bitmap. They search a first bit using find_first_bit(), and the rest bits using find_next_bit(). Since find_next_bit() is called shortly after find_first_bit(), we can save few lines of I-cache by not using find_first_bit(). Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- include/linux/find.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/find.h b/include/linux/find.h index 4500e8ab93e2..ae9ed52b52b8 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -280,7 +280,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned #endif #define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ + for ((bit) = find_next_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) @@ -291,7 +291,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) = find_next_bit((addr), (size), (bit) + 1)) #define for_each_clear_bit(bit, addr, size) \ - for ((bit) = find_first_zero_bit((addr), (size)); \ + for ((bit) = find_next_zero_bit((addr), (size), 0); \ (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) -- cgit v1.2.3-58-ga151 From 749443de8dde3b8b420ee8b4daac4d929a6adeb9 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:08 -0700 Subject: Replace for_each_*_bit_from() with for_each_*_bit() where appropriate A couple of kernel functions call for_each_*_bit_from() with start bit equal to 0. Replace them with for_each_*_bit(). No functional changes, but might improve on readability. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- arch/x86/kernel/apic/vector.c | 4 ++-- drivers/gpu/drm/etnaviv/etnaviv_gpu.c | 4 ++-- drivers/hwmon/ltc2992.c | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index c132daabe615..3e6f6b448f6a 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -760,9 +760,9 @@ void __init lapic_update_legacy_vectors(void) void __init lapic_assign_system_vectors(void) { - unsigned int i, vector = 0; + unsigned int i, vector; - for_each_set_bit_from(vector, system_vectors, NR_VECTORS) + for_each_set_bit(vector, system_vectors, NR_VECTORS) irq_matrix_assign_system(vector_matrix, vector, false); if (nr_legacy_irqs() > 1) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c index 242a5fd8b932..06bde46df451 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gpu.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gpu.c @@ -1047,7 +1047,7 @@ pm_put: void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) { - unsigned int i = 0; + unsigned int i; dev_err(gpu->dev, "recover hung GPU!\n"); @@ -1060,7 +1060,7 @@ void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu) /* complete all events, the GPU won't do it after the reset */ spin_lock(&gpu->event_spinlock); - for_each_set_bit_from(i, gpu->event_bitmap, ETNA_NR_EVENTS) + for_each_set_bit(i, gpu->event_bitmap, ETNA_NR_EVENTS) complete(&gpu->event_free); bitmap_zero(gpu->event_bitmap, ETNA_NR_EVENTS); spin_unlock(&gpu->event_spinlock); diff --git a/drivers/hwmon/ltc2992.c b/drivers/hwmon/ltc2992.c index 2a4bed0ab226..7352d2b3c756 100644 --- a/drivers/hwmon/ltc2992.c +++ b/drivers/hwmon/ltc2992.c @@ -248,8 +248,7 @@ static int ltc2992_gpio_get_multiple(struct gpio_chip *chip, unsigned long *mask gpio_status = reg; - gpio_nr = 0; - for_each_set_bit_from(gpio_nr, mask, LTC2992_GPIO_NR) { + for_each_set_bit(gpio_nr, mask, LTC2992_GPIO_NR) { if (test_bit(LTC2992_GPIO_BIT(gpio_nr), &gpio_status)) set_bit(gpio_nr, bits); } -- cgit v1.2.3-58-ga151 From 801a57365fc836d7ec866e2069d0b21d79925c1e Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:10 -0700 Subject: mm/percpu: micro-optimize pcpu_is_populated() bitmap_next_clear_region() calls find_next_zero_bit() and find_next_bit() sequentially to find a range of clear bits. In case of pcpu_is_populated() there's a chance to return earlier if bitmap has all bits set. Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Dennis Zhou --- mm/percpu.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index f5b2c2ea5a54..0c6f85c5514f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1070,17 +1070,18 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { - unsigned int page_start, page_end, rs, re; + unsigned int start, end; - page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); - page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); + start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); + end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - rs = page_start; - bitmap_next_clear_region(chunk->populated, &rs, &re, page_end); - if (rs >= page_end) + start = find_next_zero_bit(chunk->populated, end, start); + if (start >= end) return true; - *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; + end = find_next_bit(chunk->populated, end, start + 1); + + *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; return false; } -- cgit v1.2.3-58-ga151 From ec288a2cf7ca40a939316b6df206ab845bb112d1 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:11 -0700 Subject: bitmap: unify find_bit operations bitmap_for_each_{set,clear}_region() are similar to for_each_bit() macros in include/linux/find.h, but interface and implementation of them are different. This patch adds for_each_bitrange() macros and drops unused bitmap_*_region() API in sake of unification. Signed-off-by: Yury Norov Tested-by: Wolfram Sang Acked-by: Dennis Zhou Acked-by: Ulf Hansson # For MMC --- drivers/mmc/host/renesas_sdhi_core.c | 2 +- include/linux/bitmap.h | 33 --------------------- include/linux/find.h | 56 ++++++++++++++++++++++++++++++++++++ mm/percpu.c | 20 ++++++------- 4 files changed, 65 insertions(+), 46 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index f5b2684ad805..6de30a2719a3 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -628,7 +628,7 @@ static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) * is at least SH_MOBILE_SDHI_MIN_TAP_ROW probes long then use the * center index as the tap, otherwise bail out. */ - bitmap_for_each_set_region(bitmap, rs, re, 0, taps_size) { + for_each_set_bitrange(rs, re, bitmap, taps_size) { if (re - rs > tap_cnt) { tap_end = re; tap_start = rs; diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index ead4a150bd7f..7dba0847510c 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -55,12 +55,6 @@ struct device; * bitmap_clear(dst, pos, nbits) Clear specified bit area * bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area * bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off) as above - * bitmap_next_clear_region(map, &start, &end, nbits) Find next clear region - * bitmap_next_set_region(map, &start, &end, nbits) Find next set region - * bitmap_for_each_clear_region(map, rs, re, start, end) - * Iterate over all clear regions - * bitmap_for_each_set_region(map, rs, re, start, end) - * Iterate over all set regions * bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n * bitmap_shift_left(dst, src, n, nbits) *dst = *src << n * bitmap_cut(dst, src, first, n, nbits) Cut n bits from first, copy rest @@ -467,14 +461,6 @@ static inline void bitmap_replace(unsigned long *dst, __bitmap_replace(dst, old, new, mask, nbits); } -static inline void bitmap_next_clear_region(unsigned long *bitmap, - unsigned int *rs, unsigned int *re, - unsigned int end) -{ - *rs = find_next_zero_bit(bitmap, end, *rs); - *re = find_next_bit(bitmap, end, *rs + 1); -} - static inline void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) @@ -483,25 +469,6 @@ static inline void bitmap_next_set_region(unsigned long *bitmap, *re = find_next_zero_bit(bitmap, end, *rs + 1); } -/* - * Bitmap region iterators. Iterates over the bitmap between [@start, @end). - * @rs and @re should be integer variables and will be set to start and end - * index of the current clear or set region. - */ -#define bitmap_for_each_clear_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), \ - bitmap_next_clear_region((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, \ - bitmap_next_clear_region((bitmap), &(rs), &(re), (end))) - -#define bitmap_for_each_set_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), \ - bitmap_next_set_region((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, \ - bitmap_next_set_region((bitmap), &(rs), &(re), (end))) - /** * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap. * @n: u64 value diff --git a/include/linux/find.h b/include/linux/find.h index ae9ed52b52b8..5bb6db213bcb 100644 --- a/include/linux/find.h +++ b/include/linux/find.h @@ -301,6 +301,62 @@ unsigned long find_next_bit_le(const void *addr, unsigned (bit) < (size); \ (bit) = find_next_zero_bit((addr), (size), (bit) + 1)) +/** + * for_each_set_bitrange - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit) + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange(b, e, addr, size) \ + for ((b) = find_next_bit((addr), (size), 0), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_bit((addr), (size), (e) + 1), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1)) + +/** + * for_each_set_bitrange_from - iterate over all set bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit); must be initialized + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_set_bitrange_from(b, e, addr, size) \ + for ((b) = find_next_bit((addr), (size), (b)), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_bit((addr), (size), (e) + 1), \ + (e) = find_next_zero_bit((addr), (size), (b) + 1)) + +/** + * for_each_clear_bitrange - iterate over all unset bit ranges [b; e) + * @b: bit offset of start of current bitrange (first unset bit) + * @e: bit offset of end of current bitrange (first set bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_clear_bitrange(b, e, addr, size) \ + for ((b) = find_next_zero_bit((addr), (size), 0), \ + (e) = find_next_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_zero_bit((addr), (size), (e) + 1), \ + (e) = find_next_bit((addr), (size), (b) + 1)) + +/** + * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e) + * @b: bit offset of start of current bitrange (first set bit); must be initialized + * @e: bit offset of end of current bitrange (first unset bit) + * @addr: bitmap address to base the search on + * @size: bitmap size in number of bits + */ +#define for_each_clear_bitrange_from(b, e, addr, size) \ + for ((b) = find_next_zero_bit((addr), (size), (b)), \ + (e) = find_next_bit((addr), (size), (b) + 1); \ + (b) < (size); \ + (b) = find_next_zero_bit((addr), (size), (e) + 1), \ + (e) = find_next_bit((addr), (size), (b) + 1)) + /** * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits * @start: bit offset to start search and to store the current iteration offset diff --git a/mm/percpu.c b/mm/percpu.c index 0c6f85c5514f..293009cc03ef 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -779,7 +779,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - unsigned int rs, re, start; /* region start, region end */ + unsigned int start, end; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { @@ -795,9 +795,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) block->right_free = 0; /* iterate over free areas and update the contig hints */ - bitmap_for_each_clear_region(alloc_map, rs, re, start, - PCPU_BITMAP_BLOCK_BITS) - pcpu_block_update(block, rs, re); + for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS) + pcpu_block_update(block, start, end); } /** @@ -1852,13 +1851,12 @@ area_found: /* populate if not all pages are already there */ if (!is_atomic) { - unsigned int page_start, page_end, rs, re; + unsigned int page_end, rs, re; - page_start = PFN_DOWN(off); + rs = PFN_DOWN(off); page_end = PFN_UP(off + size); - bitmap_for_each_clear_region(chunk->populated, rs, re, - page_start, page_end) { + for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); @@ -2014,8 +2012,7 @@ static void pcpu_balance_free(bool empty_only) list_for_each_entry_safe(chunk, next, &to_free, list) { unsigned int rs, re; - bitmap_for_each_set_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -2085,8 +2082,7 @@ retry_pop: continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - bitmap_for_each_clear_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) { int nr = min_t(int, re - rs, nr_to_pop); spin_unlock_irq(&pcpu_lock); -- cgit v1.2.3-58-ga151 From db7313005e9c2d4e80888dd18d4a83926b920e8c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:12 -0700 Subject: lib: bitmap: add performance test for bitmap_print_to_pagebuf Functional tests for bitmap_print_to_pagebuf() are provided in lib/test_printf.c. This patch adds performance test for a case of fully set bitmap. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- lib/test_bitmap.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index d33fa5a61b95..0c82f07f74fc 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -446,6 +446,42 @@ static void __init test_bitmap_parselist(void) } } +static void __init test_bitmap_printlist(void) +{ + unsigned long *bmap = kmalloc(PAGE_SIZE, GFP_KERNEL); + char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + char expected[256]; + int ret, slen; + ktime_t time; + + if (!buf || !bmap) + goto out; + + memset(bmap, -1, PAGE_SIZE); + slen = snprintf(expected, 256, "0-%ld", PAGE_SIZE * 8 - 1); + if (slen < 0) + goto out; + + time = ktime_get(); + ret = bitmap_print_to_pagebuf(true, buf, bmap, PAGE_SIZE * 8); + time = ktime_get() - time; + + if (ret != slen + 1) { + pr_err("bitmap_print_to_pagebuf: result is %d, expected %d\n", ret, slen); + goto out; + } + + if (strncmp(buf, expected, slen)) { + pr_err("bitmap_print_to_pagebuf: result is %s, expected %s\n", buf, expected); + goto out; + } + + pr_err("bitmap_print_to_pagebuf: input is '%s', Time: %llu\n", buf, time); +out: + kfree(buf); + kfree(bmap); +} + static const unsigned long parse_test[] __initconst = { BITMAP_FROM_U64(0), BITMAP_FROM_U64(1), @@ -818,6 +854,7 @@ static void __init selftest(void) test_bitmap_arr32(); test_bitmap_parse(); test_bitmap_parselist(); + test_bitmap_printlist(); test_mem_optimisations(); test_for_each_set_clump8(); test_bitmap_cut(); -- cgit v1.2.3-58-ga151 From 15325b4f768f2b27b5765489eeab6ec0d6b5e902 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:13 -0700 Subject: vsprintf: rework bitmap_list_string bitmap_list_string() is very ineffective when printing bitmaps with long ranges of set bits because it calls find_next_bit for each bit in the bitmap. We can do better by detecting ranges of set bits. In my environment, before/after is 943008/31008 ns. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- lib/vsprintf.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 58d5e567f836..d4b4e481045b 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1241,20 +1241,13 @@ char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap, struct printf_spec spec, const char *fmt) { int nr_bits = max_t(int, spec.field_width, 0); - /* current bit is 'cur', most recently seen range is [rbot, rtop] */ - int cur, rbot, rtop; bool first = true; + int rbot, rtop; if (check_pointer(&buf, end, bitmap, spec)) return buf; - rbot = cur = find_first_bit(bitmap, nr_bits); - while (cur < nr_bits) { - rtop = cur; - cur = find_next_bit(bitmap, nr_bits, cur + 1); - if (cur < nr_bits && cur <= rtop + 1) - continue; - + for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) { if (!first) { if (buf < end) *buf = ','; @@ -1263,15 +1256,12 @@ char *bitmap_list_string(char *buf, char *end, unsigned long *bitmap, first = false; buf = number(buf, end, rbot, default_dec_spec); - if (rbot < rtop) { - if (buf < end) - *buf = '-'; - buf++; - - buf = number(buf, end, rtop, default_dec_spec); - } + if (rtop == rbot + 1) + continue; - rbot = cur; + if (buf < end) + *buf = '-'; + buf = number(++buf, end, rtop - 1, default_dec_spec); } return buf; } -- cgit v1.2.3-58-ga151 From 5762f980ca10dcfe5eead7c40d1c34cae61f409b Mon Sep 17 00:00:00 2001 From: Johannes Schickel Date: Sat, 15 Jan 2022 15:02:57 +0100 Subject: ALSA: usb-audio: add mapping for MSI MPG X570S Carbon Max Wifi. The USB audio device 0db0:419c based on the Realtek ALC4080 chip exposes all playback volume controls as "PCM". This is makes distinguishing the individual functions hard. The added mapping distinguishes all playback volume controls as their respective function: - Speaker - for back panel output - Frontpanel Headphone - for front panel output - IEC958 - for digital output on the back panel This clarifies the individual volume control functions for users. Signed-off-by: Johannes Schickel Link: https://lore.kernel.org/r/20220115140257.8751-1-lordhoto@gmail.com Signed-off-by: Takashi Iwai --- sound/usb/mixer_maps.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sound/usb/mixer_maps.c b/sound/usb/mixer_maps.c index 5d391f62351b..96991ddf5055 100644 --- a/sound/usb/mixer_maps.c +++ b/sound/usb/mixer_maps.c @@ -431,6 +431,14 @@ static const struct usbmix_name_map aorus_master_alc1220vb_map[] = { {} }; +/* MSI MPG X570S Carbon Max Wifi with ALC4080 */ +static const struct usbmix_name_map msi_mpg_x570s_carbon_max_wifi_alc4080_map[] = { + { 29, "Speaker Playback" }, + { 30, "Front Headphone Playback" }, + { 32, "IEC958 Playback" }, + {} +}; + /* * Control map entries */ @@ -577,6 +585,10 @@ static const struct usbmix_ctl_map usbmix_ctl_maps[] = { .map = trx40_mobo_map, .connector_map = trx40_mobo_connector_map, }, + { /* MSI MPG X570S Carbon Max Wifi */ + .id = USB_ID(0x0db0, 0x419c), + .map = msi_mpg_x570s_carbon_max_wifi_alc4080_map, + }, { /* MSI TRX40 */ .id = USB_ID(0x0db0, 0x543d), .map = trx40_mobo_map, -- cgit v1.2.3-58-ga151 From 87b9d74fb0be80054c729e8d6a119ca0955cedf3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 10 Jan 2022 15:29:53 +0000 Subject: powerpc/time: Fix build failure due to do_hard_irq_enable() on PPC32 CC arch/powerpc/kernel/time.o In file included from : ./arch/powerpc/include/asm/hw_irq.h: In function 'do_hard_irq_enable': ././include/linux/compiler_types.h:335:45: error: call to '__compiletime_assert_35' declared with attribute error: BUILD_BUG failed 335 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ ././include/linux/compiler_types.h:316:25: note: in definition of macro '__compiletime_assert' 316 | prefix ## suffix(); \ | ^~~~~~ ././include/linux/compiler_types.h:335:9: note: in expansion of macro '_compiletime_assert' 335 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^~~~~~~~~~~~~~~~~~~ ./include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert' 39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) | ^~~~~~~~~~~~~~~~~~ ./include/linux/build_bug.h:59:21: note: in expansion of macro 'BUILD_BUG_ON_MSG' 59 | #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed") | ^~~~~~~~~~~~~~~~ ./arch/powerpc/include/asm/hw_irq.h:483:9: note: in expansion of macro 'BUILD_BUG' 483 | BUILD_BUG(); | ^~~~~~~~~ should_hard_irq_enable() returns false on PPC32 so this BUILD_BUG() shouldn't trigger. Force inlining of should_hard_irq_enable() Fixes: 0faf20a1ad16 ("powerpc/64s/interrupt: Don't enable MSR[EE] in irq handlers unless perf is in use") Signed-off-by: Christophe Leroy Acked-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/247e01e0e10f4dbc59b5ff89e81702eb1ee7641e.1641828571.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/hw_irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index a58fb4aa6c81..674e5aaafcbd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -473,7 +473,7 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) return !(regs->msr & MSR_EE); } -static inline bool should_hard_irq_enable(void) +static __always_inline bool should_hard_irq_enable(void) { return false; } -- cgit v1.2.3-58-ga151 From d37823c3528e5e0705fc7746bcbc2afffb619259 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 10 Jan 2022 15:29:25 +0000 Subject: powerpc/32s: Fix kasan_init_region() for KASAN It has been reported some configuration where the kernel doesn't boot with KASAN enabled. This is due to wrong BAT allocation for the KASAN area: ---[ Data Block Address Translation ]--- 0: 0xc0000000-0xcfffffff 0x00000000 256M Kernel rw m 1: 0xd0000000-0xdfffffff 0x10000000 256M Kernel rw m 2: 0xe0000000-0xefffffff 0x20000000 256M Kernel rw m 3: 0xf8000000-0xf9ffffff 0x2a000000 32M Kernel rw m 4: 0xfa000000-0xfdffffff 0x2c000000 64M Kernel rw m A BAT must have both virtual and physical addresses alignment matching the size of the BAT. This is not the case for BAT 4 above. Fix kasan_init_region() by using block_size() function that is in book3s32/mmu.c. To be able to reuse it here, make it non static and change its name to bat_block_size() in order to avoid name conflict with block_size() defined in Also reuse find_free_bat() to avoid an error message from setbat() when no BAT is available. And allocate memory outside of linear memory mapping to avoid wasting that precious space. With this change we get correct alignment for BATs and KASAN shadow memory is allocated outside the linear memory space. ---[ Data Block Address Translation ]--- 0: 0xc0000000-0xcfffffff 0x00000000 256M Kernel rw 1: 0xd0000000-0xdfffffff 0x10000000 256M Kernel rw 2: 0xe0000000-0xefffffff 0x20000000 256M Kernel rw 3: 0xf8000000-0xfbffffff 0x7c000000 64M Kernel rw 4: 0xfc000000-0xfdffffff 0x7a000000 32M Kernel rw Fixes: 7974c4732642 ("powerpc/32s: Implement dedicated kasan_init_region()") Cc: stable@vger.kernel.org Reported-by: Maxime Bizon Signed-off-by: Christophe Leroy Tested-by: Maxime Bizon Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/7a50ef902494d1325227d47d33dada01e52e5518.1641818726.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 2 + arch/powerpc/mm/book3s32/mmu.c | 10 ++--- arch/powerpc/mm/kasan/book3s_32.c | 59 ++++++++++++++------------- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 7be27862329f..78c6a5fde1d6 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -223,6 +223,8 @@ static __always_inline void update_user_segments(u32 val) update_user_segment(15, val); } +int __init find_free_bat(void); +unsigned int bat_block_size(unsigned long base, unsigned long top); #endif /* !__ASSEMBLY__ */ /* We happily ignore the smaller BATs on 601, we don't actually use diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 94045b265b6b..203735caf691 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -76,7 +76,7 @@ unsigned long p_block_mapped(phys_addr_t pa) return 0; } -static int __init find_free_bat(void) +int __init find_free_bat(void) { int b; int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4; @@ -100,7 +100,7 @@ static int __init find_free_bat(void) * - block size has to be a power of two. This is calculated by finding the * highest bit set to 1. */ -static unsigned int block_size(unsigned long base, unsigned long top) +unsigned int bat_block_size(unsigned long base, unsigned long top) { unsigned int max_size = SZ_256M; unsigned int base_shift = (ffs(base) - 1) & 31; @@ -145,7 +145,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to int idx; while ((idx = find_free_bat()) != -1 && base != top) { - unsigned int size = block_size(base, top); + unsigned int size = bat_block_size(base, top); if (size < 128 << 10) break; @@ -201,12 +201,12 @@ void mmu_mark_initmem_nx(void) unsigned long size; for (i = 0; i < nb - 1 && base < top;) { - size = block_size(base, top); + size = bat_block_size(base, top); setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT); base += size; } if (base < top) { - size = block_size(base, top); + size = bat_block_size(base, top); if ((top - base) > size) { size <<= 1; if (strict_kernel_rwx_enabled() && base + size > border) diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c index 35b287b0a8da..450a67ef0bbe 100644 --- a/arch/powerpc/mm/kasan/book3s_32.c +++ b/arch/powerpc/mm/kasan/book3s_32.c @@ -10,48 +10,51 @@ int __init kasan_init_region(void *start, size_t size) { unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start); unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size); - unsigned long k_cur = k_start; - int k_size = k_end - k_start; - int k_size_base = 1 << (ffs(k_size) - 1); + unsigned long k_nobat = k_start; + unsigned long k_cur; + phys_addr_t phys; int ret; - void *block; - block = memblock_alloc(k_size, k_size_base); - - if (block && k_size_base >= SZ_128K && k_start == ALIGN(k_start, k_size_base)) { - int shift = ffs(k_size - k_size_base); - int k_size_more = shift ? 1 << (shift - 1) : 0; - - setbat(-1, k_start, __pa(block), k_size_base, PAGE_KERNEL); - if (k_size_more >= SZ_128K) - setbat(-1, k_start + k_size_base, __pa(block) + k_size_base, - k_size_more, PAGE_KERNEL); - if (v_block_mapped(k_start)) - k_cur = k_start + k_size_base; - if (v_block_mapped(k_start + k_size_base)) - k_cur = k_start + k_size_base + k_size_more; - - update_bats(); + while (k_nobat < k_end) { + unsigned int k_size = bat_block_size(k_nobat, k_end); + int idx = find_free_bat(); + + if (idx == -1) + break; + if (k_size < SZ_128K) + break; + phys = memblock_phys_alloc_range(k_size, k_size, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + break; + + setbat(idx, k_nobat, phys, k_size, PAGE_KERNEL); + k_nobat += k_size; } + if (k_nobat != k_start) + update_bats(); - if (!block) - block = memblock_alloc(k_size, PAGE_SIZE); - if (!block) - return -ENOMEM; + if (k_nobat < k_end) { + phys = memblock_phys_alloc_range(k_end - k_nobat, PAGE_SIZE, 0, + MEMBLOCK_ALLOC_ANYWHERE); + if (!phys) + return -ENOMEM; + } ret = kasan_init_shadow_page_tables(k_start, k_end); if (ret) return ret; - kasan_update_early_region(k_start, k_cur, __pte(0)); + kasan_update_early_region(k_start, k_nobat, __pte(0)); - for (; k_cur < k_end; k_cur += PAGE_SIZE) { + for (k_cur = k_nobat; k_cur < k_end; k_cur += PAGE_SIZE) { pmd_t *pmd = pmd_off_k(k_cur); - void *va = block + k_cur - k_start; - pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL); + pte_t pte = pfn_pte(PHYS_PFN(phys + k_cur - k_nobat), PAGE_KERNEL); __set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0); } flush_tlb_kernel_range(k_start, k_end); + memset(kasan_mem_to_shadow(start), 0, k_end - k_start); + return 0; } -- cgit v1.2.3-58-ga151 From 7372971c1be5b7d4fdd8ad237798bdc1d1d54162 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 11 Jan 2022 10:19:22 +0300 Subject: rtc: mc146818-lib: fix signedness bug in mc146818_get_time() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mc146818_get_time() function returns zero on success or negative a error code on failure. It needs to be type int. Fixes: d35786b3a28d ("rtc: mc146818-lib: change return values of mc146818_get_time()") Signed-off-by: Dan Carpenter Reviewed-by: Mateusz Jończyk Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220111071922.GE11243@kili --- drivers/rtc/rtc-mc146818-lib.c | 2 +- include/linux/mc146818rtc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index f62e658cbe23..7f689f1bafc5 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -130,7 +130,7 @@ static void mc146818_get_time_callback(unsigned char seconds, void *param_in) p->ctrl = CMOS_READ(RTC_CONTROL); } -unsigned int mc146818_get_time(struct rtc_time *time) +int mc146818_get_time(struct rtc_time *time) { struct mc146818_get_time_callback_param p = { .time = time diff --git a/include/linux/mc146818rtc.h b/include/linux/mc146818rtc.h index 67fb0a12becc..808bb4cee230 100644 --- a/include/linux/mc146818rtc.h +++ b/include/linux/mc146818rtc.h @@ -124,7 +124,7 @@ struct cmos_rtc_board_info { #endif /* ARCH_RTC_LOCATION */ bool mc146818_does_rtc_work(void); -unsigned int mc146818_get_time(struct rtc_time *time); +int mc146818_get_time(struct rtc_time *time); int mc146818_set_time(struct rtc_time *time); bool mc146818_avoid_UIP(void (*callback)(unsigned char seconds, void *param), -- cgit v1.2.3-58-ga151 From 900ed72c8a190e8c0b87cb17abc645b8ec713011 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 7 Jan 2022 10:33:40 +0300 Subject: rtc: gamecube: Fix an IS_ERR() vs NULL check The devm_kzalloc() function returns NULL on error, it doesn't return error pointers. Fixes: 86559400b3ef ("rtc: gamecube: Add a RTC driver for the GameCube, Wii and Wii U") Signed-off-by: Dan Carpenter Reviewed-by: Emmanuel Gil Peyrot Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220107073340.GF22086@kili --- drivers/rtc/rtc-gamecube.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-gamecube.c b/drivers/rtc/rtc-gamecube.c index 98128746171e..f717b36f4738 100644 --- a/drivers/rtc/rtc-gamecube.c +++ b/drivers/rtc/rtc-gamecube.c @@ -319,8 +319,8 @@ static int gamecube_rtc_probe(struct platform_device *pdev) int ret; d = devm_kzalloc(dev, sizeof(struct priv), GFP_KERNEL); - if (IS_ERR(d)) - return PTR_ERR(d); + if (!d) + return -ENOMEM; d->iob = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(d->iob)) -- cgit v1.2.3-58-ga151 From ff164ae39b82ee483b24579c8e22a13a8ce5bd04 Mon Sep 17 00:00:00 2001 From: Riwen Lu Date: Thu, 6 Jan 2022 16:46:09 +0800 Subject: rtc: cmos: Evaluate century appropriate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's limiting the year to 2069. When setting the rtc year to 2070, reading it returns 1970. Evaluate century starting from 19 to count the correct year. $ sudo date -s 20700106 Mon 06 Jan 2070 12:00:00 AM CST $ sudo hwclock -w $ sudo hwclock -r 1970-01-06 12:00:49.604968+08:00 Fixes: 2a4daadd4d3e5071 ("rtc: cmos: ignore bogus century byte") Signed-off-by: Riwen Lu Acked-by: Eric Wong Reviewed-by: Mateusz Jończyk Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220106084609.1223688-1-luriwen@kylinos.cn --- drivers/rtc/rtc-mc146818-lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index 7f689f1bafc5..ae9f131b43c0 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -159,7 +159,7 @@ int mc146818_get_time(struct rtc_time *time) #endif #ifdef CONFIG_ACPI - if (p.century > 20) + if (p.century > 19) time->tm_year += (p.century - 19) * 100; #endif -- cgit v1.2.3-58-ga151 From 5ceee540fdc7f1d65ca6e2b1b193ce5aa95ab99c Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 6 Jan 2022 15:57:11 +0800 Subject: rtc: sunplus: fix return value in sp_rtc_probe() If devm_ioremap_resource() fails, it should return error code from sp_rtc->reg_base in sp_rtc_probe(). Fixes: fad6cbe9b2b4 ("rtc: Add driver for RTC in Sunplus SP7021") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20220106075711.3216468-1-yangyingliang@huawei.com --- drivers/rtc/rtc-sunplus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-sunplus.c b/drivers/rtc/rtc-sunplus.c index 0b3873204f5c..e8e2ab1103fc 100644 --- a/drivers/rtc/rtc-sunplus.c +++ b/drivers/rtc/rtc-sunplus.c @@ -238,7 +238,7 @@ static int sp_rtc_probe(struct platform_device *plat_dev) sp_rtc->res = platform_get_resource_byname(plat_dev, IORESOURCE_MEM, RTC_REG_NAME); sp_rtc->reg_base = devm_ioremap_resource(&plat_dev->dev, sp_rtc->res); if (IS_ERR(sp_rtc->reg_base)) - return dev_err_probe(&plat_dev->dev, PTR_ERR(sp_rtc->res), + return dev_err_probe(&plat_dev->dev, PTR_ERR(sp_rtc->reg_base), "%s devm_ioremap_resource fail\n", RTC_REG_NAME); dev_dbg(&plat_dev->dev, "res = 0x%x, reg_base = 0x%lx\n", sp_rtc->res->start, (unsigned long)sp_rtc->reg_base); -- cgit v1.2.3-58-ga151 From 3fe7fa5843d204e235d92902190fecb972a3f9cc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 10 Dec 2021 15:09:21 -0500 Subject: mm: Add folio_put_refs() This is like folio_put(), but puts N references at once instead of just one. It's like put_page_refs(), but does one atomic operation instead of two, and is available to more than just gup.c. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Jason Gunthorpe Reviewed-by: William Kucharski --- include/linux/mm.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index c768a7c81b0b..cb98f75b245e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1244,6 +1244,26 @@ static inline void folio_put(struct folio *folio) __put_page(&folio->page); } +/** + * folio_put_refs - Reduce the reference count on a folio. + * @folio: The folio. + * @refs: The amount to subtract from the folio's reference count. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling folio_put_refs() unless you can be sure that these weren't + * the last references. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void folio_put_refs(struct folio *folio, int refs) +{ + if (folio_ref_sub_and_test(folio, refs)) + __put_page(&folio->page); +} + static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); -- cgit v1.2.3-58-ga151 From 3abb28e275bfbe60136db37eae6679c3e1928cd5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 7 Jan 2022 13:03:48 -0500 Subject: filemap: Use folio_put_refs() in filemap_free_folio() This shrinks filemap_free_folio() by 55 bytes in my .config; 24 bytes from removing the VM_BUG_ON_FOLIO() and 31 bytes from unifying the small/large folio paths. We could just use folio_ref_sub() here since the caller should hold a reference (as the VM_BUG_ON_FOLIO() was asserting), but that's fragile. Signed-off-by: Matthew Wilcox (Oracle) --- mm/filemap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 2fd9b2f24025..afc8f5ca85ac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -231,17 +231,15 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) void filemap_free_folio(struct address_space *mapping, struct folio *folio) { void (*freepage)(struct page *); + int refs = 1; freepage = mapping->a_ops->freepage; if (freepage) freepage(&folio->page); - if (folio_test_large(folio) && !folio_test_hugetlb(folio)) { - folio_ref_sub(folio, folio_nr_pages(folio)); - VM_BUG_ON_FOLIO(folio_ref_count(folio) <= 0, folio); - } else { - folio_put(folio); - } + if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + refs = folio_nr_pages(folio); + folio_put_refs(folio, refs); } /** -- cgit v1.2.3-58-ga151 From 429a64f6e91fbfe4912d17247c27d0d66767b1c2 Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 14 Jan 2022 08:43:55 +0530 Subject: powerpc/perf: Only define power_pmu_wants_prompt_pmi() for CONFIG_PPC64 power_pmu_wants_prompt_pmi() is used to decide if PMIs should be taken promptly. This is valid only for ppc64 and is used only if CONFIG_PPC_BOOK3S_64=y. Hence include the function under config check for PPC64. Fixes warning for 32-bit compilation: arch/powerpc/perf/core-book3s.c:2455:6: warning: no previous prototype for 'power_pmu_wants_prompt_pmi' 2455 | bool power_pmu_wants_prompt_pmi(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ Fixes: 5a7745b96f43 ("powerpc/64s/perf: add power_pmu_wants_prompt_pmi to say whether perf wants PMIs to be soft-NMI") Reported-by: kernel test robot Signed-off-by: Athira Rajeev Reviewed-by: Nicholas Piggin [mpe: Move inside existing CONFIG_PPC64 ifdef block] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220114031355.87480-1-atrajeev@linux.vnet.ibm.com --- arch/powerpc/perf/core-book3s.c | 58 ++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a684901b6965..32b98b7a1f86 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -776,6 +776,34 @@ static void pmao_restore_workaround(bool ebb) mtspr(SPRN_PMC6, pmcs[5]); } +/* + * If the perf subsystem wants performance monitor interrupts as soon as + * possible (e.g., to sample the instruction address and stack chain), + * this should return true. The IRQ masking code can then enable MSR[EE] + * in some places (e.g., interrupt handlers) that allows PMI interrupts + * through to improve accuracy of profiles, at the cost of some performance. + * + * The PMU counters can be enabled by other means (e.g., sysfs raw SPR + * access), but in that case there is no need for prompt PMI handling. + * + * This currently returns true if any perf counter is being used. It + * could possibly return false if only events are being counted rather than + * samples being taken, but for now this is good enough. + */ +bool power_pmu_wants_prompt_pmi(void) +{ + struct cpu_hw_events *cpuhw; + + /* + * This could simply test local_paca->pmcregs_in_use if that were not + * under ifdef KVM. + */ + if (!ppmu) + return false; + + cpuhw = this_cpu_ptr(&cpu_hw_events); + return cpuhw->n_events; +} #endif /* CONFIG_PPC64 */ static void perf_event_interrupt(struct pt_regs *regs); @@ -2438,36 +2466,6 @@ static void perf_event_interrupt(struct pt_regs *regs) perf_sample_event_took(sched_clock() - start_clock); } -/* - * If the perf subsystem wants performance monitor interrupts as soon as - * possible (e.g., to sample the instruction address and stack chain), - * this should return true. The IRQ masking code can then enable MSR[EE] - * in some places (e.g., interrupt handlers) that allows PMI interrupts - * though to improve accuracy of profiles, at the cost of some performance. - * - * The PMU counters can be enabled by other means (e.g., sysfs raw SPR - * access), but in that case there is no need for prompt PMI handling. - * - * This currently returns true if any perf counter is being used. It - * could possibly return false if only events are being counted rather than - * samples being taken, but for now this is good enough. - */ -bool power_pmu_wants_prompt_pmi(void) -{ - struct cpu_hw_events *cpuhw; - - /* - * This could simply test local_paca->pmcregs_in_use if that were not - * under ifdef KVM. - */ - - if (!ppmu) - return false; - - cpuhw = this_cpu_ptr(&cpu_hw_events); - return cpuhw->n_events; -} - static int power_pmu_prepare_cpu(unsigned int cpu) { struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); -- cgit v1.2.3-58-ga151 From 5576c4f24c56722a2d9fb9c447d896e5b312078b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 16 Jan 2022 09:28:38 +0100 Subject: ALSA: core: Fix SSID quirk lookup for subvendor=0 Some weird devices set the codec SSID vendor ID 0, and snd_pci_quirk_lookup_id() loop aborts at the point although it should still try matching with the SSID device ID. This resulted in a missing quirk for some old Macs. Fix the loop termination condition to check both subvendor and subdevice. Fixes: 73355ddd8775 ("ALSA: hda: Code refactoring snd_hda_pick_fixup()") Cc: BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215495 Link: https://lore.kernel.org/r/20220116082838.19382-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/misc.c b/sound/core/misc.c index 3579dd7a161f..50e4aaa6270d 100644 --- a/sound/core/misc.c +++ b/sound/core/misc.c @@ -112,7 +112,7 @@ snd_pci_quirk_lookup_id(u16 vendor, u16 device, { const struct snd_pci_quirk *q; - for (q = list; q->subvendor; q++) { + for (q = list; q->subvendor || q->subdevice; q++) { if (q->subvendor != vendor) continue; if (!q->subdevice || -- cgit v1.2.3-58-ga151 From ef3ac01564067a4337bb798b8eddc6ea7b78fd10 Mon Sep 17 00:00:00 2001 From: José Roberto de Souza Date: Thu, 13 Jan 2022 08:04:37 -0800 Subject: drm/i915/display/ehl: Update voltage swing table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit EHL table was recently updated with some minor fixes. BSpec: 21257 Cc: stable@vger.kernel.org Cc: Clint Taylor Signed-off-by: José Roberto de Souza Reviewed-by: Clint Taylor Link: https://patchwork.freedesktop.org/patch/msgid/20220113160437.49059-1-jose.souza@intel.com (cherry picked from commit 5ec7baef52c367cdbda964aa662f7135c25bab1f) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c b/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c index 1e689d573512..e2dfb93a82bd 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c +++ b/drivers/gpu/drm/i915/display/intel_ddi_buf_trans.c @@ -477,14 +477,14 @@ static const struct intel_ddi_buf_trans icl_combo_phy_trans_hdmi = { static const union intel_ddi_buf_trans_entry _ehl_combo_phy_trans_dp[] = { /* NT mV Trans mV db */ { .icl = { 0xA, 0x33, 0x3F, 0x00, 0x00 } }, /* 350 350 0.0 */ - { .icl = { 0xA, 0x47, 0x36, 0x00, 0x09 } }, /* 350 500 3.1 */ - { .icl = { 0xC, 0x64, 0x34, 0x00, 0x0B } }, /* 350 700 6.0 */ - { .icl = { 0x6, 0x7F, 0x30, 0x00, 0x0F } }, /* 350 900 8.2 */ + { .icl = { 0xA, 0x47, 0x38, 0x00, 0x07 } }, /* 350 500 3.1 */ + { .icl = { 0xC, 0x64, 0x33, 0x00, 0x0C } }, /* 350 700 6.0 */ + { .icl = { 0x6, 0x7F, 0x2F, 0x00, 0x10 } }, /* 350 900 8.2 */ { .icl = { 0xA, 0x46, 0x3F, 0x00, 0x00 } }, /* 500 500 0.0 */ - { .icl = { 0xC, 0x64, 0x38, 0x00, 0x07 } }, /* 500 700 2.9 */ + { .icl = { 0xC, 0x64, 0x37, 0x00, 0x08 } }, /* 500 700 2.9 */ { .icl = { 0x6, 0x7F, 0x32, 0x00, 0x0D } }, /* 500 900 5.1 */ { .icl = { 0xC, 0x61, 0x3F, 0x00, 0x00 } }, /* 650 700 0.6 */ - { .icl = { 0x6, 0x7F, 0x38, 0x00, 0x07 } }, /* 600 900 3.5 */ + { .icl = { 0x6, 0x7F, 0x37, 0x00, 0x08 } }, /* 600 900 3.5 */ { .icl = { 0x6, 0x7F, 0x3F, 0x00, 0x00 } }, /* 900 900 0.0 */ }; -- cgit v1.2.3-58-ga151 From e26602be4869c74dd8a0f66f718b8a0ce120edb4 Mon Sep 17 00:00:00 2001 From: José Roberto de Souza Date: Thu, 13 Jan 2022 09:48:26 -0800 Subject: drm/i915/display/adlp: Implement new step in the TC voltage swing prog sequence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TC voltage swing programming sequence was updated with a new step. BSpec: 54956 Cc: stable@vger.kernel.org Cc: Jani Nikula Cc: Clint Taylor Cc: Imre Deak Signed-off-by: José Roberto de Souza Reviewed-by: Clint Taylor Link: https://patchwork.freedesktop.org/patch/msgid/20220113174826.50272-1-jose.souza@intel.com (cherry picked from commit 5ff59dddacd4738edcbd01847d9df7682348cf86) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_ddi.c | 22 ++++++++++++++++++++++ drivers/gpu/drm/i915/i915_reg.h | 8 ++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index 9c9d574f0b8c..cab505277595 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -1298,6 +1298,28 @@ static void tgl_dkl_phy_set_signal_levels(struct intel_encoder *encoder, intel_de_rmw(dev_priv, DKL_TX_DPCNTL2(tc_port), DKL_TX_DP20BITMODE, 0); + + if (IS_ALDERLAKE_P(dev_priv)) { + u32 val; + + if (intel_crtc_has_type(crtc_state, INTEL_OUTPUT_HDMI)) { + if (ln == 0) { + val = DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX1(0); + val |= DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2(2); + } else { + val = DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX1(3); + val |= DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2(3); + } + } else { + val = DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX1(0); + val |= DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2(0); + } + + intel_de_rmw(dev_priv, DKL_TX_DPCNTL2(tc_port), + DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX1_MASK | + DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2_MASK, + val); + } } } diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 4c28dadf8d69..971d601fe751 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -11166,8 +11166,12 @@ enum skl_power_gate { _DKL_PHY2_BASE) + \ _DKL_TX_DPCNTL1) -#define _DKL_TX_DPCNTL2 0x2C8 -#define DKL_TX_DP20BITMODE (1 << 2) +#define _DKL_TX_DPCNTL2 0x2C8 +#define DKL_TX_DP20BITMODE REG_BIT(2) +#define DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX1_MASK REG_GENMASK(4, 3) +#define DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX1(val) REG_FIELD_PREP(DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX1_MASK, (val)) +#define DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2_MASK REG_GENMASK(6, 5) +#define DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2(val) REG_FIELD_PREP(DKL_TX_DPCNTL2_CFG_LOADGENSELECT_TX2_MASK, (val)) #define DKL_TX_DPCNTL2(tc_port) _MMIO(_PORT(tc_port, \ _DKL_PHY1_BASE, \ _DKL_PHY2_BASE) + \ -- cgit v1.2.3-58-ga151 From a87b0fd4f9003f8521226e226cf92b18147b3519 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 13 Jan 2022 12:31:16 +0100 Subject: s390/cpumf: Support for CPU Measurement Facility CSVN 7 Adds support for the CPU Measurement Counter Facility second version number 7. Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Signed-off-by: Heiko Carstens --- arch/s390/kernel/perf_cpum_cf_common.c | 4 ++-- arch/s390/kernel/perf_cpum_cf_events.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c index 30f0242de4a5..8ee48672233f 100644 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ b/arch/s390/kernel/perf_cpum_cf_common.c @@ -178,7 +178,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, case CPUMF_CTR_SET_CRYPTO: if (info->csvn >= 1 && info->csvn <= 5) ctrset_size = 16; - else if (info->csvn == 6) + else if (info->csvn == 6 || info->csvn == 7) ctrset_size = 20; break; case CPUMF_CTR_SET_EXT: @@ -188,7 +188,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, ctrset_size = 48; else if (info->csvn >= 3 && info->csvn <= 5) ctrset_size = 128; - else if (info->csvn == 6) + else if (info->csvn == 6 || info->csvn == 7) ctrset_size = 160; break; case CPUMF_CTR_SET_MT_DIAG: diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index 37265f551a11..52c1fe23b823 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -344,7 +344,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { +static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = { CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), @@ -715,8 +715,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void) case 1 ... 5: csvn = cpumcf_svn_12345_pmu_event_attr; break; - case 6: - csvn = cpumcf_svn_6_pmu_event_attr; + case 6 ... 7: + csvn = cpumcf_svn_67_pmu_event_attr; break; default: csvn = none; -- cgit v1.2.3-58-ga151 From 745f5d20e7936931f924410f32d8b0e599b5990e Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 13 Jan 2022 12:51:56 +0100 Subject: s390/cpumf: Support for CPU Measurement Sampling Facility LS bit Adds support for the CPU Measurement Sampling Facility limit sampling bit in the sampling device driver. Limited samples have no valueable information are not collected. Signed-off-by: Thomas Richter Acked-by: Sumanth Korikkar Signed-off-by: Heiko Carstens --- arch/s390/include/asm/cpu_mf.h | 4 +++- arch/s390/kernel/perf_cpum_sf.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 0d90cbeb89b4..e3f12db46cfc 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -109,7 +109,9 @@ struct hws_basic_entry { unsigned int AS:2; /* 29-30 PSW address-space control */ unsigned int I:1; /* 31 entry valid or invalid */ unsigned int CL:2; /* 32-33 Configuration Level */ - unsigned int:14; + unsigned int H:1; /* 34 Host Indicator */ + unsigned int LS:1; /* 35 Limited Sampling */ + unsigned int:12; unsigned int prim_asn:16; /* primary ASN */ unsigned long long ia; /* Instruction Address */ unsigned long long gpp; /* Guest Program Parameter */ diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index db62def4ef28..332a49965130 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1179,7 +1179,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, sample = (struct hws_basic_entry *) *sdbt; while ((unsigned long *) sample < (unsigned long *) te) { /* Check for an empty sample */ - if (!sample->def) + if (!sample->def || sample->LS) break; /* Update perf event period */ -- cgit v1.2.3-58-ga151 From 012a224e1fa31fc256aab921f691598e03db6018 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Tue, 11 Jan 2022 11:00:03 +0100 Subject: s390/uaccess: introduce bit field for OAC specifier Previously, we've used magic values to specify the OAC (operand-access control) for mvcos. Instead we introduce a bit field for it. When using a bit field, we cannot use an immediate value with K constraint anymore, since GCC older than 10 doesn't recognize the bit field union as a compile time constant. To make things work with older compilers, load the OAC value through a register. Bloat-o-meter reports a slight increase in kernel size with this change: Total: Before=15692135, After=15693015, chg +0.01% Signed-off-by: Nico Boehr Co-developed-by: Janis Schoetterl-Glausch Signed-off-by: Janis Schoetterl-Glausch Link: https://lore.kernel.org/r/20220111100003.743116-1-scgl@linux.ibm.com Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Vasily Gorbik Cc: Sven Schnelle Signed-off-by: Heiko Carstens --- arch/s390/include/asm/uaccess.h | 120 ++++++++++++++++++++++++++-------------- arch/s390/lib/uaccess.c | 24 ++++++-- 2 files changed, 95 insertions(+), 49 deletions(-) diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index ce550d06abc3..147cb3534ce4 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -49,51 +49,85 @@ int __get_user_bad(void) __attribute__((noreturn)); #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES -#define __put_get_user_asm(to, from, size, insn) \ -({ \ - int __rc; \ - \ - asm volatile( \ - insn " 0,%[spec]\n" \ - "0: mvcos %[_to],%[_from],%[_size]\n" \ - "1: xr %[rc],%[rc]\n" \ - "2:\n" \ - ".pushsection .fixup, \"ax\"\n" \ - "3: lhi %[rc],%[retval]\n" \ - " jg 2b\n" \ - ".popsection\n" \ - EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ - : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ - : [_size] "d" (size), [_from] "Q" (*(from)), \ - [retval] "K" (-EFAULT), [spec] "K" (0x81UL) \ - : "cc", "0"); \ - __rc; \ +union oac { + unsigned int val; + struct { + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac1; + struct { + unsigned short key : 4; + unsigned short : 4; + unsigned short as : 2; + unsigned short : 4; + unsigned short k : 1; + unsigned short a : 1; + } oac2; + }; +}; + +#define __put_get_user_asm(to, from, size, oac_spec) \ +({ \ + int __rc; \ + \ + asm volatile( \ + " lr 0,%[spec]\n" \ + "0: mvcos %[_to],%[_from],%[_size]\n" \ + "1: xr %[rc],%[rc]\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: lhi %[rc],%[retval]\n" \ + " jg 2b\n" \ + ".popsection\n" \ + EX_TABLE(0b,3b) EX_TABLE(1b,3b) \ + : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \ + : [_size] "d" (size), [_from] "Q" (*(from)), \ + [retval] "K" (-EFAULT), [spec] "d" (oac_spec.val) \ + : "cc", "0"); \ + __rc; \ }) +#define __put_user_asm(to, from, size) \ + __put_get_user_asm(to, from, size, ((union oac) { \ + .oac1.as = PSW_BITS_AS_SECONDARY, \ + .oac1.a = 1 \ + })) + +#define __get_user_asm(to, from, size) \ + __put_get_user_asm(to, from, size, ((union oac) { \ + .oac2.as = PSW_BITS_AS_SECONDARY, \ + .oac2.a = 1 \ + })) \ + static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) { int rc; switch (size) { case 1: - rc = __put_get_user_asm((unsigned char __user *)ptr, - (unsigned char *)x, - size, "llilh"); + rc = __put_user_asm((unsigned char __user *)ptr, + (unsigned char *)x, + size); break; case 2: - rc = __put_get_user_asm((unsigned short __user *)ptr, - (unsigned short *)x, - size, "llilh"); + rc = __put_user_asm((unsigned short __user *)ptr, + (unsigned short *)x, + size); break; case 4: - rc = __put_get_user_asm((unsigned int __user *)ptr, - (unsigned int *)x, - size, "llilh"); + rc = __put_user_asm((unsigned int __user *)ptr, + (unsigned int *)x, + size); break; case 8: - rc = __put_get_user_asm((unsigned long __user *)ptr, - (unsigned long *)x, - size, "llilh"); + rc = __put_user_asm((unsigned long __user *)ptr, + (unsigned long *)x, + size); break; default: __put_user_bad(); @@ -108,24 +142,24 @@ static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsign switch (size) { case 1: - rc = __put_get_user_asm((unsigned char *)x, - (unsigned char __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned char *)x, + (unsigned char __user *)ptr, + size); break; case 2: - rc = __put_get_user_asm((unsigned short *)x, - (unsigned short __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned short *)x, + (unsigned short __user *)ptr, + size); break; case 4: - rc = __put_get_user_asm((unsigned int *)x, - (unsigned int __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned int *)x, + (unsigned int __user *)ptr, + size); break; case 8: - rc = __put_get_user_asm((unsigned long *)x, - (unsigned long __user *)ptr, - size, "lghi"); + rc = __get_user_asm((unsigned long *)x, + (unsigned long __user *)ptr, + size); break; default: __get_user_bad(); diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index a596e69d3c47..8a5d21461889 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -62,10 +62,14 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr unsigned long size) { unsigned long tmp1, tmp2; + union oac spec = { + .oac2.as = PSW_BITS_AS_SECONDARY, + .oac2.a = 1, + }; tmp1 = -4096UL; asm volatile( - " lghi 0,%[spec]\n" + " lr 0,%[spec]\n" "0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n" "6: jz 4f\n" "1: algr %0,%3\n" @@ -84,7 +88,7 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : [spec] "K" (0x81UL) + : [spec] "d" (spec.val) : "cc", "memory", "0"); return size; } @@ -135,10 +139,14 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, unsigned long size) { unsigned long tmp1, tmp2; + union oac spec = { + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.a = 1, + }; tmp1 = -4096UL; asm volatile( - " llilh 0,%[spec]\n" + " lr 0,%[spec]\n" "0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n" "6: jz 4f\n" "1: algr %0,%3\n" @@ -157,7 +165,7 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) - : [spec] "K" (0x81UL) + : [spec] "d" (spec.val) : "cc", "memory", "0"); return size; } @@ -207,10 +215,14 @@ EXPORT_SYMBOL(raw_copy_to_user); static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) { unsigned long tmp1, tmp2; + union oac spec = { + .oac1.as = PSW_BITS_AS_SECONDARY, + .oac1.a = 1, + }; tmp1 = -4096UL; asm volatile( - " llilh 0,%[spec]\n" + " lr 0,%[spec]\n" "0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n" " jz 4f\n" "1: algr %0,%2\n" @@ -228,7 +240,7 @@ static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size "5:\n" EX_TABLE(0b,2b) EX_TABLE(3b,5b) : "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2) - : "a" (empty_zero_page), [spec] "K" (0x81UL) + : "a" (empty_zero_page), [spec] "d" (spec.val) : "cc", "memory", "0"); return size; } -- cgit v1.2.3-58-ga151 From 5754f9084f261f6fbfdcc6e57dae97c86a6ff688 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 17 Jan 2022 13:31:40 +0100 Subject: s390: add Sven Schnelle as reviewer Sven Schnelle will help reviewing s390 architecture code. Acked-by: Sven Schnelle Acked-by: Alexander Gordeev Acked-by: Vasily Gorbik Acked-by: Christian Borntraeger Signed-off-by: Heiko Carstens --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1f7658bab9e7..253abb28795b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16611,6 +16611,7 @@ M: Heiko Carstens M: Vasily Gorbik M: Christian Borntraeger R: Alexander Gordeev +R: Sven Schnelle L: linux-s390@vger.kernel.org S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ -- cgit v1.2.3-58-ga151 From 0b39536cc699db6850c426db7f9cb45923de40c5 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 14 Jan 2022 06:48:20 +0000 Subject: gpio: mpc8xxx: Fix IRQ check in mpc8xxx_probe platform_get_irq() returns negative error number instead 0 on failure. And the doc of platform_get_irq() provides a usage example: int irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; Fix the check of return value to catch errors correctly. Fixes: 76c47d1449fc ("gpio: mpc8xxx: Add ACPI support") Signed-off-by: Miaoqian Lin Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mpc8xxx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 70d6ae20b1da..01634c8d27b3 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -388,8 +388,8 @@ static int mpc8xxx_probe(struct platform_device *pdev) } mpc8xxx_gc->irqn = platform_get_irq(pdev, 0); - if (!mpc8xxx_gc->irqn) - return 0; + if (mpc8xxx_gc->irqn < 0) + return mpc8xxx_gc->irqn; mpc8xxx_gc->irq = irq_domain_create_linear(fwnode, MPC8XXX_GPIO_PINS, -- cgit v1.2.3-58-ga151 From 30fee1d7462a446ade399c0819717a830cbdca69 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Fri, 14 Jan 2022 06:51:24 +0000 Subject: gpio: idt3243x: Fix IRQ check in idt_gpio_probe platform_get_irq() returns negative error number instead 0 on failure. And the doc of platform_get_irq() provides a usage example: int irq = platform_get_irq(pdev, 0); if (irq < 0) return irq; Fix the check of return value to catch errors correctly. Fixes: 4195926aedca ("gpio: Add support for IDT 79RC3243x GPIO controller") Signed-off-by: Miaoqian Lin Reviewed-by: Linus Walleij Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-idt3243x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-idt3243x.c b/drivers/gpio/gpio-idt3243x.c index 50003ad2e589..08493b05be2d 100644 --- a/drivers/gpio/gpio-idt3243x.c +++ b/drivers/gpio/gpio-idt3243x.c @@ -164,8 +164,8 @@ static int idt_gpio_probe(struct platform_device *pdev) return PTR_ERR(ctrl->pic); parent_irq = platform_get_irq(pdev, 0); - if (!parent_irq) - return -EINVAL; + if (parent_irq < 0) + return parent_irq; girq = &ctrl->gc.irq; girq->chip = &idt_gpio_irqchip; -- cgit v1.2.3-58-ga151 From 00358933f66c44d511368a57eb421e172447cfb9 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 6 Jan 2022 18:53:16 +0900 Subject: brd: remove brd_devices_mutex mutex If brd_alloc() from brd_probe() is called before brd_alloc() from brd_init() is called, module loading will fail with -EEXIST error. To close this race, call __register_blkdev() just before leaving brd_init(). Then, we can remove brd_devices_mutex mutex, for brd_device list will no longer be accessed concurrently. Signed-off-by: Tetsuo Handa Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/6b074af7-c165-4fab-b7da-8270a4f6f6cd@i-love.sakura.ne.jp Signed-off-by: Jens Axboe --- drivers/block/brd.c | 73 ++++++++++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 8fe2e4289dae..6e3f2f0d2352 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -362,7 +362,6 @@ __setup("ramdisk_size=", ramdisk_size); * (should share code eventually). */ static LIST_HEAD(brd_devices); -static DEFINE_MUTEX(brd_devices_mutex); static struct dentry *brd_debugfs_dir; static int brd_alloc(int i) @@ -372,21 +371,14 @@ static int brd_alloc(int i) char buf[DISK_NAME_LEN]; int err = -ENOMEM; - mutex_lock(&brd_devices_mutex); - list_for_each_entry(brd, &brd_devices, brd_list) { - if (brd->brd_number == i) { - mutex_unlock(&brd_devices_mutex); + list_for_each_entry(brd, &brd_devices, brd_list) + if (brd->brd_number == i) return -EEXIST; - } - } brd = kzalloc(sizeof(*brd), GFP_KERNEL); - if (!brd) { - mutex_unlock(&brd_devices_mutex); + if (!brd) return -ENOMEM; - } brd->brd_number = i; list_add_tail(&brd->brd_list, &brd_devices); - mutex_unlock(&brd_devices_mutex); spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); @@ -429,9 +421,7 @@ static int brd_alloc(int i) out_cleanup_disk: blk_cleanup_disk(disk); out_free_dev: - mutex_lock(&brd_devices_mutex); list_del(&brd->brd_list); - mutex_unlock(&brd_devices_mutex); kfree(brd); return err; } @@ -441,15 +431,19 @@ static void brd_probe(dev_t dev) brd_alloc(MINOR(dev) / max_part); } -static void brd_del_one(struct brd_device *brd) +static void brd_cleanup(void) { - del_gendisk(brd->brd_disk); - blk_cleanup_disk(brd->brd_disk); - brd_free_pages(brd); - mutex_lock(&brd_devices_mutex); - list_del(&brd->brd_list); - mutex_unlock(&brd_devices_mutex); - kfree(brd); + struct brd_device *brd, *next; + + debugfs_remove_recursive(brd_debugfs_dir); + + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { + del_gendisk(brd->brd_disk); + blk_cleanup_disk(brd->brd_disk); + brd_free_pages(brd); + list_del(&brd->brd_list); + kfree(brd); + } } static inline void brd_check_and_reset_par(void) @@ -473,9 +467,18 @@ static inline void brd_check_and_reset_par(void) static int __init brd_init(void) { - struct brd_device *brd, *next; int err, i; + brd_check_and_reset_par(); + + brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); + + for (i = 0; i < rd_nr; i++) { + err = brd_alloc(i); + if (err) + goto out_free; + } + /* * brd module now has a feature to instantiate underlying device * structure on-demand, provided that there is an access dev node. @@ -491,28 +494,16 @@ static int __init brd_init(void) * dynamically. */ - if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) - return -EIO; - - brd_check_and_reset_par(); - - brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL); - - for (i = 0; i < rd_nr; i++) { - err = brd_alloc(i); - if (err) - goto out_free; + if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) { + err = -EIO; + goto out_free; } pr_info("brd: module loaded\n"); return 0; out_free: - unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); - debugfs_remove_recursive(brd_debugfs_dir); - - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) - brd_del_one(brd); + brd_cleanup(); pr_info("brd: module NOT loaded !!!\n"); return err; @@ -520,13 +511,9 @@ out_free: static void __exit brd_exit(void) { - struct brd_device *brd, *next; unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); - debugfs_remove_recursive(brd_debugfs_dir); - - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) - brd_del_one(brd); + brd_cleanup(); pr_info("brd: module unloaded\n"); } -- cgit v1.2.3-58-ga151 From e6a2e5116e07ce5acc8698785c29e9e47f010fd5 Mon Sep 17 00:00:00 2001 From: GuoYong Zheng Date: Mon, 17 Jan 2022 18:22:37 +0800 Subject: block: Remove unnecessary variable assignment The parameter "ret" should be zero when running to this line, no need to set to zero again, remove it. Signed-off-by: GuoYong Zheng Link: https://lore.kernel.org/r/1642414957-6785-1-git-send-email-zhenggy@chinatelecom.cn Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e20eadfcf5c8..bed4a2facd65 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -887,7 +887,6 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); - ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); -- cgit v1.2.3-58-ga151 From 850fd2abbe02eb2b52cbb1550adbcc89b36d65de Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 Jan 2022 20:34:01 +0800 Subject: block: cleanup q->srcu srcu structure has to be cleanup via cleanup_srcu_struct(), so fix it. Reported-by: syzbot+4f789823c1abc5accf13@syzkaller.appspotmail.com Fixes: 704b914f15fb ("blk-mq: move srcu from blk_mq_hw_ctx to request_queue") Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220111123401.520192-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index bed4a2facd65..9f32882ceb2f 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -811,6 +811,9 @@ static void blk_release_queue(struct kobject *kobj) bioset_exit(&q->bio_split); + if (blk_queue_has_srcu(q)) + cleanup_srcu_struct(q->srcu); + ida_simple_remove(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } -- cgit v1.2.3-58-ga151 From 9dec0368b9640c09ef5af48214e097245e57a204 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 11 Jan 2022 14:05:02 -0800 Subject: xfs: remove the XFS_IOC_FSSETDM definitions Remove the definitions for these ioctls, since the functionality (and, weirdly, the 32-bit compat ioctl definitions) were removed from the kernel in November 2019. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_fs.h | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index c43877c8a279..52b48b24ff66 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -92,21 +92,6 @@ struct getbmapx { #define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */ #define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */ -/* - * Structure for XFS_IOC_FSSETDM. - * For use by backup and restore programs to set the XFS on-disk inode - * fields di_dmevmask and di_dmstate. These must be set to exactly and - * only values previously obtained via xfs_bulkstat! (Specifically the - * struct xfs_bstat fields bs_dmevmask and bs_dmstate.) - */ -#ifndef HAVE_FSDMIDATA -struct fsdmidata { - __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ - __u16 fsd_padding; - __u16 fsd_dmstate; /* corresponds to di_dmstate */ -}; -#endif - /* * File segment locking set data type for 64 bit access. * Also used for all the RESV/FREE interfaces. @@ -562,16 +547,10 @@ typedef struct xfs_fsop_handlereq { /* * Compound structures for passing args through Handle Request interfaces - * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle - * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and - * XFS_IOC_ATTRMULTI_BY_HANDLE + * xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE */ -typedef struct xfs_fsop_setdm_handlereq { - struct xfs_fsop_handlereq hreq; /* handle information */ - struct fsdmidata __user *data; /* DMAPI data */ -} xfs_fsop_setdm_handlereq_t; - /* * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. * @@ -789,7 +768,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) -#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +/* XFS_IOC_FSSETDM ------- deprecated 39 */ #define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) #define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) #define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) @@ -831,7 +810,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ #define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ -#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */ #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) #define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4) -- cgit v1.2.3-58-ga151 From 4d1b97f9ce7c0d2af2bb85b12d48e6902172a28e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 7 Jan 2022 17:45:51 -0800 Subject: xfs: kill the XFS_IOC_{ALLOC,FREE}SP* ioctls According to the glibc compat header for Irix 4, these ioctls originated in April 1991 as a (somewhat clunky) way to preallocate space at the end of a file on an EFS filesystem. XFS, which was released in Irix 5.3 in December 1993, picked up these ioctls to maintain compatibility and they were ported to Linux in the early 2000s. Recently it was pointed out to me they still lurk in the kernel, even though the Linux fallocate syscall supplanted the functionality a long time ago. fstests doesn't seem to include any real functional or stress tests for these ioctls, which means that the code quality is ... very questionable. Most notably, it was a stale disk block exposure vector for 21 years and nobody noticed or complained. As mature programmers say, "If you're not testing it, it's broken." Given all that, let's withdraw these ioctls from the XFS userspace API. Normally we'd set a long deprecation process, but I estimate that there aren't any real users, so let's trigger a warning in dmesg and return -ENOTTY. See: CVE-2021-4155 Augments: 983d8e60f508 ("xfs: map unwritten blocks in XFS_IOC_{ALLOC,FREE}SP just like fallocate") Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen Reviewed-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 7 ++-- fs/xfs/xfs_bmap_util.h | 2 +- fs/xfs/xfs_file.c | 3 +- fs/xfs/xfs_ioctl.c | 92 +++----------------------------------------------- fs/xfs/xfs_ioctl.h | 6 ---- fs/xfs/xfs_ioctl32.c | 27 --------------- 6 files changed, 10 insertions(+), 127 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 73a36b7be3bd..575060a7c768 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -771,8 +771,7 @@ int xfs_alloc_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, - int alloc_type) + xfs_off_t len) { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; @@ -865,8 +864,8 @@ xfs_alloc_file_space( goto error; error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, alloc_type, 0, imapp, - &nimaps); + allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, + &nimaps); if (error) goto error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 9f993168b55b..24b37d211f1d 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -54,7 +54,7 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, int alloc_type); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 27594738b0d1..d81a28cada35 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1052,8 +1052,7 @@ xfs_file_fallocate( } if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); + error = xfs_alloc_file_space(ip, offset, len); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 29231a8c8a45..64a7ef4a7298 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -627,86 +627,6 @@ xfs_attrmulti_by_handle( return error; } -int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf) -{ - struct inode *inode = file_inode(filp); - struct xfs_inode *ip = XFS_I(inode); - struct iattr iattr; - enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR; - uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - int error; - - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) - return -EPERM; - - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - if (xfs_is_always_cow_inode(ip)) - return -EOPNOTSUPP; - - if (filp->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - if (filp->f_mode & FMODE_NOCMTIME) - flags |= XFS_PREALLOC_INVISIBLE; - - error = mnt_want_write_file(filp); - if (error) - return error; - - xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); - if (error) - goto out_unlock; - inode_dio_wait(inode); - - switch (bf->l_whence) { - case 0: /*SEEK_SET*/ - break; - case 1: /*SEEK_CUR*/ - bf->l_start += filp->f_pos; - break; - case 2: /*SEEK_END*/ - bf->l_start += XFS_ISIZE(ip); - break; - default: - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) { - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start > XFS_ISIZE(ip)) { - error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), - bf->l_start - XFS_ISIZE(ip), 0); - if (error) - goto out_unlock; - } - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = bf->l_start; - error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp), - &iattr); - if (error) - goto out_unlock; - - error = xfs_update_prealloc_flags(ip, flags); - -out_unlock: - xfs_iunlock(ip, iolock); - mnt_drop_write_file(filp); - return error; -} - /* Return 0 on success or positive error */ int xfs_fsbulkstat_one_fmt( @@ -1964,13 +1884,11 @@ xfs_file_ioctl( case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: { - xfs_flock64_t bf; - - if (copy_from_user(&bf, arg, sizeof(bf))) - return -EFAULT; - return xfs_ioc_space(filp, &bf); - } + case XFS_IOC_FREESP64: + xfs_warn_once(mp, + "%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported", + current->comm); + return -ENOTTY; case XFS_IOC_DIOINFO: { struct xfs_buftarg *target = xfs_inode_buftarg(ip); struct dioattr da; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 845d3bcab74b..d4abba2c13c1 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -10,12 +10,6 @@ struct xfs_bstat; struct xfs_ibulk; struct xfs_inogrp; - -extern int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf); - int xfs_ioc_swapext( xfs_swapext_t *sxp); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 8783af203cfc..004ed2a251e8 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -27,22 +27,6 @@ _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) #ifdef BROKEN_X86_ALIGNMENT -STATIC int -xfs_compat_flock64_copyin( - xfs_flock64_t *bf, - compat_xfs_flock64_t __user *arg32) -{ - if (get_user(bf->l_type, &arg32->l_type) || - get_user(bf->l_whence, &arg32->l_whence) || - get_user(bf->l_start, &arg32->l_start) || - get_user(bf->l_len, &arg32->l_len) || - get_user(bf->l_sysid, &arg32->l_sysid) || - get_user(bf->l_pid, &arg32->l_pid) || - copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) - return -EFAULT; - return 0; -} - STATIC int xfs_compat_ioc_fsgeometry_v1( struct xfs_mount *mp, @@ -445,17 +429,6 @@ xfs_file_compat_ioctl( switch (cmd) { #if defined(BROKEN_X86_ALIGNMENT) - case XFS_IOC_ALLOCSP_32: - case XFS_IOC_FREESP_32: - case XFS_IOC_ALLOCSP64_32: - case XFS_IOC_FREESP64_32: { - struct xfs_flock64 bf; - - if (xfs_compat_flock64_copyin(&bf, arg)) - return -EFAULT; - cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(filp, &bf); - } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg); case XFS_IOC_FSGROWFSDATA_32: { -- cgit v1.2.3-58-ga151 From b3bb9413e717b44e4aea833d07f14e90fb91cf97 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 21 Dec 2021 13:07:38 -0800 Subject: xfs: remove the XFS_IOC_{ALLOC,FREE}SP* definitions Now that we've made these ioctls defunct, move them from xfs_fs.h to xfs_ioctl.c, which effectively removes them from the publicly supported ioctl interfaces for XFS. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Eric Sandeen --- fs/xfs/libxfs/xfs_fs.h | 8 ++++---- fs/xfs/xfs_ioctl.c | 9 +++++++++ fs/xfs/xfs_ioctl32.h | 4 ---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 52b48b24ff66..505533c43a92 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -760,13 +760,13 @@ struct xfs_scrub_metadata { * For 'documentation' purposed more than anything else, * the "cmd #" field reflects the IRIX fcntl number. */ -#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) -#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +/* XFS_IOC_ALLOCSP ------- deprecated 10 */ +/* XFS_IOC_FREESP -------- deprecated 11 */ #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) #define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR -#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) -#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */ +/* XFS_IOC_FREESP64 ------ deprecated 37 */ #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) /* XFS_IOC_FSSETDM ------- deprecated 39 */ #define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 64a7ef4a7298..03a6198c97f6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1854,6 +1854,15 @@ xfs_fs_eofblocks_from_user( return 0; } +/* + * These long-unused ioctls were removed from the official ioctl API in 5.17, + * but retain these definitions so that we can log warnings about them. + */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 9929482bf358..fc5a91f3a5e0 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -154,10 +154,6 @@ typedef struct compat_xfs_flock64 { __s32 l_pad[4]; /* reserve area */ } compat_xfs_flock64_t; -#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) -#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) -#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) -#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) #define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) #define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) #define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) -- cgit v1.2.3-58-ga151 From a21864486f7e220bd5938c6fb637613d9635739a Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 5 Jan 2022 13:15:09 +0800 Subject: KVM: x86/pmu: Fix available_event_types check for REF_CPU_CYCLES event According to CPUID 0x0A.EBX bit vector, the event [7] should be the unrealized event "Topdown Slots" instead of the *kernel* generalized common hardware event "REF_CPU_CYCLES", so we need to skip the cpuid unavaliblity check in the intel_pmc_perf_hw_id() for the last REF_CPU_CYCLES event and update the confusing comment. If the event is marked as unavailable in the Intel guest CPUID 0AH.EBX leaf, we need to avoid any perf_event creation, whether it's a gp or fixed counter. To distinguish whether it is a rejected event or an event that needs to be programmed with PERF_TYPE_RAW type, a new special returned value of "PERF_COUNT_HW_MAX + 1" is introduced. Fixes: 62079d8a43128 ("KVM: PMU: add proper support for fixed counter 2") Signed-off-by: Like Xu Message-Id: <20220105051509.69437-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 3 +++ arch/x86/kvm/vmx/pmu_intel.c | 18 ++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 8abdadb7e22a..e632693a2266 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -109,6 +109,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .config = config, }; + if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX) + return; + attr.sample_period = get_sample_period(pmc, pmc->counter); if (in_tx) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 5e0ac57d6d1b..ffccfd9823c0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -21,7 +21,6 @@ #define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) static struct kvm_event_hw_type_mapping intel_arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, @@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + /* The above index must match CPUID 0x0A.EBX bit vector */ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES }, }; @@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc) u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; int i; - for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) - if (intel_arch_events[i].eventsel == event_select && - intel_arch_events[i].unit_mask == unit_mask && - (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i))) - break; + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) { + if (intel_arch_events[i].eventsel != event_select || + intel_arch_events[i].unit_mask != unit_mask) + continue; + + /* disable event that reported as not present by cpuid */ + if ((i < 7) && !(pmu->available_event_types & (1 << i))) + return PERF_COUNT_HW_MAX + 1; + + break; + } if (i == ARRAY_SIZE(intel_arch_events)) return PERF_COUNT_HW_MAX; -- cgit v1.2.3-58-ga151 From ee3a5f9e3d9bf94159f3cc80da542fbe83502dd8 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Jan 2022 16:05:39 +0100 Subject: KVM: x86: Do runtime CPUID update before updating vcpu->arch.cpuid_entries kvm_update_cpuid_runtime() mangles CPUID data coming from userspace VMM after updating 'vcpu->arch.cpuid_entries', this makes it impossible to compare an update with what was previously supplied. Introduce __kvm_update_cpuid_runtime() version which can be used to tweak the input before it goes to 'vcpu->arch.cpuid_entries' so the upcoming update check can compare tweaked data. No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220117150542.2176196-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index c55e57b30e81..812190a707f6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -145,14 +145,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) } } -static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu, + struct kvm_cpuid_entry2 *entries, int nent) { u32 base = vcpu->arch.kvm_cpuid_base; if (!base) return NULL; - return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); + return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0); +} + +static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +{ + return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries, + vcpu->arch.cpuid_nent); } void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) @@ -167,11 +174,12 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) vcpu->arch.pv_cpuid.features = best->eax; } -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries, + int nent) { struct kvm_cpuid_entry2 *best; - best = kvm_find_cpuid_entry(vcpu, 1, 0); + best = cpuid_entry2_find(entries, nent, 1, 0); if (best) { /* Update OSXSAVE bit */ if (boot_cpu_has(X86_FEATURE_XSAVE)) @@ -182,33 +190,38 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); } - best = kvm_find_cpuid_entry(vcpu, 7, 0); + best = cpuid_entry2_find(entries, nent, 7, 0); if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) cpuid_entry_change(best, X86_FEATURE_OSPKE, kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + best = cpuid_entry2_find(entries, nent, 0xD, 0); if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - best = kvm_find_cpuid_entry(vcpu, 0xD, 1); + best = cpuid_entry2_find(entries, nent, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = kvm_find_kvm_cpuid_features(vcpu); + best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { - best = kvm_find_cpuid_entry(vcpu, 0x1, 0); + best = cpuid_entry2_find(entries, nent, 0x1, 0); if (best) cpuid_entry_change(best, X86_FEATURE_MWAIT, vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } } + +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +{ + __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); +} EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime); static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) @@ -298,6 +311,8 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, { int r; + __kvm_update_cpuid_runtime(vcpu, e2, nent); + r = kvm_check_cpuid(vcpu, e2, nent); if (r) return r; @@ -307,7 +322,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, vcpu->arch.cpuid_nent = nent; kvm_update_kvm_cpuid_base(vcpu); - kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); return 0; -- cgit v1.2.3-58-ga151 From c6617c61e8fe44b9e9fdfede921f61cac6b5149d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Jan 2022 16:05:40 +0100 Subject: KVM: x86: Partially allow KVM_SET_CPUID{,2} after KVM_RUN Commit feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN") forbade changing CPUID altogether but unfortunately this is not fully compatible with existing VMMs. In particular, QEMU reuses vCPU fds for CPU hotplug after unplug and it calls KVM_SET_CPUID2. Instead of full ban, check whether the supplied CPUID data is equal to what was previously set. Reported-by: Igor Mammedov Fixes: feb627e8d6f6 ("KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN") Signed-off-by: Vitaly Kuznetsov Message-Id: <20220117150542.2176196-3-vkuznets@redhat.com> Cc: stable@vger.kernel.org [Do not call kvm_find_cpuid_entry repeatedly. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 19 ------------------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 812190a707f6..7eb046d907c6 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -119,6 +119,28 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu, return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures); } +/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */ +static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, + int nent) +{ + struct kvm_cpuid_entry2 *orig; + int i; + + if (nent != vcpu->arch.cpuid_nent) + return -EINVAL; + + for (i = 0; i < nent; i++) { + orig = &vcpu->arch.cpuid_entries[i]; + if (e2[i].function != orig->function || + e2[i].index != orig->index || + e2[i].eax != orig->eax || e2[i].ebx != orig->ebx || + e2[i].ecx != orig->ecx || e2[i].edx != orig->edx) + return -EINVAL; + } + + return 0; +} + static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) { u32 function; @@ -313,6 +335,20 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, __kvm_update_cpuid_runtime(vcpu, e2, nent); + /* + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page + * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with + * the core vCPU model on the fly. It would've been better to forbid any + * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately + * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do + * KVM_SET_CPUID{,2} again. To support this legacy behavior, check + * whether the supplied CPUID data is equal to what's already set. + */ + if (vcpu->arch.last_vmentry_cpu != -1) + return kvm_cpuid_check_equal(vcpu, e2, nent); + r = kvm_check_cpuid(vcpu, e2, nent); if (r) return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 60da2331ec32..a0bc637348b2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5230,17 +5230,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; - /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with - * the core vCPU model on the fly, so fail. - */ - r = -EINVAL; - if (vcpu->arch.last_vmentry_cpu != -1) - goto out; - r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; @@ -5251,14 +5240,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; - /* - * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in - * KVM_SET_CPUID case above. - */ - r = -EINVAL; - if (vcpu->arch.last_vmentry_cpu != -1) - goto out; - r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; -- cgit v1.2.3-58-ga151 From 9e6d484f9991176269607bb3c54a494e32eab27a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Jan 2022 16:05:41 +0100 Subject: KVM: selftests: Rename 'get_cpuid_test' to 'cpuid_test' In preparation to reusing the existing 'get_cpuid_test' for testing "KVM_SET_CPUID{,2} after KVM_RUN" rename it to 'cpuid_test' to avoid the confusion. No functional change intended. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220117150542.2176196-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 2 +- tools/testing/selftests/kvm/Makefile | 4 +- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 179 +++++++++++++++++++++ .../testing/selftests/kvm/x86_64/get_cpuid_test.c | 179 --------------------- 4 files changed, 182 insertions(+), 182 deletions(-) create mode 100644 tools/testing/selftests/kvm/x86_64/cpuid_test.c delete mode 100644 tools/testing/selftests/kvm/x86_64/get_cpuid_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 8c129961accf..20b4c921c9a2 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -8,11 +8,11 @@ /s390x/memop /s390x/resets /s390x/sync_regs_test +/x86_64/cpuid_test /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs /x86_64/evmcs_test /x86_64/emulator_error_test -/x86_64/get_cpuid_test /x86_64/get_msr_index_features /x86_64/kvm_clock_test /x86_64/kvm_pv_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ee8cf2149824..ec78a8692899 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -43,11 +43,11 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c lib/aarch64/handler LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c LIBKVM_riscv = lib/riscv/processor.c lib/riscv/ucall.c -TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test +TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/emulator_error_test -TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/hyperv_features diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c new file mode 100644 index 000000000000..a711f83749ea --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat Inc. + * + * Generic tests for KVM CPUID set/get ioctls + */ +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define VCPU_ID 0 + +/* CPUIDs known to differ */ +struct { + u32 function; + u32 index; +} mangled_cpuids[] = { + /* + * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR, + * which are not controlled for by this test. + */ + {.function = 0xd, .index = 0}, + {.function = 0xd, .index = 1}, +}; + +static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) +{ + int i; + u32 eax, ebx, ecx, edx; + + for (i = 0; i < guest_cpuid->nent; i++) { + eax = guest_cpuid->entries[i].function; + ecx = guest_cpuid->entries[i].index; + + cpuid(&eax, &ebx, &ecx, &edx); + + GUEST_ASSERT(eax == guest_cpuid->entries[i].eax && + ebx == guest_cpuid->entries[i].ebx && + ecx == guest_cpuid->entries[i].ecx && + edx == guest_cpuid->entries[i].edx); + } + +} + +static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid) +{ + u32 eax = 0x40000000, ebx, ecx = 0, edx; + + cpuid(&eax, &ebx, &ecx, &edx); + + GUEST_ASSERT(eax == 0x40000001); +} + +static void guest_main(struct kvm_cpuid2 *guest_cpuid) +{ + GUEST_SYNC(1); + + test_guest_cpuids(guest_cpuid); + + GUEST_SYNC(2); + + test_cpuid_40000000(guest_cpuid); + + GUEST_DONE(); +} + +static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie) +{ + int i; + + for (i = 0; i < sizeof(mangled_cpuids); i++) { + if (mangled_cpuids[i].function == entrie->function && + mangled_cpuids[i].index == entrie->index) + return true; + } + + return false; +} + +static void check_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *entrie) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + if (cpuid->entries[i].function == entrie->function && + cpuid->entries[i].index == entrie->index) { + if (is_cpuid_mangled(entrie)) + return; + + TEST_ASSERT(cpuid->entries[i].eax == entrie->eax && + cpuid->entries[i].ebx == entrie->ebx && + cpuid->entries[i].ecx == entrie->ecx && + cpuid->entries[i].edx == entrie->edx, + "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x", + entrie->function, entrie->index, + cpuid->entries[i].eax, cpuid->entries[i].ebx, + cpuid->entries[i].ecx, cpuid->entries[i].edx, + entrie->eax, entrie->ebx, entrie->ecx, entrie->edx); + return; + } + } + + TEST_ASSERT(false, "CPUID 0x%x.%x not found", entrie->function, entrie->index); +} + +static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2) +{ + int i; + + for (i = 0; i < cpuid1->nent; i++) + check_cpuid(cpuid2, &cpuid1->entries[i]); + + for (i = 0; i < cpuid2->nent; i++) + check_cpuid(cpuid1, &cpuid2->entries[i]); +} + +static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) +{ + struct ucall uc; + + _vcpu_run(vm, vcpuid); + + switch (get_ucall(vm, vcpuid, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, + "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + return; + case UCALL_DONE: + return; + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", (const char *)uc.args[0], + __FILE__, uc.args[1], uc.args[2], uc.args[3]); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + } +} + +struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid) +{ + int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]); + vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR); + struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva); + + memcpy(guest_cpuids, cpuid, size); + + *p_gva = gva; + return guest_cpuids; +} + +int main(void) +{ + struct kvm_cpuid2 *supp_cpuid, *cpuid2; + vm_vaddr_t cpuid_gva; + struct kvm_vm *vm; + int stage; + + vm = vm_create_default(VCPU_ID, 0, guest_main); + + supp_cpuid = kvm_get_supported_cpuid(); + cpuid2 = vcpu_get_cpuid(vm, VCPU_ID); + + compare_cpuids(supp_cpuid, cpuid2); + + vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2); + + vcpu_args_set(vm, VCPU_ID, 1, cpuid_gva); + + for (stage = 0; stage < 3; stage++) + run_vcpu(vm, VCPU_ID, stage); + + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c b/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c deleted file mode 100644 index a711f83749ea..000000000000 --- a/tools/testing/selftests/kvm/x86_64/get_cpuid_test.c +++ /dev/null @@ -1,179 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2021, Red Hat Inc. - * - * Generic tests for KVM CPUID set/get ioctls - */ -#include -#include -#include - -#include "test_util.h" -#include "kvm_util.h" -#include "processor.h" - -#define VCPU_ID 0 - -/* CPUIDs known to differ */ -struct { - u32 function; - u32 index; -} mangled_cpuids[] = { - /* - * These entries depend on the vCPU's XCR0 register and IA32_XSS MSR, - * which are not controlled for by this test. - */ - {.function = 0xd, .index = 0}, - {.function = 0xd, .index = 1}, -}; - -static void test_guest_cpuids(struct kvm_cpuid2 *guest_cpuid) -{ - int i; - u32 eax, ebx, ecx, edx; - - for (i = 0; i < guest_cpuid->nent; i++) { - eax = guest_cpuid->entries[i].function; - ecx = guest_cpuid->entries[i].index; - - cpuid(&eax, &ebx, &ecx, &edx); - - GUEST_ASSERT(eax == guest_cpuid->entries[i].eax && - ebx == guest_cpuid->entries[i].ebx && - ecx == guest_cpuid->entries[i].ecx && - edx == guest_cpuid->entries[i].edx); - } - -} - -static void test_cpuid_40000000(struct kvm_cpuid2 *guest_cpuid) -{ - u32 eax = 0x40000000, ebx, ecx = 0, edx; - - cpuid(&eax, &ebx, &ecx, &edx); - - GUEST_ASSERT(eax == 0x40000001); -} - -static void guest_main(struct kvm_cpuid2 *guest_cpuid) -{ - GUEST_SYNC(1); - - test_guest_cpuids(guest_cpuid); - - GUEST_SYNC(2); - - test_cpuid_40000000(guest_cpuid); - - GUEST_DONE(); -} - -static bool is_cpuid_mangled(struct kvm_cpuid_entry2 *entrie) -{ - int i; - - for (i = 0; i < sizeof(mangled_cpuids); i++) { - if (mangled_cpuids[i].function == entrie->function && - mangled_cpuids[i].index == entrie->index) - return true; - } - - return false; -} - -static void check_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *entrie) -{ - int i; - - for (i = 0; i < cpuid->nent; i++) { - if (cpuid->entries[i].function == entrie->function && - cpuid->entries[i].index == entrie->index) { - if (is_cpuid_mangled(entrie)) - return; - - TEST_ASSERT(cpuid->entries[i].eax == entrie->eax && - cpuid->entries[i].ebx == entrie->ebx && - cpuid->entries[i].ecx == entrie->ecx && - cpuid->entries[i].edx == entrie->edx, - "CPUID 0x%x.%x differ: 0x%x:0x%x:0x%x:0x%x vs 0x%x:0x%x:0x%x:0x%x", - entrie->function, entrie->index, - cpuid->entries[i].eax, cpuid->entries[i].ebx, - cpuid->entries[i].ecx, cpuid->entries[i].edx, - entrie->eax, entrie->ebx, entrie->ecx, entrie->edx); - return; - } - } - - TEST_ASSERT(false, "CPUID 0x%x.%x not found", entrie->function, entrie->index); -} - -static void compare_cpuids(struct kvm_cpuid2 *cpuid1, struct kvm_cpuid2 *cpuid2) -{ - int i; - - for (i = 0; i < cpuid1->nent; i++) - check_cpuid(cpuid2, &cpuid1->entries[i]); - - for (i = 0; i < cpuid2->nent; i++) - check_cpuid(cpuid1, &cpuid2->entries[i]); -} - -static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) -{ - struct ucall uc; - - _vcpu_run(vm, vcpuid); - - switch (get_ucall(vm, vcpuid, &uc)) { - case UCALL_SYNC: - TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage + 1, - "Stage %d: Unexpected register values vmexit, got %lx", - stage + 1, (ulong)uc.args[1]); - return; - case UCALL_DONE: - return; - case UCALL_ABORT: - TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", (const char *)uc.args[0], - __FILE__, uc.args[1], uc.args[2], uc.args[3]); - default: - TEST_ASSERT(false, "Unexpected exit: %s", - exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); - } -} - -struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct kvm_cpuid2 *cpuid) -{ - int size = sizeof(*cpuid) + cpuid->nent * sizeof(cpuid->entries[0]); - vm_vaddr_t gva = vm_vaddr_alloc(vm, size, KVM_UTIL_MIN_VADDR); - struct kvm_cpuid2 *guest_cpuids = addr_gva2hva(vm, gva); - - memcpy(guest_cpuids, cpuid, size); - - *p_gva = gva; - return guest_cpuids; -} - -int main(void) -{ - struct kvm_cpuid2 *supp_cpuid, *cpuid2; - vm_vaddr_t cpuid_gva; - struct kvm_vm *vm; - int stage; - - vm = vm_create_default(VCPU_ID, 0, guest_main); - - supp_cpuid = kvm_get_supported_cpuid(); - cpuid2 = vcpu_get_cpuid(vm, VCPU_ID); - - compare_cpuids(supp_cpuid, cpuid2); - - vcpu_alloc_cpuid(vm, &cpuid_gva, cpuid2); - - vcpu_args_set(vm, VCPU_ID, 1, cpuid_gva); - - for (stage = 0; stage < 3; stage++) - run_vcpu(vm, VCPU_ID, stage); - - kvm_vm_free(vm); -} -- cgit v1.2.3-58-ga151 From ecebb966acaab2466d9857d1cc435ee1fc9eee50 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 17 Jan 2022 16:05:42 +0100 Subject: KVM: selftests: Test KVM_SET_CPUID2 after KVM_RUN KVM forbids KVM_SET_CPUID2 after KVM_RUN was performed on a vCPU unless the supplied CPUID data is equal to what was previously set. Test this. Signed-off-by: Vitaly Kuznetsov Message-Id: <20220117150542.2176196-5-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 7 +++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 33 +++++++++++++++++++--- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 30 ++++++++++++++++++++ 3 files changed, 66 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index e94ba0fc67d8..bb013d101c14 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -375,6 +375,8 @@ uint64_t kvm_get_feature_msr(uint64_t msr_index); struct kvm_cpuid2 *kvm_get_supported_cpuid(void); struct kvm_cpuid2 *vcpu_get_cpuid(struct kvm_vm *vm, uint32_t vcpuid); +int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_cpuid2 *cpuid); void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid); @@ -418,6 +420,11 @@ uint64_t vm_get_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr); void vm_set_page_table_entry(struct kvm_vm *vm, int vcpuid, uint64_t vaddr, uint64_t pte); +/* + * get_cpuid() - find matching CPUID entry and return pointer to it. + */ +struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function, + uint32_t index); /* * set_cpuid() - overwrites a matching cpuid entry with the provided value. * matches based on ent->function && ent->index. returns true diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index babb0f28575c..d61e2326dc85 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -886,6 +886,17 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) return entry; } + +int __vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_cpuid2 *cpuid) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + + return ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); +} + /* * VM VCPU CPUID Set * @@ -903,12 +914,9 @@ kvm_get_supported_cpuid_index(uint32_t function, uint32_t index) void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_cpuid2 *cpuid) { - struct vcpu *vcpu = vcpu_find(vm, vcpuid); int rc; - TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); - - rc = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid); + rc = __vcpu_set_cpuid(vm, vcpuid, cpuid); TEST_ASSERT(rc == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i", rc, errno); @@ -1384,6 +1392,23 @@ void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid) } } +struct kvm_cpuid_entry2 *get_cpuid(struct kvm_cpuid2 *cpuid, uint32_t function, + uint32_t index) +{ + int i; + + for (i = 0; i < cpuid->nent; i++) { + struct kvm_cpuid_entry2 *cur = &cpuid->entries[i]; + + if (cur->function == function && cur->index == index) + return cur; + } + + TEST_FAIL("CPUID function 0x%x index 0x%x not found ", function, index); + + return NULL; +} + bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent) { diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index a711f83749ea..16d2465c5634 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -154,6 +154,34 @@ struct kvm_cpuid2 *vcpu_alloc_cpuid(struct kvm_vm *vm, vm_vaddr_t *p_gva, struct return guest_cpuids; } +static void set_cpuid_after_run(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid) +{ + struct kvm_cpuid_entry2 *ent; + int rc; + u32 eax, ebx, x; + + /* Setting unmodified CPUID is allowed */ + rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); + TEST_ASSERT(!rc, "Setting unmodified CPUID after KVM_RUN failed: %d", rc); + + /* Changing CPU features is forbidden */ + ent = get_cpuid(cpuid, 0x7, 0); + ebx = ent->ebx; + ent->ebx--; + rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); + TEST_ASSERT(rc, "Changing CPU features should fail"); + ent->ebx = ebx; + + /* Changing MAXPHYADDR is forbidden */ + ent = get_cpuid(cpuid, 0x80000008, 0); + eax = ent->eax; + x = eax & 0xff; + ent->eax = (eax & ~0xffu) | (x - 1); + rc = __vcpu_set_cpuid(vm, VCPU_ID, cpuid); + TEST_ASSERT(rc, "Changing MAXPHYADDR should fail"); + ent->eax = eax; +} + int main(void) { struct kvm_cpuid2 *supp_cpuid, *cpuid2; @@ -175,5 +203,7 @@ int main(void) for (stage = 0; stage < 3; stage++) run_vcpu(vm, VCPU_ID, stage); + set_cpuid_after_run(vm, cpuid2); + kvm_vm_free(vm); } -- cgit v1.2.3-58-ga151 From 4732f2444acd9b7eabc744f20eb5cc9694c0bfbd Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 11 Jan 2022 15:38:23 +0800 Subject: KVM: x86: Making the module parameter of vPMU more common The new module parameter to control PMU virtualization should apply to Intel as well as AMD, for situations where userspace is not trusted. If the module parameter allows PMU virtualization, there could be a new KVM_CAP or guest CPUID bits whereby userspace can enable/disable PMU virtualization on a per-VM basis. If the module parameter does not allow PMU virtualization, there should be no userspace override, since we have no precedent for authorizing that kind of override. If it's false, other counter-based profiling features (such as LBR including the associated CPUID bits if any) will not be exposed. Change its name from "pmu" to "enable_pmu" as we have temporary variables with the same name in our code like "struct kvm_pmu *pmu". Fixes: b1d66dad65dc ("KVM: x86/svm: Add module param to control PMU virtualization") Suggested-by : Jim Mattson Signed-off-by: Like Xu Message-Id: <20220111073823.21885-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 6 +++--- arch/x86/kvm/svm/pmu.c | 2 +- arch/x86/kvm/svm/svm.c | 8 ++------ arch/x86/kvm/svm/svm.h | 1 - arch/x86/kvm/vmx/capabilities.h | 4 ++++ arch/x86/kvm/vmx/pmu_intel.c | 2 +- arch/x86/kvm/x86.c | 5 +++++ arch/x86/kvm/x86.h | 1 + 8 files changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7eb046d907c6..a7c280d2113b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -845,10 +845,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) perf_get_x86_pmu_capability(&cap); /* - * Only support guest architectural pmu on a host - * with architectural pmu. + * The guest architecture pmu is only supported if the architecture + * pmu exists on the host and the module parameters allow it. */ - if (!cap.version) + if (!cap.version || !enable_pmu) memset(&cap, 0, sizeof(cap)); eax.split.version_id = min(cap.version, 2); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 12d8b301065a..5aa45f13b16d 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, { struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - if (!pmu) + if (!enable_pmu) return NULL; switch (msr) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index c3d9006478a4..d8cac84fb2dc 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -192,10 +192,6 @@ module_param(vgif, int, 0444); static int lbrv = true; module_param(lbrv, int, 0444); -/* enable/disable PMU virtualization */ -bool pmu = true; -module_param(pmu, bool, 0444); - static int tsc_scaling = true; module_param(tsc_scaling, int, 0444); @@ -957,7 +953,7 @@ static __init void svm_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); /* AMD PMU PERFCTR_CORE CPUID */ - if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); /* CPUID 0x8000001F (SME/SEV features) */ @@ -1093,7 +1089,7 @@ static __init int svm_hardware_setup(void) pr_info("LBR virtualization supported\n"); } - if (!pmu) + if (!enable_pmu) pr_info("PMU virtualization is disabled\n"); svm_set_cpu_caps(); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 9f153c59f2c8..6d29421dd4bf 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -32,7 +32,6 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; extern bool npt_enabled; extern bool intercept_smi; -extern bool pmu; /* * Clean bits in VMCB. diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index c8029b7845b6..959b59d13b5a 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -5,6 +5,7 @@ #include #include "lapic.h" +#include "x86.h" extern bool __read_mostly enable_vpid; extern bool __read_mostly flexpriority_enabled; @@ -389,6 +390,9 @@ static inline u64 vmx_get_perf_capabilities(void) { u64 perf_cap = 0; + if (!enable_pmu) + return perf_cap; + if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index ffccfd9823c0..466d18fc0c5d 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -487,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) + if (!entry || !enable_pmu) return; eax.full = entry->eax; edx.full = entry->edx; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a0bc637348b2..52df8e6eaa57 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -187,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); +/* Enable/disable PMU virtualization */ +bool __read_mostly enable_pmu = true; +EXPORT_SYMBOL_GPL(enable_pmu); +module_param(enable_pmu, bool, 0444); + /* * Restoring the host value for MSRs that are only consumed when running in * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index da7031e80f23..1ebd5a7594da 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -336,6 +336,7 @@ extern u64 host_xcr0; extern u64 supported_xcr0; extern u64 host_xss; extern u64 supported_xss; +extern bool enable_pmu; static inline bool kvm_mpx_supported(void) { -- cgit v1.2.3-58-ga151 From e3548aaf41a200c2af359462be23bcdd76efd795 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Mon, 17 Jan 2022 00:20:47 -0600 Subject: cifs: free ntlmsspblob allocated in negotiate One of my previous fixes: cifs: send workstation name during ntlmssp session setup ...changed the prototype of build_ntlmssp_negotiate_blob from being allocated by the caller to being allocated within the function. The caller needs to free this object too. While SMB2 version of the caller did it, I forgot to free for the SMB1 version. Fixing that here. Fixes: 49bd49f983b5 ("cifs: send workstation name during ntlmssp session setup") Cc: stable@vger.kernel.org # 5.16 Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/sess.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d12490e12be5..ffaa091e41a4 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1413,7 +1413,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) &blob_len, ses, server, sess_data->nls_cp); if (rc) - goto out; + goto out_free_ntlmsspblob; sess_data->iov[1].iov_len = blob_len; sess_data->iov[1].iov_base = ntlmsspblob; @@ -1421,7 +1421,7 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = _sess_auth_rawntlmssp_assemble_req(sess_data); if (rc) - goto out; + goto out_free_ntlmsspblob; rc = sess_sendreceive(sess_data); @@ -1435,14 +1435,14 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) rc = 0; if (rc) - goto out; + goto out_free_ntlmsspblob; cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); if (smb_buf->WordCount != 4) { rc = -EIO; cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto out; + goto out_free_ntlmsspblob; } ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ @@ -1456,10 +1456,13 @@ sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data) cifs_dbg(VFS, "bad security blob length %d\n", blob_len); rc = -EINVAL; - goto out; + goto out_free_ntlmsspblob; } rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses); + +out_free_ntlmsspblob: + kfree(ntlmsspblob); out: sess_free_buffer(sess_data); -- cgit v1.2.3-58-ga151 From 74ce6135ae6ef482715cff2ccd703b7295f900f2 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 17 Jan 2022 09:11:30 +0800 Subject: cifs: clean up an inconsistent indenting Eliminate the follow smatch warning: fs/cifs/sess.c:1581 sess_auth_rawntlmssp_authenticate() warn: inconsistent indenting Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Steve French --- fs/cifs/sess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index ffaa091e41a4..50ca479d7039 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1577,7 +1577,7 @@ out_free_ntlmsspblob: out: sess_free_buffer(sess_data); - if (!rc) + if (!rc) rc = sess_establish_session(sess_data); /* Cleanup */ -- cgit v1.2.3-58-ga151 From 5f51c7ce1dc36565296b3ef342585f70ec72a2a9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Jan 2022 19:26:32 +0100 Subject: ACPI: CPPC: Fix up I/O port access in cpc_read() The code as currently implemented does not work on big endian systems, so fix it up. Fixes: a2c8f92bea5f ("ACPI: CPPC: Implement support for SystemIO registers") Reported-by: Dan Carpenter Suggested-by: Dan Carpenter Link: https://lore.kernel.org/linux-acpi/20220111092928.GA24968@kili/ Signed-off-by: Rafael J. Wysocki Acked-by: Huang Rui --- drivers/acpi/cppc_acpi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index a9d2de403c0c..5bc55322e425 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -929,16 +929,18 @@ static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val) if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { u32 width = 8 << (reg->access_width - 1); + u32 val_u32; acpi_status status; status = acpi_os_read_port((acpi_io_address)reg->address, - (u32 *)val, width); + &val_u32, width); if (ACPI_FAILURE(status)) { pr_debug("Error: Failed to read SystemIO port %llx\n", reg->address); return -EFAULT; } + *val = val_u32; return 0; } else if (reg->space_id == ACPI_ADR_SPACE_PLATFORM_COMM && pcc_ss_id >= 0) vaddr = GET_PCC_VADDR(reg->address, pcc_ss_id); -- cgit v1.2.3-58-ga151 From f684b10751287e8b7e15e8f10ae40bdd74149bba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Jan 2022 19:27:22 +0100 Subject: ACPI: CPPC: Drop redundant local variable from cpc_read() The ret_val local variable in cpc_read() is not necessary, so eliminate it. No functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Huang Rui --- drivers/acpi/cppc_acpi.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 5bc55322e425..866560cbb082 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -915,14 +915,13 @@ int __weak cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val) { - int ret_val = 0; void __iomem *vaddr = NULL; int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu); struct cpc_reg *reg = ®_res->cpc_entry.reg; if (reg_res->type == ACPI_TYPE_INTEGER) { *val = reg_res->cpc_entry.int_value; - return ret_val; + return 0; } *val = 0; @@ -968,10 +967,10 @@ static int cpc_read(int cpu, struct cpc_register_resource *reg_res, u64 *val) default: pr_debug("Error: Cannot read %u bit width from PCC for ss: %d\n", reg->bit_width, pcc_ss_id); - ret_val = -EFAULT; + return -EFAULT; } - return ret_val; + return 0; } static int cpc_write(int cpu, struct cpc_register_resource *reg_res, u64 val) -- cgit v1.2.3-58-ga151 From a510c78e5b6f2f9b9add88071fdfc2b740d2e356 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 14 Jan 2022 15:24:33 -0800 Subject: ACPI: DPTF: Support Raptor Lake Add Raptor Lake ACPI IDs for DPTF devices. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/acpi/dptf/dptf_pch_fivr.c | 1 + drivers/acpi/dptf/dptf_power.c | 2 ++ drivers/acpi/dptf/int340x_thermal.c | 6 ++++++ drivers/acpi/fan.h | 1 + 4 files changed, 10 insertions(+) diff --git a/drivers/acpi/dptf/dptf_pch_fivr.c b/drivers/acpi/dptf/dptf_pch_fivr.c index e7ab0fc90db9..c0da24c9f8c3 100644 --- a/drivers/acpi/dptf/dptf_pch_fivr.c +++ b/drivers/acpi/dptf/dptf_pch_fivr.c @@ -151,6 +151,7 @@ static int pch_fivr_remove(struct platform_device *pdev) static const struct acpi_device_id pch_fivr_device_ids[] = { {"INTC1045", 0}, {"INTC1049", 0}, + {"INTC10A3", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, pch_fivr_device_ids); diff --git a/drivers/acpi/dptf/dptf_power.c b/drivers/acpi/dptf/dptf_power.c index a24d5d7aa117..dc1f52a5b3f4 100644 --- a/drivers/acpi/dptf/dptf_power.c +++ b/drivers/acpi/dptf/dptf_power.c @@ -231,6 +231,8 @@ static const struct acpi_device_id int3407_device_ids[] = { {"INTC1050", 0}, {"INTC1060", 0}, {"INTC1061", 0}, + {"INTC10A4", 0}, + {"INTC10A5", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, int3407_device_ids); diff --git a/drivers/acpi/dptf/int340x_thermal.c b/drivers/acpi/dptf/int340x_thermal.c index da5d5f0be2f2..42a556346548 100644 --- a/drivers/acpi/dptf/int340x_thermal.c +++ b/drivers/acpi/dptf/int340x_thermal.c @@ -37,6 +37,12 @@ static const struct acpi_device_id int340x_thermal_device_ids[] = { {"INTC1050"}, {"INTC1060"}, {"INTC1061"}, + {"INTC10A0"}, + {"INTC10A1"}, + {"INTC10A2"}, + {"INTC10A3"}, + {"INTC10A4"}, + {"INTC10A5"}, {""}, }; diff --git a/drivers/acpi/fan.h b/drivers/acpi/fan.h index dc9a6efa514b..dd9bb8ca2244 100644 --- a/drivers/acpi/fan.h +++ b/drivers/acpi/fan.h @@ -10,4 +10,5 @@ {"INT3404", }, /* Fan */ \ {"INTC1044", }, /* Fan for Tiger Lake generation */ \ {"INTC1048", }, /* Fan for Alder Lake generation */ \ + {"INTC10A2", }, /* Fan for Raptor Lake generation */ \ {"PNP0C0B", } /* Generic ACPI fan */ -- cgit v1.2.3-58-ga151 From a95be874d26bed7c12a87b5493d5dac9281f707f Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 14 Jan 2022 15:24:34 -0800 Subject: thermal: int340x: Support Raptor Lake Add Raptor Lake ACPI IDs for DPTF devices. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/int340x_thermal/int3400_thermal.c | 1 + drivers/thermal/intel/int340x_thermal/int3403_thermal.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c index 8502b7d8df89..72acb1f61849 100644 --- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c @@ -596,6 +596,7 @@ static const struct acpi_device_id int3400_thermal_match[] = { {"INT3400", 0}, {"INTC1040", 0}, {"INTC1041", 0}, + {"INTC10A0", 0}, {} }; diff --git a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c index c3c4c4d34542..07e25321dfe3 100644 --- a/drivers/thermal/intel/int340x_thermal/int3403_thermal.c +++ b/drivers/thermal/intel/int340x_thermal/int3403_thermal.c @@ -285,6 +285,7 @@ static const struct acpi_device_id int3403_device_ids[] = { {"INT3403", 0}, {"INTC1043", 0}, {"INTC1046", 0}, + {"INTC10A1", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, int3403_device_ids); -- cgit v1.2.3-58-ga151 From e5b54867f47f765fcb439e09ed763b5de617af3e Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 14 Jan 2022 15:24:35 -0800 Subject: thermal: int340x: Add Raptor Lake PCI device id Add Raptor Lake PCI ID for processor thermal device. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/thermal/intel/int340x_thermal/processor_thermal_device.h | 1 + drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h index 9b2a64ef55d0..49932a68abac 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h @@ -24,6 +24,7 @@ #define PCI_DEVICE_ID_INTEL_HSB_THERMAL 0x0A03 #define PCI_DEVICE_ID_INTEL_ICL_THERMAL 0x8a03 #define PCI_DEVICE_ID_INTEL_JSL_THERMAL 0x4E03 +#define PCI_DEVICE_ID_INTEL_RPL_THERMAL 0xA71D #define PCI_DEVICE_ID_INTEL_SKL_THERMAL 0x1903 #define PCI_DEVICE_ID_INTEL_TGL_THERMAL 0x9A03 diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index b4bcd3fe9eb2..ca40b0967cdd 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -358,6 +358,7 @@ static SIMPLE_DEV_PM_OPS(proc_thermal_pci_pm, proc_thermal_pci_suspend, static const struct pci_device_id proc_thermal_pci_ids[] = { { PCI_DEVICE_DATA(INTEL, ADL_THERMAL, PROC_THERMAL_FEATURE_RAPL | PROC_THERMAL_FEATURE_FIVR | PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_MBOX) }, + { PCI_DEVICE_DATA(INTEL, RPL_THERMAL, PROC_THERMAL_FEATURE_RAPL | PROC_THERMAL_FEATURE_FIVR | PROC_THERMAL_FEATURE_DVFS | PROC_THERMAL_FEATURE_MBOX) }, { }, }; -- cgit v1.2.3-58-ga151 From 7eacba3b00a3c35c1ad189f543b1995dd0bdca9c Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Fri, 14 Jan 2022 22:53:00 +0300 Subject: cifs: alloc_path_with_tree_prefix: do not append sep. if the path is empty alloc_path_with_tree_prefix() concatenates tree prefix and the path. Windows CIFS client does not add separator after the tree prefix if the path is empty. Let's do the same. This fixes mounting DFS namespaces with names containing non-ASCII symbols. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215440 Signed-off-by: Eugene Korenevsky Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8d471df69c59..625e3e9cb614 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2587,8 +2587,13 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, cp = load_nls_default(); cifs_strtoUTF16(*out_path, treename, treename_len, cp); - UniStrcat(*out_path, sep); - UniStrcat(*out_path, path); + + /* Do not append the separator if the path is empty */ + if (path[0] != cpu_to_le16(0x0000)) { + UniStrcat(*out_path, sep); + UniStrcat(*out_path, path); + } + unload_nls(cp); return 0; -- cgit v1.2.3-58-ga151 From a2809d0e16963fdf3984409e47f145cccb0c6821 Mon Sep 17 00:00:00 2001 From: Eugene Korenevsky Date: Fri, 14 Jan 2022 22:53:40 +0300 Subject: cifs: quirk for STATUS_OBJECT_NAME_INVALID returned for non-ASCII dfs refs Windows SMB server responds with STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request for "\\\" DFS reference, where contains non-ASCII unicode symbols. Check such DFS reference and emulate -EREMOTE if it is actual. BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=215440 Signed-off-by: Eugene Korenevsky Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 5 +++++ fs/cifs/connect.c | 5 +++++ fs/cifs/inode.c | 6 ++++++ fs/cifs/misc.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e0dc147e69a8..f2029bc46215 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -647,6 +647,11 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, int match_target_ip(struct TCP_Server_Info *server, const char *share, size_t share_len, bool *result); + +int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *dfs_link_path); #endif static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0f36deff790e..accce1b351c6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3374,6 +3374,11 @@ static int is_path_remote(struct mount_ctx *mnt_ctx) rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, full_path); +#ifdef CONFIG_CIFS_DFS_UPCALL + if (rc == -ENOENT && is_tcon_dfs(tcon)) + rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb, + full_path); +#endif if (rc != 0 && rc != -EREMOTE) { kfree(full_path); return rc; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 279622e4eb1c..baa197edd8c5 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -952,6 +952,12 @@ cifs_get_inode_info(struct inode **inode, rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, tmp_data, &adjust_tz, &is_reparse_point); +#ifdef CONFIG_CIFS_DFS_UPCALL + if (rc == -ENOENT && is_tcon_dfs(tcon)) + rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, + cifs_sb, + full_path); +#endif data = tmp_data; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 5148d48d6a35..56598f7dbe00 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1302,4 +1302,53 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; return 0; } + +/** cifs_dfs_query_info_nonascii_quirk + * Handle weird Windows SMB server behaviour. It responds with + * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request + * for "\\\" DFS reference, + * where contains non-ASCII unicode symbols. + * + * Check such DFS reference and emulate -ENOENT if it is actual. + */ +int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid, + struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const char *linkpath) +{ + char *treename, *dfspath, sep; + int treenamelen, linkpathlen, rc; + + treename = tcon->treeName; + /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL + * messages MUST be encoded with exactly one leading backslash, not two + * leading backslashes. + */ + sep = CIFS_DIR_SEP(cifs_sb); + if (treename[0] == sep && treename[1] == sep) + treename++; + linkpathlen = strlen(linkpath); + treenamelen = strnlen(treename, MAX_TREE_SIZE + 1); + dfspath = kzalloc(treenamelen + linkpathlen + 1, GFP_KERNEL); + if (!dfspath) + return -ENOMEM; + if (treenamelen) + memcpy(dfspath, treename, treenamelen); + memcpy(dfspath + treenamelen, linkpath, linkpathlen); + rc = dfs_cache_find(xid, tcon->ses, cifs_sb->local_nls, + cifs_remap(cifs_sb), dfspath, NULL, NULL); + if (rc == 0) { + cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n", + dfspath); + rc = -EREMOTE; + } else if (rc == -EEXIST) { + cifs_dbg(FYI, "DFS ref '%s' is not found, emulate -ENOENT\n", + dfspath); + rc = -ENOENT; + } else { + cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc); + } + kfree(dfspath); + return rc; +} #endif -- cgit v1.2.3-58-ga151 From 6b0764598dc7dea21a44cd4e7ec3dd4a7aabf5c2 Mon Sep 17 00:00:00 2001 From: Wang Cheng Date: Sat, 15 Jan 2022 20:52:36 +0800 Subject: docs: staging/tee.rst: fix two typos found while reading Signed-off-by: Wang Cheng Reviewed-by: Sumit Garg Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20220115125236.34886-1-wanngchenng@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/staging/tee.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/staging/tee.rst b/Documentation/staging/tee.rst index 4d4b5f889603..4eec213eaefe 100644 --- a/Documentation/staging/tee.rst +++ b/Documentation/staging/tee.rst @@ -225,7 +225,7 @@ The following picture shows a high level overview of AMD-TEE:: +--------------------------+ +---------+--------------------+ At the lowest level (in x86), the AMD Secure Processor (ASP) driver uses the -CPU to PSP mailbox regsister to submit commands to the PSP. The format of the +CPU to PSP mailbox register to submit commands to the PSP. The format of the command buffer is opaque to the ASP driver. It's role is to submit commands to the secure processor and return results to AMD-TEE driver. The interface between AMD-TEE driver and AMD Secure Processor driver can be found in [6]. @@ -260,7 +260,7 @@ cancel_req driver callback is not supported by AMD-TEE. The GlobalPlatform TEE Client API [5] can be used by the user space (client) to talk to AMD's TEE. AMD's TEE provides a secure environment for loading, opening -a session, invoking commands and clossing session with TA. +a session, invoking commands and closing session with TA. References ========== -- cgit v1.2.3-58-ga151 From cc2cf6796a90bf356a8ddc854f65cea434477ea7 Mon Sep 17 00:00:00 2001 From: Huichun Feng Date: Tue, 11 Jan 2022 13:20:01 +0800 Subject: docs: ftrace: fix ambiguous sentence The sentence looks ambiguous, rephrase it by adding ", there". Signed-off-by: Huichun Feng Signed-off-by: Ching-Chun (Jim) Huang Signed-off-by: Chun-Hung Tseng Acked-by: Steven Rostedt Link: https://lore.kernel.org/r/20220111052000.2675944-1-foxhoundsk.tw@gmail.com Signed-off-by: Jonathan Corbet --- Documentation/trace/ftrace.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index b3166c4a7867..45b8c56af67a 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -3370,7 +3370,7 @@ one of the latency tracers, you will get the following results. Instances --------- -In the tracefs tracing directory is a directory called "instances". +In the tracefs tracing directory, there is a directory called "instances". This directory can have new directories created inside of it using mkdir, and removing directories with rmdir. The directory created with mkdir in this directory will already contain files and other -- cgit v1.2.3-58-ga151 From 2ba144e68edb4987a2fe1b1cf418b58cbdc4ee96 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Mon, 10 Jan 2022 22:44:56 +0100 Subject: dt-bindings: power: reset: gpio-restart: Correct default priority Commit bcd56fe1aa97 ("power: reset: gpio-restart: increase priority slightly") changed the default restart priority 129, but did not update the documentation. Correct this, so the driver and documentation have the same default value. Signed-off-by: Sander Vanheule Reviewed-by: Rob Herring Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220110214456.67087-1-sander@svanheule.net --- Documentation/devicetree/bindings/power/reset/gpio-restart.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/power/reset/gpio-restart.yaml b/Documentation/devicetree/bindings/power/reset/gpio-restart.yaml index 3dd22220cb5f..a72d5c721516 100644 --- a/Documentation/devicetree/bindings/power/reset/gpio-restart.yaml +++ b/Documentation/devicetree/bindings/power/reset/gpio-restart.yaml @@ -43,7 +43,7 @@ properties: priority: $ref: /schemas/types.yaml#/definitions/uint32 description: | - A priority ranging from 0 to 255 (default 128) according to the following guidelines: + A priority ranging from 0 to 255 (default 129) according to the following guidelines: 0: Restart handler of last resort, with limited restart capabilities. 128: Default restart handler; use if no other restart handler is expected to be available, @@ -51,7 +51,7 @@ properties: 255: Highest priority restart handler, will preempt all other restart handlers. minimum: 0 maximum: 255 - default: 128 + default: 129 active-delay: $ref: /schemas/types.yaml#/definitions/uint32 -- cgit v1.2.3-58-ga151 From 38a9840e2e396e86e3a39d1d2daf5f46204066c7 Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:28:42 +0100 Subject: dt-bindings: vendor-prefixes: add 8devices The vendor prefix for 8devices [1] is used in device tree [2], but was not documented so far. Add it to the schema to document it. [1] https://www.8devices.com/ [2] arch/arm/boot/dts/qcom-ipq4018-jalapeno.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113102842.GA4357@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 5983a2f6fb30..724a65146993 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -25,6 +25,8 @@ patternProperties: # Keep list in alphabetical order. "^70mai,.*": description: 70mai Co., Ltd. + "^8dev,.*": + description: 8devices, UAB "^abb,.*": description: ABB "^abilis,.*": -- cgit v1.2.3-58-ga151 From 8316cbbafd8bdf523fb1b3230f0701beae51679b Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:29:26 +0100 Subject: dt-bindings: vendor-prefixes: add F(x)tec The vendor prefix for F(x)tec [1] is used in device tree [2], but was not documented so far. Add it to the schema to document it. [1] https://www.fxtec.com/ [2] arch/arm64/boot/dts/qcom/msm8998-fxtec-pro1.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113102926.GA4388@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 724a65146993..c14a084c90a6 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -439,6 +439,8 @@ patternProperties: description: Freescale Semiconductor "^fujitsu,.*": description: Fujitsu Ltd. + "^fxtec,.*": + description: FX Technology Ltd. "^gardena,.*": description: GARDENA GmbH "^gateworks,.*": -- cgit v1.2.3-58-ga151 From ca146834d6cdbde80dead9daf8e7e127fc85d31f Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:30:05 +0100 Subject: dt-bindings: vendor-prefixes: add Huawei The vendor prefix for Huawei [1] is used in device trees [2][3], but was not documented so far. Add it to the schema to document it. [1] https://www.huawei.com/en/ [2] arch/arm64/boot/dts/qcom/msm8916-huawei-g7.dts [3] arch/arm64/boot/dts/qcom/msm8994-angler-rev-101.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113103005.GA4421@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index c14a084c90a6..9de8c0f5217f 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -515,6 +515,8 @@ patternProperties: description: HannStar Display Co. "^holtek,.*": description: Holtek Semiconductor, Inc. + "^huawei,.*": + description: Huawei Technologies Co., Ltd. "^hugsun,.*": description: Shenzhen Hugsun Technology Co. Ltd. "^hwacom,.*": -- cgit v1.2.3-58-ga151 From 6f2dfed0b6f078149072c34631a77902a8c85c7e Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:30:36 +0100 Subject: dt-bindings: vendor-prefixes: add Thundercomm The vendor prefix for Thundercomm [1] is used in device tree [2], but was not documented so far. Add it to the schema to document it. [1] https://www.thundercomm.com/ [2] arch/arm64/boot/dts/qcom/sdm845-db845c.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113103036.GA4456@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 9de8c0f5217f..1634c4db79a9 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -1207,6 +1207,8 @@ patternProperties: description: THine Electronics, Inc. "^thingyjp,.*": description: thingy.jp + "^thundercomm,.*": + description: Thundercomm Technology Co., Ltd. "^ti,.*": description: Texas Instruments "^tianma,.*": -- cgit v1.2.3-58-ga151 From 364da22cb30eb79198922ebf53c40f589bcece9d Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 11:31:10 +0100 Subject: dt-bindings: vendor-prefixes: add Wingtech The vendor prefix for Wingtech [1] is used in device tree [2], but was not documented so far. Add it to the schema to document it. [1] http://www.wingtech.com/en [2] arch/arm64/boot/dts/qcom/msm8916-wingtech-wt88047.dts Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113103110.GA4488@standask-GA-A55M-S2HP --- Documentation/devicetree/bindings/vendor-prefixes.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml index 1634c4db79a9..a2efbc26d945 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.yaml +++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml @@ -1332,6 +1332,8 @@ patternProperties: description: Wiligear, Ltd. "^winbond,.*": description: Winbond Electronics corp. + "^wingtech,.*": + description: Wingtech Technology Co., Ltd. "^winstar,.*": description: Winstar Display Corp. "^wits,.*": -- cgit v1.2.3-58-ga151 From 154e5f296e2af04fc2775a6de3c76e6ee37b5609 Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 14:39:01 +0100 Subject: dt-bindings: trivial-devices: fix swapped comments sparkfun,qwiic-joystick and st,24c256 had their comments incorrectly swapped. Swap them to make them correct. Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/7dc6ddb0b042cd243b2875e9aea81cad541d1c6b.1642080090.git.stano.jakubek@gmail.com --- Documentation/devicetree/bindings/trivial-devices.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index 72c9f8610766..ee5edc8ad153 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -295,9 +295,9 @@ properties: - skyworks,sky81452 # Socionext SynQuacer TPM MMIO module - socionext,synquacer-tpm-mmio - # i2c serial eeprom (24cxx) - - sparkfun,qwiic-joystick # SparkFun Qwiic Joystick (COM-15168) with i2c interface + - sparkfun,qwiic-joystick + # i2c serial eeprom (24cxx) - st,24c256 # Ambient Light Sensor with SMBUS/Two Wire Serial Interface - taos,tsl2550 -- cgit v1.2.3-58-ga151 From af35a8b5bab7c1e31d30b9ad78a981fae9bda903 Mon Sep 17 00:00:00 2001 From: Stanislav Jakubek Date: Thu, 13 Jan 2022 14:39:20 +0100 Subject: dt-bindings: trivial-devices: fix double spaces in comments Cleanup double spaces in some of the comments. Signed-off-by: Stanislav Jakubek Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/13b3f66efd3b20f1d9bbb9eff1eca00757ac5367.1642080090.git.stano.jakubek@gmail.com --- Documentation/devicetree/bindings/trivial-devices.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index ee5edc8ad153..e285386f3de0 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -31,7 +31,7 @@ properties: - enum: # SMBus/I2C Digital Temperature Sensor in 6-Pin SOT with SMBus Alert and Over Temperature Pin - ad,ad7414 - # ADM9240: Complete System Hardware Monitor for uProcessor-Based Systems + # ADM9240: Complete System Hardware Monitor for uProcessor-Based Systems - ad,adm9240 # AD5110 - Nonvolatile Digital Potentiometer - adi,ad5110 @@ -43,7 +43,7 @@ properties: - adi,adp5589 # AMS iAQ-Core VOC Sensor - ams,iaq-core - # i2c serial eeprom (24cxx) + # i2c serial eeprom (24cxx) - at,24c08 # i2c trusted platform module (TPM) - atmel,at97sc3204t @@ -297,7 +297,7 @@ properties: - socionext,synquacer-tpm-mmio # SparkFun Qwiic Joystick (COM-15168) with i2c interface - sparkfun,qwiic-joystick - # i2c serial eeprom (24cxx) + # i2c serial eeprom (24cxx) - st,24c256 # Ambient Light Sensor with SMBUS/Two Wire Serial Interface - taos,tsl2550 -- cgit v1.2.3-58-ga151 From d8adf5b92a9d2205620874d498c39923ecea8749 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Thu, 13 Jan 2022 09:19:18 +0100 Subject: scripts/dtc: dtx_diff: remove broken example from help text dtx_diff suggests to use <(...) syntax to pipe two inputs into it, but this has never worked: The /proc/self/fds/... paths passed by the shell will fail the `[ -f "${dtx}" ] && [ -r "${dtx}" ]` check in compile_to_dts, but even with this check removed, the function cannot work: hexdump will eat up the DTB magic, making the subsequent dtc call fail, as a pipe cannot be rewound. Simply remove this broken example, as there is already an alternative one that works fine. Fixes: 10eadc253ddf ("dtc: create tool to diff device trees") Signed-off-by: Matthias Schiffer Reviewed-by: Frank Rowand Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220113081918.10387-1-matthias.schiffer@ew.tq-group.com --- scripts/dtc/dtx_diff | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/dtc/dtx_diff b/scripts/dtc/dtx_diff index d3422ee15e30..f2bbde4bba86 100755 --- a/scripts/dtc/dtx_diff +++ b/scripts/dtc/dtx_diff @@ -59,12 +59,8 @@ Otherwise DTx is treated as a dts source file (aka .dts). or '/include/' to be processed. If DTx_1 and DTx_2 are in different architectures, then this script - may not work since \${ARCH} is part of the include path. Two possible - workarounds: - - `basename $0` \\ - <(ARCH=arch_of_dtx_1 `basename $0` DTx_1) \\ - <(ARCH=arch_of_dtx_2 `basename $0` DTx_2) + may not work since \${ARCH} is part of the include path. The following + workaround can be used: `basename $0` ARCH=arch_of_dtx_1 DTx_1 >tmp_dtx_1.dts `basename $0` ARCH=arch_of_dtx_2 DTx_2 >tmp_dtx_2.dts -- cgit v1.2.3-58-ga151 From b7fb2dad571d1e21173c06cef0bced77b323990a Mon Sep 17 00:00:00 2001 From: Sujit Kautkar Date: Mon, 10 Jan 2022 10:47:36 -0800 Subject: rpmsg: char: Fix race between the release of rpmsg_ctrldev and cdev struct rpmsg_ctrldev contains a struct cdev. The current code frees the rpmsg_ctrldev struct in rpmsg_ctrldev_release_device(), but the cdev is a managed object, therefore its release is not predictable and the rpmsg_ctrldev could be freed before the cdev is entirely released, as in the backtrace below. [ 93.625603] ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x7c [ 93.636115] WARNING: CPU: 0 PID: 12 at lib/debugobjects.c:488 debug_print_object+0x13c/0x1b0 [ 93.644799] Modules linked in: veth xt_cgroup xt_MASQUERADE rfcomm algif_hash algif_skcipher af_alg uinput ip6table_nat fuse uvcvideo videobuf2_vmalloc venus_enc venus_dec videobuf2_dma_contig hci_uart btandroid btqca snd_soc_rt5682_i2c bluetooth qcom_spmi_temp_alarm snd_soc_rt5682v [ 93.715175] CPU: 0 PID: 12 Comm: kworker/0:1 Tainted: G B 5.4.163-lockdep #26 [ 93.723855] Hardware name: Google Lazor (rev3 - 8) with LTE (DT) [ 93.730055] Workqueue: events kobject_delayed_cleanup [ 93.735271] pstate: 60c00009 (nZCv daif +PAN +UAO) [ 93.740216] pc : debug_print_object+0x13c/0x1b0 [ 93.744890] lr : debug_print_object+0x13c/0x1b0 [ 93.749555] sp : ffffffacf5bc7940 [ 93.752978] x29: ffffffacf5bc7940 x28: dfffffd000000000 [ 93.758448] x27: ffffffacdb11a800 x26: dfffffd000000000 [ 93.763916] x25: ffffffd0734f856c x24: dfffffd000000000 [ 93.769389] x23: 0000000000000000 x22: ffffffd0733c35b0 [ 93.774860] x21: ffffffd0751994a0 x20: ffffffd075ec27c0 [ 93.780338] x19: ffffffd075199100 x18: 00000000000276e0 [ 93.785814] x17: 0000000000000000 x16: dfffffd000000000 [ 93.791291] x15: ffffffffffffffff x14: 6e6968207473696c [ 93.796768] x13: 0000000000000000 x12: ffffffd075e2b000 [ 93.802244] x11: 0000000000000001 x10: 0000000000000000 [ 93.807723] x9 : d13400dff1921900 x8 : d13400dff1921900 [ 93.813200] x7 : 0000000000000000 x6 : 0000000000000000 [ 93.818676] x5 : 0000000000000080 x4 : 0000000000000000 [ 93.824152] x3 : ffffffd0732a0fa4 x2 : 0000000000000001 [ 93.829628] x1 : ffffffacf5bc7580 x0 : 0000000000000061 [ 93.835104] Call trace: [ 93.837644] debug_print_object+0x13c/0x1b0 [ 93.841963] __debug_check_no_obj_freed+0x25c/0x3c0 [ 93.846987] debug_check_no_obj_freed+0x18/0x20 [ 93.851669] slab_free_freelist_hook+0xbc/0x1e4 [ 93.856346] kfree+0xfc/0x2f4 [ 93.859416] rpmsg_ctrldev_release_device+0x78/0xb8 [ 93.864445] device_release+0x84/0x168 [ 93.868310] kobject_cleanup+0x12c/0x298 [ 93.872356] kobject_delayed_cleanup+0x10/0x18 [ 93.876948] process_one_work+0x578/0x92c [ 93.881086] worker_thread+0x804/0xcf8 [ 93.884963] kthread+0x2a8/0x314 [ 93.888303] ret_from_fork+0x10/0x18 The cdev_device_add/del() API was created to address this issue (see commit '233ed09d7fda ("chardev: add helper function to register char devs with a struct device")'), use it instead of cdev add/del(). Fixes: c0cdc19f84a4 ("rpmsg: Driver for user space endpoint interface") Signed-off-by: Sujit Kautkar Signed-off-by: Matthias Kaehlcke Reviewed-by: Mathieu Poirier Reviewed-by: Bjorn Andersson Reviewed-by: Stephen Boyd Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220110104706.v6.1.Iaac908f3e3149a89190ce006ba166e2d3fd247a3@changeid --- drivers/rpmsg/rpmsg_char.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c index d6214cb66026..8d3c2ca7081b 100644 --- a/drivers/rpmsg/rpmsg_char.c +++ b/drivers/rpmsg/rpmsg_char.c @@ -462,7 +462,6 @@ static void rpmsg_ctrldev_release_device(struct device *dev) ida_simple_remove(&rpmsg_ctrl_ida, dev->id); ida_simple_remove(&rpmsg_minor_ida, MINOR(dev->devt)); - cdev_del(&ctrldev->cdev); kfree(ctrldev); } @@ -497,19 +496,13 @@ static int rpmsg_chrdev_probe(struct rpmsg_device *rpdev) dev->id = ret; dev_set_name(&ctrldev->dev, "rpmsg_ctrl%d", ret); - ret = cdev_add(&ctrldev->cdev, dev->devt, 1); + ret = cdev_device_add(&ctrldev->cdev, &ctrldev->dev); if (ret) goto free_ctrl_ida; /* We can now rely on the release function for cleanup */ dev->release = rpmsg_ctrldev_release_device; - ret = device_add(dev); - if (ret) { - dev_err(&rpdev->dev, "device_add failed: %d\n", ret); - put_device(dev); - } - dev_set_drvdata(&rpdev->dev, ctrldev); return ret; @@ -535,7 +528,7 @@ static void rpmsg_chrdev_remove(struct rpmsg_device *rpdev) if (ret) dev_warn(&rpdev->dev, "failed to nuke endpoints: %d\n", ret); - device_del(&ctrldev->dev); + cdev_device_del(&ctrldev->cdev, &ctrldev->dev); put_device(&ctrldev->dev); } -- cgit v1.2.3-58-ga151 From 7a534ae89e34e9b51acb5a63dd0f88308178b46a Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Mon, 10 Jan 2022 10:47:37 -0800 Subject: rpmsg: char: Fix race between the release of rpmsg_eptdev and cdev struct rpmsg_eptdev contains a struct cdev. The current code frees the rpmsg_eptdev struct in rpmsg_eptdev_destroy(), but the cdev is a managed object, therefore its release is not predictable and the rpmsg_eptdev could be freed before the cdev is entirely released. The cdev_device_add/del() API was created to address this issue (see commit '233ed09d7fda ("chardev: add helper function to register char devs with a struct device")'), use it instead of cdev add/del(). Fixes: c0cdc19f84a4 ("rpmsg: Driver for user space endpoint interface") Suggested-by: Bjorn Andersson Signed-off-by: Matthias Kaehlcke Reviewed-by: Mathieu Poirier Reviewed-by: Stephen Boyd Reviewed-by: Bjorn Andersson Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220110104706.v6.2.Idde68b05b88d4a2e6e54766c653f3a6d9e419ce6@changeid --- drivers/rpmsg/rpmsg_char.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c index 8d3c2ca7081b..5663cf799c95 100644 --- a/drivers/rpmsg/rpmsg_char.c +++ b/drivers/rpmsg/rpmsg_char.c @@ -93,7 +93,7 @@ static int rpmsg_eptdev_destroy(struct device *dev, void *data) /* wake up any blocked readers */ wake_up_interruptible(&eptdev->readq); - device_del(&eptdev->dev); + cdev_device_del(&eptdev->cdev, &eptdev->dev); put_device(&eptdev->dev); return 0; @@ -336,7 +336,6 @@ static void rpmsg_eptdev_release_device(struct device *dev) ida_simple_remove(&rpmsg_ept_ida, dev->id); ida_simple_remove(&rpmsg_minor_ida, MINOR(eptdev->dev.devt)); - cdev_del(&eptdev->cdev); kfree(eptdev); } @@ -381,19 +380,13 @@ static int rpmsg_eptdev_create(struct rpmsg_ctrldev *ctrldev, dev->id = ret; dev_set_name(dev, "rpmsg%d", ret); - ret = cdev_add(&eptdev->cdev, dev->devt, 1); + ret = cdev_device_add(&eptdev->cdev, &eptdev->dev); if (ret) goto free_ept_ida; /* We can now rely on the release function for cleanup */ dev->release = rpmsg_eptdev_release_device; - ret = device_add(dev); - if (ret) { - dev_err(dev, "device_add failed: %d\n", ret); - put_device(dev); - } - return ret; free_ept_ida: -- cgit v1.2.3-58-ga151 From eee412e968f7b950564880bc6a7a9f00f49034da Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 14 Jan 2022 17:13:38 -0800 Subject: remoteproc: qcom: q6v5: fix service routines build errors When CONFIG_QCOM_AOSS_QMP=m and CONFIG_QCOM_Q6V5_MSS=y, the builtin driver cannot call into the loadable module's low-level service functions. Trying to build with that config combo causes linker errors. There are two problems here. First, drivers/remoteproc/qcom_q6v5.c should #include for the definitions of the service functions, depending on whether CONFIG_QCOM_AOSS_QMP is set/enabled or not. Second, the qcom remoteproc drivers should depend on QCOM_AOSS_QMP iff it is enabled (=y or =m) so that the qcom remoteproc drivers can be built properly. This prevents these build errors: aarch64-linux-ld: drivers/remoteproc/qcom_q6v5.o: in function `q6v5_load_state_toggle': qcom_q6v5.c:(.text+0xc4): undefined reference to `qmp_send' aarch64-linux-ld: drivers/remoteproc/qcom_q6v5.o: in function `qcom_q6v5_deinit': (.text+0x2e4): undefined reference to `qmp_put' aarch64-linux-ld: drivers/remoteproc/qcom_q6v5.o: in function `qcom_q6v5_init': (.text+0x778): undefined reference to `qmp_get' aarch64-linux-ld: (.text+0x7d8): undefined reference to `qmp_put' Fixes: c1fe10d238c0 ("remoteproc: qcom: q6v5: Use qmp_send to update co-processor load state") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Bjorn Andersson Cc: Mathieu Poirier Cc: linux-remoteproc@vger.kernel.org Cc: Sibi Sankar Cc: Stephen Boyd Reviewed-by: Stephen Boyd Reviewed-by: Bjorn Andersson Signed-off-by: Bjorn Andersson Link: https://lore.kernel.org/r/20220115011338.2973-1-rdunlap@infradead.org --- drivers/remoteproc/Kconfig | 4 ++++ drivers/remoteproc/qcom_q6v5.c | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig index 3ddd426fc969..166019786653 100644 --- a/drivers/remoteproc/Kconfig +++ b/drivers/remoteproc/Kconfig @@ -180,6 +180,7 @@ config QCOM_Q6V5_ADSP depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n depends on QCOM_SYSMON || QCOM_SYSMON=n depends on RPMSG_QCOM_GLINK || RPMSG_QCOM_GLINK=n + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select MFD_SYSCON select QCOM_PIL_INFO select QCOM_MDT_LOADER @@ -199,6 +200,7 @@ config QCOM_Q6V5_MSS depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n depends on QCOM_SYSMON || QCOM_SYSMON=n depends on RPMSG_QCOM_GLINK || RPMSG_QCOM_GLINK=n + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select MFD_SYSCON select QCOM_MDT_LOADER select QCOM_PIL_INFO @@ -218,6 +220,7 @@ config QCOM_Q6V5_PAS depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n depends on QCOM_SYSMON || QCOM_SYSMON=n depends on RPMSG_QCOM_GLINK || RPMSG_QCOM_GLINK=n + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select MFD_SYSCON select QCOM_PIL_INFO select QCOM_MDT_LOADER @@ -239,6 +242,7 @@ config QCOM_Q6V5_WCSS depends on RPMSG_QCOM_GLINK_SMEM || RPMSG_QCOM_GLINK_SMEM=n depends on QCOM_SYSMON || QCOM_SYSMON=n depends on RPMSG_QCOM_GLINK || RPMSG_QCOM_GLINK=n + depends on QCOM_AOSS_QMP || QCOM_AOSS_QMP=n select MFD_SYSCON select QCOM_MDT_LOADER select QCOM_PIL_INFO diff --git a/drivers/remoteproc/qcom_q6v5.c b/drivers/remoteproc/qcom_q6v5.c index eada7e34f3af..442a388f8102 100644 --- a/drivers/remoteproc/qcom_q6v5.c +++ b/drivers/remoteproc/qcom_q6v5.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-58-ga151 From aee101d7b95a03078945681dd7f7ea5e4a1e7686 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 17 Jan 2022 23:44:03 +1000 Subject: powerpc/64s: Mask SRR0 before checking against the masked NIP Commit 314f6c23dd8d ("powerpc/64s: Mask NIP before checking against SRR0") masked off the low 2 bits of the NIP value in the interrupt stack frame in case they are non-zero and mis-compare against a SRR0 register value of a CPU which always reads back 0 from the 2 low bits which are reserved. This now causes the opposite problem that an implementation which does implement those bits in SRR0 will mis-compare against the masked NIP value in which they have been cleared. QEMU is one such implementation, and this is allowed by the architecture. This can be triggered by sigfuz by setting low bits of PT_NIP in the signal context. Fix this for now by masking the SRR0 bits as well. Cleaner is probably to sanitise these values before putting them in registers or stack, but this is the quick and backportable fix. Fixes: 314f6c23dd8d ("powerpc/64s: Mask NIP before checking against SRR0") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220117134403.2995059-1-npiggin@gmail.com --- arch/powerpc/kernel/interrupt_64.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 92088f848266..7bab2d7de372 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -30,6 +30,7 @@ COMPAT_SYS_CALL_TABLE: .ifc \srr,srr mfspr r11,SPRN_SRR0 ld r12,_NIP(r1) + clrrdi r11,r11,2 clrrdi r12,r12,2 100: tdne r11,r12 EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) @@ -40,6 +41,7 @@ COMPAT_SYS_CALL_TABLE: .else mfspr r11,SPRN_HSRR0 ld r12,_NIP(r1) + clrrdi r11,r11,2 clrrdi r12,r12,2 100: tdne r11,r12 EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) -- cgit v1.2.3-58-ga151 From 5455b9ecaf231ec5c6b0cd5c6076eb64c9dbc9aa Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 18 Jan 2022 12:16:57 +1000 Subject: cifs: serialize all mount attempts RHBZ: 2008434 Some servers, such as Windows2016 have a very low number of concurrent mounts that they allow from each client. This can be a problem if you have a more than a handful (==3 in this case) of cifs entries in your fstab and cause a number of the mounts there to randomly fail. Add a global mutex and use it to serialize all mount attempts. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/fs_context.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index e3ed25dc6f3f..7ec35f3f0a5f 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -37,6 +37,8 @@ #include "rfc1002pdu.h" #include "fs_context.h" +static DEFINE_MUTEX(cifs_mount_mutex); + static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, { Smb_20, SMB20_VERSION_STRING}, @@ -707,10 +709,14 @@ static int smb3_get_tree_common(struct fs_context *fc) static int smb3_get_tree(struct fs_context *fc) { int err = smb3_fs_context_validate(fc); + int ret; if (err) return err; - return smb3_get_tree_common(fc); + mutex_lock(&cifs_mount_mutex); + ret = smb3_get_tree_common(fc); + mutex_unlock(&cifs_mount_mutex); + return ret; } static void smb3_fs_context_free(struct fs_context *fc) -- cgit v1.2.3-58-ga151 From 5f02ef741a785678930f3ff0a8b6b2b0ef1bb402 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 18 Jan 2022 04:34:43 -0500 Subject: KVM: VMX: switch blocked_vcpu_on_cpu_lock to raw spinlock blocked_vcpu_on_cpu_lock is taken from hard interrupt context (pi_wakeup_handler), therefore it cannot sleep. Switch it to a raw spinlock. Fixes: [41297.066254] BUG: scheduling while atomic: CPU 0/KVM/635218/0x00010001 [41297.066323] Preemption disabled at: [41297.066324] [] irq_enter_rcu+0xf/0x60 [41297.066339] Call Trace: [41297.066342] [41297.066346] dump_stack_lvl+0x34/0x44 [41297.066353] ? irq_enter_rcu+0xf/0x60 [41297.066356] __schedule_bug.cold+0x7d/0x8b [41297.066361] __schedule+0x439/0x5b0 [41297.066365] ? task_blocks_on_rt_mutex.constprop.0.isra.0+0x1b0/0x440 [41297.066369] schedule_rtlock+0x1e/0x40 [41297.066371] rtlock_slowlock_locked+0xf1/0x260 [41297.066374] rt_spin_lock+0x3b/0x60 [41297.066378] pi_wakeup_handler+0x31/0x90 [kvm_intel] [41297.066388] sysvec_kvm_posted_intr_wakeup_ipi+0x9d/0xd0 [41297.066392] [41297.066392] asm_sysvec_kvm_posted_intr_wakeup_ipi+0x12/0x20 ... Signed-off-by: Marcelo Tosatti Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 1c94783b5a54..21ea58d25771 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -15,7 +15,7 @@ * can find which vCPU should be waken up. */ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); -static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); +static DEFINE_PER_CPU(raw_spinlock_t, blocked_vcpu_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { @@ -121,9 +121,9 @@ static void __pi_post_block(struct kvm_vcpu *vcpu) new.control) != old.control); if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); list_del(&vcpu->blocked_vcpu_list); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); vcpu->pre_pcpu = -1; } } @@ -154,11 +154,11 @@ int pi_pre_block(struct kvm_vcpu *vcpu) local_irq_disable(); if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { vcpu->pre_pcpu = vcpu->cpu; - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); list_add_tail(&vcpu->blocked_vcpu_list, &per_cpu(blocked_vcpu_on_cpu, vcpu->pre_pcpu)); - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); + raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); } do { @@ -215,7 +215,7 @@ void pi_wakeup_handler(void) struct kvm_vcpu *vcpu; int cpu = smp_processor_id(); - spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), blocked_vcpu_list) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -223,13 +223,13 @@ void pi_wakeup_handler(void) if (pi_test_on(pi_desc) == 1) kvm_vcpu_kick(vcpu); } - spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } void __init pi_init_cpu(int cpu) { INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) -- cgit v1.2.3-58-ga151 From 09f5e7dc7ad705289e1b1ec065439aa3c42951c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Dec 2021 13:19:52 +0100 Subject: perf: Fix perf_event_read_local() time Time readers that cannot take locks (due to NMI etc..) currently make use of perf_event::shadow_ctx_time, which, for that event gives: time' = now + (time - timestamp) or, alternatively arranged: time' = time + (now - timestamp) IOW, the progression of time since the last time the shadow_ctx_time was updated. There's problems with this: A) the shadow_ctx_time is per-event, even though the ctx_time it reflects is obviously per context. The direct concequence of this is that the context needs to iterate all events all the time to keep the shadow_ctx_time in sync. B) even with the prior point, the context itself might not be active meaning its time should not advance to begin with. C) shadow_ctx_time isn't consistently updated when ctx_time is There are 3 users of this stuff, that suffer differently from this: - calc_timer_values() - perf_output_read() - perf_event_update_userpage() /* A */ - perf_event_read_local() /* A,B */ In particular, perf_output_read() doesn't suffer at all, because it's sample driven and hence only relevant when the event is actually running. This same was supposed to be true for perf_event_update_userpage(), after all self-monitoring implies the context is active *HOWEVER*, as per commit f79256532682 ("perf/core: fix userpage->time_enabled of inactive events") this goes wrong when combined with counter overcommit, in that case those events that do not get scheduled when the context becomes active (task events typically) miss out on the EVENT_TIME update and ENABLED time is inflated (for a little while) with the time the context was inactive. Once the event gets rotated in, this gets corrected, leading to a non-monotonic timeflow. perf_event_read_local() made things even worse, it can request time at any point, suffering all the problems perf_event_update_userpage() does and more. Because while perf_event_update_userpage() is limited by the context being active, perf_event_read_local() users have no such constraint. Therefore, completely overhaul things and do away with perf_event::shadow_ctx_time. Instead have regular context time updates keep track of this offset directly and provide perf_event_time_now() to complement perf_event_time(). perf_event_time_now() will, in adition to being context wide, also take into account if the context is active. For inactive context, it will not advance time. This latter property means the cgroup perf_cgroup_info context needs to grow addition state to track this. Additionally, since all this is strictly per-cpu, we can use barrier() to order context activity vs context time. Fixes: 7d9285e82db5 ("perf/bpf: Extend the perf_event_read_local() interface, a.k.a. "bpf: perf event change needed for subsequent bpf helpers"") Signed-off-by: Peter Zijlstra (Intel) Tested-by: Song Liu Tested-by: Namhyung Kim Link: https://lkml.kernel.org/r/YcB06DasOBtU0b00@hirez.programming.kicks-ass.net --- include/linux/perf_event.h | 15 +-- kernel/events/core.c | 246 +++++++++++++++++++++++++++------------------ 2 files changed, 149 insertions(+), 112 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 117f230bcdfd..733649184b27 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -693,18 +693,6 @@ struct perf_event { u64 total_time_running; u64 tstamp; - /* - * timestamp shadows the actual context timing but it can - * be safely used in NMI interrupt context. It reflects the - * context time as it was when the event was last scheduled in, - * or when ctx_sched_in failed to schedule the event because we - * run out of PMC. - * - * ctx_time already accounts for ctx->timestamp. Therefore to - * compute ctx_time for a sample, simply add perf_clock(). - */ - u64 shadow_ctx_time; - struct perf_event_attr attr; u16 header_size; u16 id_header_size; @@ -852,6 +840,7 @@ struct perf_event_context { */ u64 time; u64 timestamp; + u64 timeoffset; /* * These fields let us detect when two contexts have both @@ -934,6 +923,8 @@ struct bpf_perf_event_data_kern { struct perf_cgroup_info { u64 time; u64 timestamp; + u64 timeoffset; + int active; }; struct perf_cgroup { diff --git a/kernel/events/core.c b/kernel/events/core.c index fc18664f49b0..479c9e672ec4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -674,6 +674,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state) WRITE_ONCE(event->state, state); } +/* + * UP store-release, load-acquire + */ + +#define __store_release(ptr, val) \ +do { \ + barrier(); \ + WRITE_ONCE(*(ptr), (val)); \ +} while (0) + +#define __load_acquire(ptr) \ +({ \ + __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ + barrier(); \ + ___p; \ +}) + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -719,34 +736,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return t->time; } -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { - struct perf_cgroup_info *info; - u64 now; - - now = perf_clock(); + struct perf_cgroup_info *t; - info = this_cpu_ptr(cgrp->info); + t = per_cpu_ptr(event->cgrp->info, event->cpu); + if (!__load_acquire(&t->active)) + return t->time; + now += READ_ONCE(t->timeoffset); + return now; +} - info->time += now - info->timestamp; +static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + if (adv) + info->time += now - info->timestamp; info->timestamp = now; + /* + * see update_context_time() + */ + WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; + struct perf_cgroup_info *info; if (cgrp) { + u64 now = perf_clock(); + for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); - __update_cgrp_time(cgrp); + info = this_cpu_ptr(cgrp->info); + + __update_cgrp_time(info, now, true); + if (final) + __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { + struct perf_cgroup_info *info; struct perf_cgroup *cgrp; /* @@ -760,8 +794,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - __update_cgrp_time(event->cgrp); + if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { + info = this_cpu_ptr(event->cgrp->info); + __update_cgrp_time(info, perf_clock(), true); + } } static inline void @@ -785,7 +821,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + __update_cgrp_time(info, ctx->timestamp, false); + __store_release(&info->active, 1); } } @@ -981,14 +1018,6 @@ out: return ret; } -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { @@ -1066,7 +1095,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) { } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, + bool final) { } @@ -1098,12 +1128,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next) { } -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +static inline u64 perf_cgroup_event_time(struct perf_event *event) { + return 0; } -static inline u64 perf_cgroup_event_time(struct perf_event *event) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } @@ -1525,22 +1555,59 @@ static void perf_unpin_context(struct perf_event_context *ctx) /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_event_context *ctx) +static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); - ctx->time += now - ctx->timestamp; + if (adv) + ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +} + +static void update_context_time(struct perf_event_context *ctx) +{ + __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + if (unlikely(!ctx)) + return 0; + if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx ? ctx->time : 0; + return ctx->time; +} + +static u64 perf_event_time_now(struct perf_event *event, u64 now) +{ + struct perf_event_context *ctx = event->ctx; + + if (unlikely(!ctx)) + return 0; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time_now(event, now); + + if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) + return ctx->time; + + now += READ_ONCE(ctx->timeoffset); + return now; } static enum event_type_t get_event_type(struct perf_event *event) @@ -2350,7 +2417,7 @@ __perf_remove_from_context(struct perf_event *event, if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, false); } event_sched_out(event, cpuctx, ctx); @@ -2361,6 +2428,9 @@ __perf_remove_from_context(struct perf_event *event, list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { + if (ctx == &cpuctx->ctx) + update_cgrp_time_from_cpuctx(cpuctx, true); + ctx->is_active = 0; ctx->rotate_necessary = 0; if (ctx->task) { @@ -2482,40 +2552,6 @@ void perf_event_disable_inatomic(struct perf_event *event) irq_work_queue(&event->pending); } -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time source all along. We believe it - * is cleaner and simpler to understand. - */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, event->tstamp); - else - event->shadow_ctx_time = event->tstamp - ctx->timestamp; -} - #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -2556,8 +2592,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx); - perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { @@ -3251,16 +3285,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - - if (ctx->task) { - WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) - cpuctx->task_ctx = NULL; - } - /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last @@ -3274,7 +3298,22 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ @@ -3711,13 +3750,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, return 0; } +/* + * Because the userpage is strictly per-event (there is no concept of context, + * so there cannot be a context indirection), every userpage must be updated + * when context time starts :-( + * + * IOW, we must not miss EVENT_TIME edges. + */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_update_userpage(event); return true; @@ -3801,13 +3846,23 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task) { int is_active = ctx->is_active; - u64 now; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; + if (is_active ^ EVENT_TIME) { + /* start ctx time */ + __update_context_time(ctx, false); + perf_cgroup_set_timestamp(task, ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) @@ -3818,13 +3873,6 @@ ctx_sched_in(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (is_active & EVENT_TIME) { - /* start ctx time */ - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - } - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -4418,6 +4466,18 @@ static inline u64 perf_event_count(struct perf_event *event) return local64_read(&event->count) + atomic64_read(&event->child_count); } +static void calc_timer_values(struct perf_event *event, + u64 *now, + u64 *enabled, + u64 *running) +{ + u64 ctx_time; + + *now = perf_clock(); + ctx_time = perf_event_time_now(event, *now); + __perf_update_times(event, ctx_time, enabled, running); +} + /* * NMI-safe method to read a local event, that is an event that * is: @@ -4477,10 +4537,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 now = event->shadow_ctx_time + perf_clock(); - u64 __enabled, __running; + u64 __enabled, __running, __now;; - __perf_update_times(event, now, &__enabled, &__running); + calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) @@ -5802,18 +5861,6 @@ static int perf_event_index(struct perf_event *event) return event->pmu->event_idx(event); } -static void calc_timer_values(struct perf_event *event, - u64 *now, - u64 *enabled, - u64 *running) -{ - u64 ctx_time; - - *now = perf_clock(); - ctx_time = event->shadow_ctx_time + *now; - __perf_update_times(event, ctx_time, enabled, running); -} - static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; @@ -6353,7 +6400,6 @@ accounting: ring_buffer_attach(event, rb); perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { -- cgit v1.2.3-58-ga151 From 7fa981cad216e9f64f49e22112f610c0bfed91bc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 11 Jan 2022 10:20:38 -0800 Subject: perf/x86/intel: Add a quirk for the calculation of the number of counters on Alder Lake For some Alder Lake machine with all E-cores disabled in a BIOS, the below warning may be triggered. [ 2.010766] hw perf events fixed 5 > max(4), clipping! Current perf code relies on the CPUID leaf 0xA and leaf 7.EDX[15] to calculate the number of the counters and follow the below assumption. For a hybrid configuration, the leaf 7.EDX[15] (X86_FEATURE_HYBRID_CPU) is set. The leaf 0xA only enumerate the common counters. Linux perf has to manually add the extra GP counters and fixed counters for P-cores. For a non-hybrid configuration, the X86_FEATURE_HYBRID_CPU should not be set. The leaf 0xA enumerates all counters. However, that's not the case when all E-cores are disabled in a BIOS. Although there are only P-cores in the system, the leaf 7.EDX[15] (X86_FEATURE_HYBRID_CPU) is still set. But the leaf 0xA is updated to enumerate all counters of P-cores. The inconsistency triggers the warning. Several software ways were considered to handle the inconsistency. - Drop the leaf 0xA and leaf 7.EDX[15] CPUID enumeration support. Hardcode the number of counters. This solution may be a problem for virtualization. A hypervisor cannot control the number of counters in a Linux guest via changing the guest CPUID enumeration anymore. - Find another CPUID bit that is also updated with E-cores disabled. There may be a problem in the virtualization environment too. Because a hypervisor may disable the feature/CPUID bit. - The P-cores have a maximum of 8 GP counters and 4 fixed counters on ADL. The maximum number can be used to detect the case. This solution is implemented in this patch. Fixes: ee72a94ea4a6 ("perf/x86/intel: Fix fixed counter check warning for some Alder Lake") Reported-by: Damjan Marion (damarion) Reported-by: Chan Edison Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Damjan Marion (damarion) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1641925238-149288-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index fd9f908debe5..d5f940c57934 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6236,6 +6236,19 @@ __init int intel_pmu_init(void) pmu->num_counters = x86_pmu.num_counters; pmu->num_counters_fixed = x86_pmu.num_counters_fixed; } + + /* + * Quirk: For some Alder Lake machine, when all E-cores are disabled in + * a BIOS, the leaf 0xA will enumerate all counters of P-cores. However, + * the X86_FEATURE_HYBRID_CPU is still set. The above codes will + * mistakenly add extra counters for P-cores. Correct the number of + * counters here. + */ + if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) { + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + } + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); pmu->unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, -- cgit v1.2.3-58-ga151 From 96fd2e89fba1aaada6f4b1e5d25a9d9ecbe1943d Mon Sep 17 00:00:00 2001 From: Zhengjun Xing Date: Thu, 23 Dec 2021 22:48:26 +0800 Subject: perf/x86/intel/uncore: Fix CAS_COUNT_WRITE issue for ICX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The user recently report a perf issue in the ICX platform, when test by perf event “uncore_imc_x/cas_count_write”,the write bandwidth is always very small (only 0.38MB/s), it is caused by the wrong "umask" for the "cas_count_write" event. When double-checking, find "cas_count_read" also is wrong. The public document for ICX uncore: 3rd Gen Intel® Xeon® Processor Scalable Family, Codename Ice Lake,Uncore Performance Monitoring Reference Manual, Revision 1.00, May 2021 On 2.4.7, it defines Unit Masks for CAS_COUNT: RD b00001111 WR b00110000 So corrected both "cas_count_read" and "cas_count_write" for ICX. Old settings: hswep_uncore_imc_events INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03") INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c") New settings: snr_uncore_imc_events INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x0f") INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x30") Fixes: 2b3b76b5ec67 ("perf/x86/intel/uncore: Add Ice Lake server uncore support") Signed-off-by: Zhengjun Xing Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Adrian Hunter Reviewed-by: Kan Liang Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20211223144826.841267-1-zhengjun.xing@linux.intel.com --- arch/x86/events/intel/uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 3660f698fb2a..ed869443efb2 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -5482,7 +5482,7 @@ static struct intel_uncore_type icx_uncore_imc = { .fixed_ctr_bits = 48, .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, - .event_descs = hswep_uncore_imc_events, + .event_descs = snr_uncore_imc_events, .perf_ctr = SNR_IMC_MMIO_PMON_CTR0, .event_ctl = SNR_IMC_MMIO_PMON_CTL0, .event_mask = SNBEP_PMON_RAW_EVENT_MASK, -- cgit v1.2.3-58-ga151 From 0036fb00a756a2f6e360d44e2e3d2200a8afbc9b Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 5 Jan 2022 10:56:59 -0800 Subject: perf/x86/rapl: fix AMD event handling The RAPL events exposed under /sys/devices/power/events should only reflect what the underlying hardware actually support. This is how it works on Intel RAPL and Intel core/uncore PMUs in general. But on AMD, this was not the case. All possible RAPL events were advertised. This is what it showed on an AMD Fam17h: $ ls /sys/devices/power/events/ energy-cores energy-gpu energy-pkg energy-psys energy-ram energy-cores.scale energy-gpu.scale energy-pkg.scale energy-psys.scale energy-ram.scale energy-cores.unit energy-gpu.unit energy-pkg.unit energy-psys.unit energy-ram.unit Yet, on AMD Fam17h, only energy-pkg is supported. This patch fixes the problem. Given the way perf_msr_probe() works, the amd_rapl_msrs[] table has to have all entries filled out and in particular the group field, otherwise perf_msr_probe() defaults to making the event visible. With the patch applied, the kernel now only shows was is actually supported: $ ls /sys/devices/power/events/ energy-pkg energy-pkg.scale energy-pkg.unit The patch also uses the RAPL_MSR_MASK because only the 32-bits LSB of the RAPL counters are relevant when reading power consumption. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220105185659.643355-1-eranian@google.com --- arch/x86/events/rapl.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 85feafacc445..77e3a47af5ad 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -536,11 +536,14 @@ static struct perf_msr intel_rapl_spr_msrs[] = { * - perf_msr_probe(PERF_RAPL_MAX) * - want to use same event codes across both architectures */ -static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = { - [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, +static struct perf_msr amd_rapl_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 }, + [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 }, + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, 0, false, 0 }, }; - static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); -- cgit v1.2.3-58-ga151 From 1ac7fd8159a842b3aa51f0b46a351fa3eeb8fbf3 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 4 Jan 2022 08:51:16 -0800 Subject: perf/x86/intel/lbr: Support LBR format V7 The Goldmont plus and Tremont have LBR format V7. The V7 has LBR_INFO, which is the same as LBR format V5. But V7 doesn't support TSX. Without the patch, the associated misprediction and cycles information in the LBR_INFO may be lost on a Goldmont plus platform. For Tremont, the patch only impacts the non-PEBS events. Because of the adaptive PEBS, the LBR_INFO is always processed for a PEBS event. Currently, two different ways are used to check the LBR capabilities, which make the codes complex and confusing. For the LBR format V4 and earlier, the global static lbr_desc array is used to store the flags for the LBR capabilities in each LBR format. For LBR format V5 and V6, the current code checks the version number for the LBR capabilities. There are common LBR capabilities among LBR format versions. Several flags for the LBR capabilities are introduced into the struct x86_pmu. The flags, which can be shared among LBR formats, are used to check the LBR capabilities. Add intel_pmu_lbr_init() to set the flags accordingly at boot time. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Kan Liang Link: https://lkml.kernel.org/r/1641315077-96661-1-git-send-email-peterz@infradead.org --- arch/x86/events/intel/core.c | 2 + arch/x86/events/intel/lbr.c | 114 ++++++++++++++++++++++++------------------- arch/x86/events/perf_event.h | 10 +++- 3 files changed, 75 insertions(+), 51 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index d5f940c57934..c91434056c29 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6353,6 +6353,8 @@ __init int intel_pmu_init(void) } if (x86_pmu.lbr_nr) { + intel_pmu_lbr_init(); + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); /* only support branch_stack snapshot for perfmon >= v2 */ diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 8043213b75a5..b7228a170278 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -8,14 +8,6 @@ #include "../perf_event.h" -static const enum { - LBR_EIP_FLAGS = 1, - LBR_TSX = 2, -} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { - [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, - [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, -}; - /* * Intel LBR_SELECT bits * Intel Vol3a, April 2011, Section 16.7 Table 16-10 @@ -243,7 +235,7 @@ void intel_pmu_lbr_reset_64(void) for (i = 0; i < x86_pmu.lbr_nr; i++) { wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + if (x86_pmu.lbr_has_info) wrmsrl(x86_pmu.lbr_info + i, 0); } } @@ -305,11 +297,10 @@ enum { */ static inline bool lbr_from_signext_quirk_needed(void) { - int lbr_format = x86_pmu.intel_cap.lbr_format; bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM); - return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX); + return !tsx_support && x86_pmu.lbr_has_tsx; } static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); @@ -427,12 +418,12 @@ rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) void intel_pmu_lbr_restore(void *ctx) { - bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; - int i; - unsigned lbr_idx, mask; + bool need_info = x86_pmu.lbr_has_info; u64 tos = task_ctx->tos; + unsigned lbr_idx, mask; + int i; mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { @@ -444,7 +435,7 @@ void intel_pmu_lbr_restore(void *ctx) lbr_idx = (tos - i) & mask; wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + if (need_info) wrlbr_info(lbr_idx, 0); } @@ -519,9 +510,9 @@ static void __intel_pmu_lbr_restore(void *ctx) void intel_pmu_lbr_save(void *ctx) { - bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx = ctx; + bool need_info = x86_pmu.lbr_has_info; unsigned lbr_idx, mask; u64 tos; int i; @@ -816,7 +807,6 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; - int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -831,9 +821,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; - int skip = 0; u16 cycles = 0; - int lbr_flags = lbr_desc[lbr_format]; from = rdlbr_from(lbr_idx, NULL); to = rdlbr_to(lbr_idx, NULL); @@ -845,37 +833,39 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (call_stack && !from) break; - if (lbr_format == LBR_FORMAT_INFO && need_info) { - u64 info; - - info = rdlbr_info(lbr_idx, NULL); - mis = !!(info & LBR_INFO_MISPRED); - pred = !mis; - in_tx = !!(info & LBR_INFO_IN_TX); - abort = !!(info & LBR_INFO_ABORT); - cycles = (info & LBR_INFO_CYCLES); - } - - if (lbr_format == LBR_FORMAT_TIME) { - mis = !!(from & LBR_FROM_FLAG_MISPRED); - pred = !mis; - skip = 1; - cycles = ((to >> 48) & LBR_INFO_CYCLES); - - to = (u64)((((s64)to) << 16) >> 16); - } - - if (lbr_flags & LBR_EIP_FLAGS) { - mis = !!(from & LBR_FROM_FLAG_MISPRED); - pred = !mis; - skip = 1; - } - if (lbr_flags & LBR_TSX) { - in_tx = !!(from & LBR_FROM_FLAG_IN_TX); - abort = !!(from & LBR_FROM_FLAG_ABORT); - skip = 3; + if (x86_pmu.lbr_has_info) { + if (need_info) { + u64 info; + + info = rdlbr_info(lbr_idx, NULL); + mis = !!(info & LBR_INFO_MISPRED); + pred = !mis; + cycles = (info & LBR_INFO_CYCLES); + if (x86_pmu.lbr_has_tsx) { + in_tx = !!(info & LBR_INFO_IN_TX); + abort = !!(info & LBR_INFO_ABORT); + } + } + } else { + int skip = 0; + + if (x86_pmu.lbr_from_flags) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + } + if (x86_pmu.lbr_has_tsx) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; + } + from = (u64)((((s64)from) << skip) >> skip); + + if (x86_pmu.lbr_to_cycles) { + cycles = ((to >> 48) & LBR_INFO_CYCLES); + to = (u64)((((s64)to) << 16) >> 16); + } } - from = (u64)((((s64)from) << skip) >> skip); /* * Some CPUs report duplicated abort records, @@ -1120,7 +1110,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && - (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) + x86_pmu.lbr_has_info) reg->config |= LBR_NO_INFO; return 0; @@ -1706,6 +1696,30 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } +void intel_pmu_lbr_init(void) +{ + switch (x86_pmu.intel_cap.lbr_format) { + case LBR_FORMAT_EIP_FLAGS2: + x86_pmu.lbr_has_tsx = 1; + fallthrough; + case LBR_FORMAT_EIP_FLAGS: + x86_pmu.lbr_from_flags = 1; + break; + + case LBR_FORMAT_INFO: + x86_pmu.lbr_has_tsx = 1; + fallthrough; + case LBR_FORMAT_INFO2: + x86_pmu.lbr_has_info = 1; + break; + + case LBR_FORMAT_TIME: + x86_pmu.lbr_from_flags = 1; + x86_pmu.lbr_to_cycles = 1; + break; + } +} + /* * LBR state size is variable based on the max number of registers. * This calculates the expected state size, which should match diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 9d376e528dfc..150261d929b9 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -215,7 +215,8 @@ enum { LBR_FORMAT_EIP_FLAGS2 = 0x04, LBR_FORMAT_INFO = 0x05, LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, + LBR_FORMAT_INFO2 = 0x07, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO2, }; enum { @@ -840,6 +841,11 @@ struct x86_pmu { bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + unsigned int lbr_has_info:1; + unsigned int lbr_has_tsx:1; + unsigned int lbr_from_flags:1; + unsigned int lbr_to_cycles:1; + /* * Intel Architectural LBR CPUID Enumeration */ @@ -1392,6 +1398,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_lbr_init(void); + void intel_pmu_arch_lbr_init(void); void intel_pmu_pebs_data_source_nhm(void); -- cgit v1.2.3-58-ga151 From 6b19788ddc5937831ffd27525a1b793953fd2d2b Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 4 Jan 2022 08:51:17 -0800 Subject: perf/x86/intel/lbr: Add static_branch for LBR INFO flags Using static_branch to replace the LBR INFO flags to optimize the LBR INFO parsing. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Kan Liang Link: https://lkml.kernel.org/r/1641315077-96661-2-git-send-email-peterz@infradead.org --- arch/x86/events/intel/lbr.c | 51 ++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index b7228a170278..f8fd25528935 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -893,37 +893,40 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static DEFINE_STATIC_KEY_FALSE(x86_lbr_mispred); +static DEFINE_STATIC_KEY_FALSE(x86_lbr_cycles); +static DEFINE_STATIC_KEY_FALSE(x86_lbr_type); + static __always_inline int get_lbr_br_type(u64 info) { - if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type) - return 0; + int type = 0; - return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; + if (static_branch_likely(&x86_lbr_type)) + type = (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; + + return type; } static __always_inline bool get_lbr_mispred(u64 info) { - if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) - return 0; + bool mispred = 0; - return !!(info & LBR_INFO_MISPRED); -} - -static __always_inline bool get_lbr_predicted(u64 info) -{ - if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) - return 0; + if (static_branch_likely(&x86_lbr_mispred)) + mispred = !!(info & LBR_INFO_MISPRED); - return !(info & LBR_INFO_MISPRED); + return mispred; } static __always_inline u16 get_lbr_cycles(u64 info) { + u16 cycles = info & LBR_INFO_CYCLES; + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && - !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) - return 0; + (!static_branch_likely(&x86_lbr_cycles) || + !(info & LBR_INFO_CYC_CNT_VALID))) + cycles = 0; - return info & LBR_INFO_CYCLES; + return cycles; } static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, @@ -951,7 +954,7 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, e->from = from; e->to = to; e->mispred = get_lbr_mispred(info); - e->predicted = get_lbr_predicted(info); + e->predicted = !e->mispred; e->in_tx = !!(info & LBR_INFO_IN_TX); e->abort = !!(info & LBR_INFO_ABORT); e->cycles = get_lbr_cycles(info); @@ -1718,6 +1721,14 @@ void intel_pmu_lbr_init(void) x86_pmu.lbr_to_cycles = 1; break; } + + if (x86_pmu.lbr_has_info) { + /* + * Only used in combination with baseline pebs. + */ + static_branch_enable(&x86_lbr_mispred); + static_branch_enable(&x86_lbr_cycles); + } } /* @@ -1779,6 +1790,12 @@ void __init intel_pmu_arch_lbr_init(void) x86_pmu.lbr_br_type = ecx.split.lbr_br_type; x86_pmu.lbr_nr = lbr_nr; + if (x86_pmu.lbr_mispred) + static_branch_enable(&x86_lbr_mispred); + if (x86_pmu.lbr_timed_lbr) + static_branch_enable(&x86_lbr_cycles); + if (x86_pmu.lbr_br_type) + static_branch_enable(&x86_lbr_type); arch_lbr_xsave = is_arch_lbr_xsave_available(); if (arch_lbr_xsave) { -- cgit v1.2.3-58-ga151 From 5a4487f9ef5ef2fdb3215cadf0a9c3e5e8678634 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 13 Jan 2022 14:05:54 -0800 Subject: perf/x86/intel/uncore: Add IMC uncore support for ADL Current ADL uncore code only supports the legacy IMC (memory controller) free-running counters. Besides the free-running counters, ADL also supports several general purpose-counters. The general-purpose counters can also be accessed via MMIO but in a different location. Factor out __uncore_imc_init_box() with offset as a parameter. The function can be shared between ADL and TGL. The event format and the layout of the control registers are a little bit different from other uncore counters. The intel_generic_uncore_mmio_enable_event() can be shared with client IMC uncore. Expose the function. Add more PCI IDs for ADL machines. Fixes: 772ed05f3c5c ("perf/x86/intel/uncore: Add Alder Lake support") Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1642111554-118524-1-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 2 +- arch/x86/events/intel/uncore.h | 3 +- arch/x86/events/intel/uncore_discovery.c | 4 +- arch/x86/events/intel/uncore_discovery.h | 2 + arch/x86/events/intel/uncore_snb.c | 214 ++++++++++++++++++++++++++++++- 5 files changed, 220 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index f1ba6ab2e97e..e497da9bf427 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1762,7 +1762,7 @@ static const struct intel_uncore_init_fun rkl_uncore_init __initconst = { static const struct intel_uncore_init_fun adl_uncore_init __initconst = { .cpu_init = adl_uncore_cpu_init, - .mmio_init = tgl_uncore_mmio_init, + .mmio_init = adl_uncore_mmio_init, }; static const struct intel_uncore_init_fun icx_uncore_init __initconst = { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index b9687980aab6..2adeaf4de4df 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -584,10 +584,11 @@ void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); -void adl_uncore_cpu_init(void); void tgl_uncore_cpu_init(void); +void adl_uncore_cpu_init(void); void tgl_uncore_mmio_init(void); void tgl_l_uncore_mmio_init(void); +void adl_uncore_mmio_init(void); int snb_pci2phy_map_init(int devid); /* uncore_snbep.c */ diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c index 3049c646fa20..6ddadb482f68 100644 --- a/arch/x86/events/intel/uncore_discovery.c +++ b/arch/x86/events/intel/uncore_discovery.c @@ -494,8 +494,8 @@ void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box) writel(0, box->io_addr); } -static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, - struct perf_event *event) +void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h index 6d735611c281..cfaf558bdb6b 100644 --- a/arch/x86/events/intel/uncore_discovery.h +++ b/arch/x86/events/intel/uncore_discovery.h @@ -139,6 +139,8 @@ void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box); void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box); void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, struct perf_event *event); +void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event); void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box); void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 0f63706cdadf..f698a55bde81 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */ #include "uncore.h" +#include "uncore_discovery.h" /* Uncore IMC PCI IDs */ #define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 @@ -64,6 +65,20 @@ #define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53 #define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660 #define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641 +#define PCI_DEVICE_ID_INTEL_ADL_3_IMC 0x4601 +#define PCI_DEVICE_ID_INTEL_ADL_4_IMC 0x4602 +#define PCI_DEVICE_ID_INTEL_ADL_5_IMC 0x4609 +#define PCI_DEVICE_ID_INTEL_ADL_6_IMC 0x460a +#define PCI_DEVICE_ID_INTEL_ADL_7_IMC 0x4621 +#define PCI_DEVICE_ID_INTEL_ADL_8_IMC 0x4623 +#define PCI_DEVICE_ID_INTEL_ADL_9_IMC 0x4629 +#define PCI_DEVICE_ID_INTEL_ADL_10_IMC 0x4637 +#define PCI_DEVICE_ID_INTEL_ADL_11_IMC 0x463b +#define PCI_DEVICE_ID_INTEL_ADL_12_IMC 0x4648 +#define PCI_DEVICE_ID_INTEL_ADL_13_IMC 0x4649 +#define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650 +#define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668 +#define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -155,6 +170,7 @@ DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); @@ -1334,6 +1350,62 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_2_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_4_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_5_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_6_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_7_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_8_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_9_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_10_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_11_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_12_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_13_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_14_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_15_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_16_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ } }; @@ -1390,7 +1462,8 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void) #define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 #define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000 -static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +static void __uncore_imc_init_box(struct intel_uncore_box *box, + unsigned int base_offset) { struct pci_dev *pdev = tgl_uncore_get_mc_dev(); struct intel_uncore_pmu *pmu = box->pmu; @@ -1417,11 +1490,17 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) addr |= ((resource_size_t)mch_bar << 32); #endif + addr += base_offset; box->io_addr = ioremap(addr, type->mmio_map_size); if (!box->io_addr) pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); } +static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + __uncore_imc_init_box(box, 0); +} + static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { .init_box = tgl_uncore_imc_freerunning_init_box, .exit_box = uncore_mmio_exit_box, @@ -1469,3 +1548,136 @@ void tgl_uncore_mmio_init(void) } /* end of Tiger Lake MMIO uncore support */ + +/* Alder Lake MMIO uncore support */ +#define ADL_UNCORE_IMC_BASE 0xd900 +#define ADL_UNCORE_IMC_MAP_SIZE 0x200 +#define ADL_UNCORE_IMC_CTR 0xe8 +#define ADL_UNCORE_IMC_CTRL 0xd0 +#define ADL_UNCORE_IMC_GLOBAL_CTL 0xc0 +#define ADL_UNCORE_IMC_BOX_CTL 0xc4 +#define ADL_UNCORE_IMC_FREERUNNING_BASE 0xd800 +#define ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE 0x100 + +#define ADL_UNCORE_IMC_CTL_FRZ (1 << 0) +#define ADL_UNCORE_IMC_CTL_RST_CTRL (1 << 1) +#define ADL_UNCORE_IMC_CTL_RST_CTRS (1 << 2) +#define ADL_UNCORE_IMC_CTL_INT (ADL_UNCORE_IMC_CTL_RST_CTRL | \ + ADL_UNCORE_IMC_CTL_RST_CTRS) + +static void adl_uncore_imc_init_box(struct intel_uncore_box *box) +{ + __uncore_imc_init_box(box, ADL_UNCORE_IMC_BASE); + + /* The global control in MC1 can control both MCs. */ + if (box->io_addr && (box->pmu->pmu_idx == 1)) + writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + ADL_UNCORE_IMC_GLOBAL_CTL); +} + +static void adl_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(ADL_UNCORE_IMC_CTL_FRZ, box->io_addr + uncore_mmio_box_ctl(box)); +} + +static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(0, box->io_addr + uncore_mmio_box_ctl(box)); +} + +static struct intel_uncore_ops adl_uncore_mmio_ops = { + .init_box = adl_uncore_imc_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = adl_uncore_mmio_disable_box, + .enable_box = adl_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = intel_generic_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00 +#define ADL_UNC_IMC_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + ADL_UNC_CTL_CHMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET) + +static struct attribute *adl_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + &format_attr_chmask.attr, + &format_attr_edge.attr, + NULL, +}; + +static const struct attribute_group adl_uncore_imc_format_group = { + .name = "format", + .attrs = adl_uncore_imc_formats_attr, +}; + +static struct intel_uncore_type adl_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 64, + .perf_ctr = ADL_UNCORE_IMC_CTR, + .event_ctl = ADL_UNCORE_IMC_CTRL, + .event_mask = ADL_UNC_IMC_EVENT_MASK, + .box_ctl = ADL_UNCORE_IMC_BOX_CTL, + .mmio_offset = 0, + .mmio_map_size = ADL_UNCORE_IMC_MAP_SIZE, + .ops = &adl_uncore_mmio_ops, + .format_group = &adl_uncore_imc_format_group, +}; + +enum perf_adl_uncore_imc_freerunning_types { + ADL_MMIO_UNCORE_IMC_DATA_TOTAL, + ADL_MMIO_UNCORE_IMC_DATA_READ, + ADL_MMIO_UNCORE_IMC_DATA_WRITE, + ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX +}; + +static struct freerunning_counters adl_uncore_imc_freerunning[] = { + [ADL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x40, 0x0, 0x0, 1, 64 }, + [ADL_MMIO_UNCORE_IMC_DATA_READ] = { 0x58, 0x0, 0x0, 1, 64 }, + [ADL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xA0, 0x0, 0x0, 1, 64 }, +}; + +static void adl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + __uncore_imc_init_box(box, ADL_UNCORE_IMC_FREERUNNING_BASE); +} + +static struct intel_uncore_ops adl_uncore_imc_freerunning_ops = { + .init_box = adl_uncore_imc_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct intel_uncore_type adl_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .num_boxes = 2, + .num_freerunning_types = ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE, + .freerunning = adl_uncore_imc_freerunning, + .ops = &adl_uncore_imc_freerunning_ops, + .event_descs = tgl_uncore_imc_events, + .format_group = &tgl_uncore_imc_format_group, +}; + +static struct intel_uncore_type *adl_mmio_uncores[] = { + &adl_uncore_imc, + &adl_uncore_imc_free_running, + NULL +}; + +void adl_uncore_mmio_init(void) +{ + uncore_mmio_uncores = adl_mmio_uncores; +} + +/* end of Alder Lake MMIO uncore support */ -- cgit v1.2.3-58-ga151 From 8c16dc047b5dd8f7b3bf4584fa75733ea0dde7dc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Dec 2021 12:40:29 -0800 Subject: x86/perf: Avoid warning for Arch LBR without XSAVE Some hypervisors support Arch LBR, but without the LBR XSAVE support. The current Arch LBR init code prints a warning when the xsave size (0) is unexpected. Avoid printing the warning for the "no LBR XSAVE" case. Signed-off-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211215204029.150686-1-ak@linux.intel.com --- arch/x86/events/intel/lbr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index f8fd25528935..669c2be14784 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1751,6 +1751,9 @@ static bool is_arch_lbr_xsave_available(void) * Check the LBR state with the corresponding software structure. * Disable LBR XSAVES support if the size doesn't match. */ + if (xfeature_size(XFEATURE_LBR) == 0) + return false; + if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size())) return false; -- cgit v1.2.3-58-ga151 From a06247c6804f1a7c86a2e5398a4c1f1db1471848 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 11 Jan 2022 15:23:09 -0800 Subject: psi: Fix uaf issue when psi trigger is destroyed while being polled With write operation on psi files replacing old trigger with a new one, the lifetime of its waitqueue is totally arbitrary. Overwriting an existing trigger causes its waitqueue to be freed and pending poll() will stumble on trigger->event_wait which was destroyed. Fix this by disallowing to redefine an existing psi trigger. If a write operation is used on a file descriptor with an already existing psi trigger, the operation will fail with EBUSY error. Also bypass a check for psi_disabled in the psi_trigger_destroy as the flag can be flipped after the trigger is created, leading to a memory leak. Fixes: 0e94682b73bf ("psi: introduce psi monitor") Reported-by: syzbot+cdb5dd11c97cc532efad@syzkaller.appspotmail.com Suggested-by: Linus Torvalds Analyzed-by: Eric Biggers Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Eric Biggers Acked-by: Johannes Weiner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220111232309.1786347-1-surenb@google.com --- Documentation/accounting/psi.rst | 3 +- include/linux/psi.h | 2 +- include/linux/psi_types.h | 3 -- kernel/cgroup/cgroup.c | 11 +++++-- kernel/sched/psi.c | 66 ++++++++++++++++++---------------------- 5 files changed, 40 insertions(+), 45 deletions(-) diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst index f2b3439edcc2..860fe651d645 100644 --- a/Documentation/accounting/psi.rst +++ b/Documentation/accounting/psi.rst @@ -92,7 +92,8 @@ Triggers can be set on more than one psi metric and more than one trigger for the same psi metric can be specified. However for each trigger a separate file descriptor is required to be able to poll it separately from others, therefore for each trigger a separate open() syscall should be made even -when opening the same psi interface file. +when opening the same psi interface file. Write operations to a file descriptor +with an already existing psi trigger will fail with EBUSY. Monitors activate only when system enters stall state for the monitored psi metric and deactivates upon exit from the stall state. While system is diff --git a/include/linux/psi.h b/include/linux/psi.h index a70ca833c6d7..f8ce53bfdb2a 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -33,7 +33,7 @@ void cgroup_move_task(struct task_struct *p, struct css_set *to); struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, size_t nbytes, enum psi_res res); -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t); +void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait); diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 516c0fe836fd..1a3cef26d129 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -141,9 +141,6 @@ struct psi_trigger { * events to one per window */ u64 last_event_time; - - /* Refcounting to prevent premature destruction */ - struct kref refcount; }; struct psi_group { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index b31e1465868a..9d05c3ca2d5e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3643,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3650,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&ctx->psi.trigger, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; - psi_trigger_replace(&ctx->psi.trigger, NULL); + psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a679613a7cb7..c137c4d6983e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1162,7 +1162,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; init_waitqueue_head(&t->event_wait); - kref_init(&t->refcount); mutex_lock(&group->trigger_lock); @@ -1191,15 +1190,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, return t; } -static void psi_trigger_destroy(struct kref *ref) +void psi_trigger_destroy(struct psi_trigger *t) { - struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); - struct psi_group *group = t->group; + struct psi_group *group; struct task_struct *task_to_destroy = NULL; - if (static_branch_likely(&psi_disabled)) + /* + * We do not check psi_disabled since it might have been disabled after + * the trigger got created. + */ + if (!t) return; + group = t->group; /* * Wakeup waiters to stop polling. Can happen if cgroup is deleted * from under a polling process. @@ -1235,9 +1238,9 @@ static void psi_trigger_destroy(struct kref *ref) mutex_unlock(&group->trigger_lock); /* - * Wait for both *trigger_ptr from psi_trigger_replace and - * poll_task RCUs to complete their read-side critical sections - * before destroying the trigger and optionally the poll_task + * Wait for psi_schedule_poll_work RCU to complete its read-side + * critical section before destroying the trigger and optionally the + * poll_task. */ synchronize_rcu(); /* @@ -1254,18 +1257,6 @@ static void psi_trigger_destroy(struct kref *ref) kfree(t); } -void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) -{ - struct psi_trigger *old = *trigger_ptr; - - if (static_branch_likely(&psi_disabled)) - return; - - rcu_assign_pointer(*trigger_ptr, new); - if (old) - kref_put(&old->refcount, psi_trigger_destroy); -} - __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, poll_table *wait) { @@ -1275,24 +1266,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (static_branch_likely(&psi_disabled)) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - rcu_read_lock(); - - t = rcu_dereference(*(void __rcu __force **)trigger_ptr); - if (!t) { - rcu_read_unlock(); + t = smp_load_acquire(trigger_ptr); + if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - } - kref_get(&t->refcount); - - rcu_read_unlock(); poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; - kref_put(&t->refcount, psi_trigger_destroy); - return ret; } @@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, buf[buf_size - 1] = '\0'; - new = psi_trigger_create(&psi_system, buf, nbytes, res); - if (IS_ERR(new)) - return PTR_ERR(new); - seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ mutex_lock(&seq->lock); - psi_trigger_replace(&seq->private, new); + + /* Allow only one trigger per file descriptor */ + if (seq->private) { + mutex_unlock(&seq->lock); + return -EBUSY; + } + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) { + mutex_unlock(&seq->lock); + return PTR_ERR(new); + } + + smp_store_release(&seq->private, new); mutex_unlock(&seq->lock); return nbytes; @@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; - psi_trigger_replace(&seq->private, NULL); + psi_trigger_destroy(seq->private); return single_release(inode, file); } -- cgit v1.2.3-58-ga151 From 98b0d890220d45418cfbc5157b3382e6da5a12ab Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:56 +0100 Subject: sched/pelt: Relax the sync of util_sum with util_avg Rick reported performance regressions in bugzilla because of cpu frequency being lower than before: https://bugzilla.kernel.org/show_bug.cgi?id=215045 He bisected the problem to: commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") This commit forces util_sum to be synced with the new util_avg after removing the contribution of a task and before the next periodic sync. By doing so util_sum is rounded to its lower bound and might lost up to LOAD_AVG_MAX-1 of accumulated contribution which has not yet been reflected in util_avg. Instead of always setting util_sum to the low bound of util_avg, which can significantly lower the utilization of root cfs_rq after propagating the change down into the hierarchy, we revert the change of util_sum and propagate the difference. In addition, we also check that cfs's util_sum always stays above the lower bound for a given util_avg as it has been observed that sched_entity's util_sum is sometimes above cfs one. Fixes: 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") Reported-by: Rick Yiu Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 16 +++++++++++++--- kernel/sched/pelt.h | 4 +++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 095b0aa378df..d8f068d72b9d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3381,7 +3381,6 @@ void set_task_rq_fair(struct sched_entity *se, se->avg.last_update_time = n_last_update_time; } - /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant @@ -3449,7 +3448,6 @@ void set_task_rq_fair(struct sched_entity *se, * XXX: only do this for the part of runnable > running ? * */ - static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { @@ -3681,7 +3679,19 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_util; sub_positive(&sa->util_avg, r); - sa->util_sum = sa->util_avg * divider; + sub_positive(&sa->util_sum, r * divider); + /* + * Because of rounding, se->util_sum might ends up being +1 more than + * cfs->util_sum. Although this is not a problem by itself, detaching + * a lot of tasks with the rounding problem between 2 updates of + * util_avg (~1ms) can make cfs->util_sum becoming null whereas + * cfs_util_avg is not. + * Check that util_sum is still above its lower bound for the new + * util_avg. Given that period_contrib might have moved since the last + * sync, we are only sure that util_sum must be above or equal to + * util_avg * minimum possible divider + */ + sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER); r = removed_runnable; sub_positive(&sa->runnable_avg, r); diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index e06071bf3472..c336f5f481bc 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running) } #endif +#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024) + static inline u32 get_pelt_divider(struct sched_avg *avg) { - return LOAD_AVG_MAX - 1024 + avg->period_contrib; + return PELT_MIN_DIVIDER + avg->period_contrib; } static inline void cfs_se_util_change(struct sched_avg *avg) -- cgit v1.2.3-58-ga151 From 7ceb77103001544a43e11d7f3a8a69a2c1f422cf Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:57 +0100 Subject: sched/pelt: Continue to relax the sync of util_sum with util_avg Rick reported performance regressions in bugzilla because of cpu frequency being lower than before: https://bugzilla.kernel.org/show_bug.cgi?id=215045 He bisected the problem to: commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") This commit forces util_sum to be synced with the new util_avg after removing the contribution of a task and before the next periodic sync. By doing so util_sum is rounded to its lower bound and might lost up to LOAD_AVG_MAX-1 of accumulated contribution which has not yet been reflected in util_avg. update_tg_cfs_util() is not the only place where we round util_sum and lost some accumulated contributions that are not already reflected in util_avg. Modify update_tg_cfs_util() and detach_entity_load_avg() to not sync util_sum with the new util_avg. Instead of always setting util_sum to the low bound of util_avg, which can significantly lower the utilization, we propagate the difference. In addition, we also check that cfs's util_sum always stays above the lower bound for a given util_avg as it has been observed that sched_entity's util_sum is sometimes above cfs one. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-3-vincent.guittot@linaro.org --- kernel/sched/fair.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d8f068d72b9d..ad2809cc57c7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3451,11 +3451,11 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3464,13 +3464,20 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq */ divider = get_pelt_divider(&cfs_rq->avg); + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; - se->avg.util_sum = se->avg.util_avg * divider; + new_sum = se->avg.util_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.util_sum; + se->avg.util_sum = new_sum; /* Update parent cfs_rq utilization */ - add_positive(&cfs_rq->avg.util_avg, delta); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + add_positive(&cfs_rq->avg.util_avg, delta_avg); + add_positive(&cfs_rq->avg.util_sum, delta_sum); + + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); } static inline void @@ -3790,7 +3797,11 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum, + cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; -- cgit v1.2.3-58-ga151 From 95246d1ec80b8d19d882cd8eb7ad094e63b41bb8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:58 +0100 Subject: sched/pelt: Relax the sync of runnable_sum with runnable_avg Similarly to util_avg and util_sum, don't sync runnable_sum with the low bound of runnable_avg but only ensure that runnable_sum stays in the correct range. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-4-vincent.guittot@linaro.org --- kernel/sched/fair.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ad2809cc57c7..0e87e1916504 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3483,11 +3483,11 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; - u32 divider; + long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg; + u32 new_sum, divider; /* Nothing to update */ - if (!delta) + if (!delta_avg) return; /* @@ -3498,11 +3498,16 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* Set new sched_entity's runnable */ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg; - se->avg.runnable_sum = se->avg.runnable_avg * divider; + new_sum = se->avg.runnable_avg * divider; + delta_sum = (long)new_sum - (long)se->avg.runnable_sum; + se->avg.runnable_sum = new_sum; /* Update parent cfs_rq runnable */ - add_positive(&cfs_rq->avg.runnable_avg, delta); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + add_positive(&cfs_rq->avg.runnable_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); } static inline void @@ -3702,7 +3707,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_runnable; sub_positive(&sa->runnable_avg, r); - sa->runnable_sum = sa->runnable_avg * divider; + sub_positive(&sa->runnable_sum, r * divider); + /* See sa->util_sum above */ + sa->runnable_sum = max_t(u32, sa->runnable_sum, + sa->runnable_avg * PELT_MIN_DIVIDER); /* * removed_runnable is the unweighted version of removed_load so we @@ -3789,12 +3797,6 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - /* - * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. - * See ___update_load_avg() for details. - */ - u32 divider = get_pelt_divider(&cfs_rq->avg); - dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); @@ -3803,7 +3805,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s cfs_rq->avg.util_avg * PELT_MIN_DIVIDER); sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum, + cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER); add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); -- cgit v1.2.3-58-ga151 From 2d02fa8cc21a93da35cfba462bf8ab87bf2db651 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Jan 2022 14:46:59 +0100 Subject: sched/pelt: Relax the sync of load_sum with load_avg Similarly to util_avg and util_sum, don't sync load_sum with the low bound of load_avg but only ensure that load_sum stays in the correct range. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Sachin Sant Link: https://lkml.kernel.org/r/20220111134659.24961-5-vincent.guittot@linaro.org --- kernel/sched/fair.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0e87e1916504..f4f02c2cff87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3028,9 +3028,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u32 divider = get_pelt_divider(&se->avg); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } #else static inline void @@ -3513,9 +3515,10 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; unsigned long load_avg; u64 load_sum = 0; + s64 delta_sum; u32 divider; if (!runnable_sum) @@ -3542,7 +3545,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq * assuming all tasks are equally runnable. */ if (scale_load_down(gcfs_rq->load.weight)) { - load_sum = div_s64(gcfs_rq->avg.load_sum, + load_sum = div_u64(gcfs_rq->avg.load_sum, scale_load_down(gcfs_rq->load.weight)); } @@ -3559,19 +3562,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); - load_sum = (s64)se_weight(se) * runnable_sum; - load_avg = div_s64(load_sum, divider); - - se->avg.load_sum = runnable_sum; + load_sum = se_weight(se) * runnable_sum; + load_avg = div_u64(load_sum, divider); - delta = load_avg - se->avg.load_avg; - if (!delta) + delta_avg = load_avg - se->avg.load_avg; + if (!delta_avg) return; - se->avg.load_avg = load_avg; + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; - add_positive(&cfs_rq->avg.load_avg, delta); - cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); + /* See update_cfs_rq_load_avg() */ + cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, + cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3687,7 +3693,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) r = removed_load; sub_positive(&sa->load_avg, r); - sa->load_sum = sa->load_avg * divider; + sub_positive(&sa->load_sum, r * divider); + /* See sa->util_sum below */ + sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER); r = removed_util; sub_positive(&sa->util_avg, r); -- cgit v1.2.3-58-ga151 From b171501f258063f5c56dd2c5fdf310802d8d7dc1 Mon Sep 17 00:00:00 2001 From: Cruz Zhao Date: Tue, 11 Jan 2022 17:55:59 +0800 Subject: sched/core: Accounting forceidle time for all tasks except idle task There are two types of forced idle time: forced idle time from cookie'd task and forced idle time form uncookie'd task. The forced idle time from uncookie'd task is actually caused by the cookie'd task in runqueue indirectly, and it's more accurate to measure the capacity loss with the sum of both. Assuming cpu x and cpu y are a pair of SMT siblings, consider the following scenarios: 1.There's a cookie'd task running on cpu x, and there're 4 uncookie'd tasks running on cpu y. For cpu x, there will be 80% forced idle time (from uncookie'd task); for cpu y, there will be 20% forced idle time (from cookie'd task). 2.There's a uncookie'd task running on cpu x, and there're 4 cookie'd tasks running on cpu y. For cpu x, there will be 80% forced idle time (from cookie'd task); for cpu y, there will be 20% forced idle time (from uncookie'd task). The scenario1 can recurrent by stress-ng(scenario2 can recurrent similary): (cookie'd)taskset -c x stress-ng -c 1 -l 100 (uncookie'd)taskset -c y stress-ng -c 4 -l 100 In the above two scenarios, the total capacity loss is 1 cpu, but in scenario1, the cookie'd forced idle time tells us 20% cpu capacity loss, in scenario2, the cookie'd forced idle time tells us 80% cpu capacity loss, which are not accurate. It'll be more accurate to measure with cookie'd forced idle time and uncookie'd forced idle time. Signed-off-by: Cruz Zhao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josh Don Link: https://lore.kernel.org/r/1641894961-9241-2-git-send-email-CruzZhao@linux.alibaba.com --- kernel/sched/core.c | 3 +-- kernel/sched/core_sched.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83872f95a1ea..0d2ab2a2f9fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5822,8 +5822,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } if (schedstat_enabled() && rq->core->core_forceidle_count) { - if (cookie) - rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_start = rq_clock(rq->core); rq->core->core_forceidle_occupation = occ; } diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 1fb45672ec85..c8746a9a7ada 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -277,7 +277,7 @@ void __sched_core_account_forceidle(struct rq *rq) rq_i = cpu_rq(i); p = rq_i->core_pick ?: rq_i->curr; - if (!p->core_cookie) + if (p == rq_i->idle) continue; __schedstat_add(p->stats.core_forceidle_sum, delta); -- cgit v1.2.3-58-ga151 From a315da5e686b02b20c1713dda818e8fb691526bb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Dec 2021 21:59:00 -0800 Subject: sched/fair: Fix all kernel-doc warnings Quieten all kernel-doc warnings in kernel/sched/fair.c: kernel/sched/fair.c:3663: warning: No description found for return value of 'update_cfs_rq_load_avg' kernel/sched/fair.c:8601: warning: No description found for return value of 'asym_smt_can_pull_tasks' kernel/sched/fair.c:8673: warning: Function parameter or member 'sds' not described in 'update_sg_lb_stats' kernel/sched/fair.c:9483: warning: contents before sections Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ricardo Neri Acked-by: Vincent Guittot Link: https://lore.kernel.org/r/20211218055900.2704-1-rdunlap@infradead.org --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f4f02c2cff87..5146163bfabb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3668,7 +3668,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * - * Returns true if the load decayed or we removed load. + * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. @@ -8573,6 +8573,8 @@ group_type group_classify(unsigned int imbalance_pct, * * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings * of @dst_cpu are idle and @sg has lower priority. + * + * Return: true if @dst_cpu can pull tasks, false otherwise. */ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, @@ -8648,6 +8650,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. + * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @sg_status: Holds flag indicating the status of the sched_group @@ -9455,12 +9458,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. + * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * - * @env: The load balancing environment. - * * Return: - The busiest group if imbalance exists. */ static struct sched_group *find_busiest_group(struct lb_env *env) -- cgit v1.2.3-58-ga151 From 7e406d1ff39b8ee574036418a5043c86723170cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Dec 2021 01:04:57 +0100 Subject: sched: Avoid double preemption in __cond_resched_*lock*() For PREEMPT/DYNAMIC_PREEMPT the *_unlock() will already trigger a preemption, no point in then calling preempt_schedule_common() *again*. Use _cond_resched() instead, since this is a NOP for the preemptible configs while it provide a preemption point for the others. Reported-by: xuhaifeng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YcGnvDEYBwOiV0cR@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d2ab2a2f9fe..56b428c8ea96 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8218,9 +8218,7 @@ int __cond_resched_lock(spinlock_t *lock) if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; spin_lock(lock); @@ -8238,9 +8236,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { read_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; read_lock(lock); @@ -8258,9 +8254,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock) if (rwlock_needbreak(lock) || resched) { write_unlock(lock); - if (resched) - preempt_schedule_common(); - else + if (!_cond_resched()) cpu_relax(); ret = 1; write_lock(lock); -- cgit v1.2.3-58-ga151 From 0e3872499de1a1230cef5221607d71aa09264bd5 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 7 Jan 2022 17:52:54 +0800 Subject: kernel/sched: Remove dl_boosted flag comment since commit 2279f540ea7d ("sched/deadline: Fix priority inheritance with multiple scheduling classes"), we should not keep it here. Signed-off-by: Hui Su Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Link: https://lore.kernel.org/r/20220107095254.GA49258@localhost.localdomain --- include/linux/sched.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7a1f16df66e3..7f8b4495e243 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -614,10 +614,6 @@ struct sched_dl_entity { * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * - * @dl_boosted tells if we are boosted due to DI. If so we are - * outside bandwidth enforcement mechanism (but only until we - * exit the critical section); - * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * -- cgit v1.2.3-58-ga151 From 4624f199327a704dd1069aca1c3cadb8f2a28c6f Mon Sep 17 00:00:00 2001 From: Zechuan Chen Date: Tue, 28 Dec 2021 19:13:38 +0800 Subject: perf probe: Fix ppc64 'perf probe add events failed' case Because of commit bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms lookup across powerpc ABIv1 and ABIv2"), in ppc64 ABIv1, our perf command eliminates the need to use the prefix "." at the symbol name. But when the command "perf probe -a schedule" is executed on ppc64 ABIv1, it obtains two symbol address information through /proc/kallsyms, for example: cat /proc/kallsyms | grep -w schedule c000000000657020 T .schedule c000000000d4fdb8 D schedule The symbol "D schedule" is not a function symbol, and perf will print: "p:probe/schedule _text+13958584"Failed to write event: Invalid argument Therefore, when searching symbols from map and adding probe point for them, a symbol type check is added. If the type of symbol is not a function, skip it. Fixes: bf794bf52a80c627 ("powerpc/kprobes: Fix kallsyms lookup across powerpc ABIv1 and ABIv2") Signed-off-by: Zechuan Chen Acked-by: Masami Hiramatsu Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jianlin Lv Cc: Jin Yao Cc: Jiri Olsa Cc: Mark Rutland Cc: Michael Ellerman Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Yang Jihong Link: https://lore.kernel.org/r/20211228111338.218602-1-chenzechuan1@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index b2a02c9ab8ea..a834918a0a0d 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -3083,6 +3083,9 @@ static int find_probe_trace_events_from_map(struct perf_probe_event *pev, for (j = 0; j < num_matched_functions; j++) { sym = syms[j]; + if (sym->type != STT_FUNC) + continue; + /* There can be duplicated symbols in the map */ for (i = 0; i < j; i++) if (sym->start == syms[i]->start) { -- cgit v1.2.3-58-ga151 From 1855b796f2f672cbb25400be2d3171c26fc869a3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 17 Jan 2022 13:09:28 -0300 Subject: perf affinity: Allow passing a NULL arg to affinity__cleanup() Just like with free(), NULL is checked to avoid having all callers do it. Its convenient for when not using affinity setup/cleanup for dummy CPU maps, i.e. CPU maps for pid targets. Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117160931.1191712-2-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/affinity.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/affinity.c b/tools/perf/util/affinity.c index f1e30d566db3..4d216c0dc425 100644 --- a/tools/perf/util/affinity.c +++ b/tools/perf/util/affinity.c @@ -62,7 +62,7 @@ void affinity__set(struct affinity *a, int cpu) clear_bit(cpu, a->sched_cpus); } -void affinity__cleanup(struct affinity *a) +static void __affinity__cleanup(struct affinity *a) { int cpu_set_size = get_cpu_set_size(); @@ -71,3 +71,9 @@ void affinity__cleanup(struct affinity *a) zfree(&a->sched_cpus); zfree(&a->orig_cpus); } + +void affinity__cleanup(struct affinity *a) +{ + if (a != NULL) + __affinity__cleanup(a); +} -- cgit v1.2.3-58-ga151 From 49de179577e7b05b57f625bf05cdc60a72de38d0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 17 Jan 2022 13:09:29 -0300 Subject: perf stat: No need to setup affinities when starting a workload I.e. the simple: $ perf stat sleep 1 Uses a dummy CPU map and thus there is no need to setup/cleanup affinities to avoid IPIs, etc. With this we're down to a sched_getaffinity() call, in the libnuma initialization, that probably can be removed in a followup patch. Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117160931.1191712-3-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 973ade18b72a..934e992c966f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -788,7 +788,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) const bool forks = (argc > 0); bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false; struct evlist_cpu_iterator evlist_cpu_itr; - struct affinity affinity; + struct affinity saved_affinity, *affinity = NULL; int err; bool second_pass = false; @@ -803,8 +803,11 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) if (group) evlist__set_leader(evsel_list); - if (affinity__setup(&affinity) < 0) - return -1; + if (!cpu_map__is_dummy(evsel_list->core.cpus)) { + if (affinity__setup(&saved_affinity) < 0) + return -1; + affinity = &saved_affinity; + } evlist__for_each_entry(evsel_list, counter) { if (bpf_counter__load(counter, &target)) @@ -813,7 +816,7 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) all_counters_use_bpf = false; } - evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; /* @@ -869,7 +872,7 @@ try_again: */ /* First close errored or weak retry */ - evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; if (!counter->reset_group && !counter->errored) @@ -878,7 +881,7 @@ try_again: perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx); } /* Now reopen weak */ - evlist__for_each_cpu(evlist_cpu_itr, evsel_list, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; if (!counter->reset_group && !counter->errored) @@ -904,7 +907,7 @@ try_again_reset: counter->supported = true; } } - affinity__cleanup(&affinity); + affinity__cleanup(affinity); evlist__for_each_entry(evsel_list, counter) { if (!counter->supported) { -- cgit v1.2.3-58-ga151 From f350ee95498a3fa65c37ed597d9c051c6b2b6974 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 17 Jan 2022 13:09:30 -0300 Subject: perf evlist: No need to setup affinities when enabling events for pid targets When the target is a pid, not started by 'perf stat' we need to enable the events, and in that case there is no need to setup affinities as we use a dummy CPU map, with just one entry set to -1. So stop doing it to avoid this needless call to sched_getaffinity(): # strace -ke sched_getaffinity perf stat -e cycles -p 241957 sleep 1 sched_getaffinity(0, 512, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) = 8 > /usr/lib64/libc-2.33.so(sched_getaffinity@@GLIBC_2.3.4+0x1a) [0xe6eea] > /var/home/acme/bin/perf(affinity__setup+0x6a) [0x5329ca] > /var/home/acme/bin/perf(__evlist__enable.constprop.0+0x23) [0x4b9693] > /var/home/acme/bin/perf(enable_counters+0x14d) [0x42de5d] > /var/home/acme/bin/perf(cmd_stat+0x2358) [0x4310c8] > /var/home/acme/bin/perf(run_builtin+0x6a) [0x4a2cfa] > /var/home/acme/bin/perf(main+0x612) [0x40f8c2] > /usr/lib64/libc-2.33.so(__libc_start_main+0xd4) [0x27b74] > /var/home/acme/bin/perf(_start+0x2d) [0x40fadd] Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117160931.1191712-4-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 6e88d404b5b3..ae6d4363da76 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -487,12 +487,16 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name) { struct evsel *pos; struct evlist_cpu_iterator evlist_cpu_itr; - struct affinity affinity; + struct affinity saved_affinity, *affinity = NULL; - if (affinity__setup(&affinity) < 0) - return; + // See explanation in evlist__close() + if (!cpu_map__is_dummy(evlist->core.cpus)) { + if (affinity__setup(&saved_affinity) < 0) + return; + affinity = &saved_affinity; + } - evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) { pos = evlist_cpu_itr.evsel; if (evsel__strcmp(pos, evsel_name)) continue; @@ -500,7 +504,7 @@ static void __evlist__enable(struct evlist *evlist, char *evsel_name) continue; evsel__enable_cpu(pos, evlist_cpu_itr.cpu_map_idx); } - affinity__cleanup(&affinity); + affinity__cleanup(affinity); evlist__for_each_entry(evlist, pos) { if (evsel__strcmp(pos, evsel_name)) continue; -- cgit v1.2.3-58-ga151 From 0d3d237651fd7a01fe5dc501b0d170a43d8156ba Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 17 Jan 2022 13:09:31 -0300 Subject: perf evlist: No need to setup affinities when disabling events for pid targets When the target is a pid, not started by 'perf stat' we need to disable the events, and in that case there is no need to setup affinities as we use a dummy CPU map, with just one entry set to -1. So stop doing it to avoid this needless call to sched_getaffinity(): # strace -ke sched_getaffinity perf stat -e cycles -p 241957 sleep 1 sched_getaffinity(0, 512, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]) = 8 > /usr/lib64/libc-2.33.so(sched_getaffinity@@GLIBC_2.3.4+0x1a) [0xe6eea] > /var/home/acme/bin/perf(affinity__setup+0x6a) [0x532a2a] > /var/home/acme/bin/perf(__evlist__disable.constprop.0+0x27) [0x4b9827] > /var/home/acme/bin/perf(cmd_stat+0x29b5) [0x431725] > /var/home/acme/bin/perf(run_builtin+0x6a) [0x4a2cfa] > /var/home/acme/bin/perf(main+0x612) [0x40f8c2] > /usr/lib64/libc-2.33.so(__libc_start_main+0xd4) [0x27b74] > /var/home/acme/bin/perf(_start+0x2d) [0x40fadd] Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117160931.1191712-5-acme@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ae6d4363da76..eaad04e1672a 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -430,15 +430,19 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name) { struct evsel *pos; struct evlist_cpu_iterator evlist_cpu_itr; - struct affinity affinity; + struct affinity saved_affinity, *affinity = NULL; bool has_imm = false; - if (affinity__setup(&affinity) < 0) - return; + // See explanation in evlist__close() + if (!cpu_map__is_dummy(evlist->core.cpus)) { + if (affinity__setup(&saved_affinity) < 0) + return; + affinity = &saved_affinity; + } /* Disable 'immediate' events last */ for (int imm = 0; imm <= 1; imm++) { - evlist__for_each_cpu(evlist_cpu_itr, evlist, &affinity) { + evlist__for_each_cpu(evlist_cpu_itr, evlist, affinity) { pos = evlist_cpu_itr.evsel; if (evsel__strcmp(pos, evsel_name)) continue; @@ -454,7 +458,7 @@ static void __evlist__disable(struct evlist *evlist, char *evsel_name) break; } - affinity__cleanup(&affinity); + affinity__cleanup(affinity); evlist__for_each_entry(evlist, pos) { if (evsel__strcmp(pos, evsel_name)) continue; -- cgit v1.2.3-58-ga151 From 2cb52046d186863e16ac82850c0e225462e493f1 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 17 Jan 2022 16:08:25 +0000 Subject: ALSA: hda: cs35l41: Avoid overwriting register patch regmap_register_patch can't be used to apply the probe sequence as a patch is already registers with the regmap by cs35l41_register_errata_patch and only a single patch can be attached to a single regmap. The driver doesn't currently rely on a cache sync to re-apply this probe sequence so simply switch it to a multi write. Fixes: 7b2f3eb492da ("ALSA: hda: cs35l41: Add support for CS35L41 in HDA systems") Signed-off-by: Charles Keepax Signed-off-by: Lucas Tanure Link: https://lore.kernel.org/r/20220117160830.709403-1-tanureal@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 30b40d865863..c47c5f0b4e59 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -480,7 +480,7 @@ int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int i acpi_hw_cfg = NULL; if (cs35l41->reg_seq->probe) { - ret = regmap_register_patch(cs35l41->regmap, cs35l41->reg_seq->probe, + ret = regmap_multi_reg_write(cs35l41->regmap, cs35l41->reg_seq->probe, cs35l41->reg_seq->num_probe); if (ret) { dev_err(cs35l41->dev, "Fail to apply probe reg patch: %d\n", ret); -- cgit v1.2.3-58-ga151 From 6e4320d8ecbc8711209b3075f2d896667006fa37 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 17 Jan 2022 16:08:26 +0000 Subject: ALSA: hda: cs35l41: Add calls to newly added test key function The test key now needs to be manually held when calling cs35l41_register_errata_patch, after patch: Add the missing function calls to this driver. Fixes: f517ba4924ad ("ASoC: cs35l41: Add support for hibernate memory retention mode") Signed-off-by: Charles Keepax Signed-off-by: Lucas Tanure Link: https://lore.kernel.org/r/20220117160830.709403-2-tanureal@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index c47c5f0b4e59..509a380f9be7 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -463,6 +463,10 @@ int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int i goto err; } + ret = cs35l41_test_key_unlock(cs35l41->dev, cs35l41->regmap); + if (ret) + goto err; + ret = cs35l41_register_errata_patch(cs35l41->dev, cs35l41->regmap, reg_revid); if (ret) goto err; @@ -473,6 +477,10 @@ int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int i goto err; } + ret = cs35l41_test_key_lock(cs35l41->dev, cs35l41->regmap); + if (ret) + goto err; + ret = cs35l41_hda_apply_properties(cs35l41, acpi_hw_cfg); if (ret) goto err; -- cgit v1.2.3-58-ga151 From 77dc3a6ee2eb5851535fe3a84fc31bf0705e4a2e Mon Sep 17 00:00:00 2001 From: Lucas Tanure Date: Mon, 17 Jan 2022 16:08:27 +0000 Subject: ALSA: hda: cs35l41: Move cs35l41* calls to its own symbol namespace Create own namespace and avoid polluting the global namespace Signed-off-by: Lucas Tanure Link: https://lore.kernel.org/r/20220117160830.709403-3-tanureal@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 5 ++--- sound/pci/hda/cs35l41_hda_i2c.c | 1 + sound/pci/hda/cs35l41_hda_spi.c | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 509a380f9be7..c4f25e48dcc0 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -514,7 +514,7 @@ err: return ret; } -EXPORT_SYMBOL_GPL(cs35l41_hda_probe); +EXPORT_SYMBOL_NS_GPL(cs35l41_hda_probe, SND_HDA_SCODEC_CS35L41); int cs35l41_hda_remove(struct device *dev) { @@ -528,8 +528,7 @@ int cs35l41_hda_remove(struct device *dev) return 0; } -EXPORT_SYMBOL_GPL(cs35l41_hda_remove); - +EXPORT_SYMBOL_NS_GPL(cs35l41_hda_remove, SND_HDA_SCODEC_CS35L41); MODULE_DESCRIPTION("CS35L41 HDA Driver"); MODULE_AUTHOR("Lucas Tanure, Cirrus Logic Inc, "); diff --git a/sound/pci/hda/cs35l41_hda_i2c.c b/sound/pci/hda/cs35l41_hda_i2c.c index 4a9462fb5c14..eeb387853ee3 100644 --- a/sound/pci/hda/cs35l41_hda_i2c.c +++ b/sound/pci/hda/cs35l41_hda_i2c.c @@ -62,5 +62,6 @@ static struct i2c_driver cs35l41_i2c_driver = { module_i2c_driver(cs35l41_i2c_driver); MODULE_DESCRIPTION("HDA CS35L41 driver"); +MODULE_IMPORT_NS(SND_HDA_SCODEC_CS35L41); MODULE_AUTHOR("Lucas Tanure "); MODULE_LICENSE("GPL"); diff --git a/sound/pci/hda/cs35l41_hda_spi.c b/sound/pci/hda/cs35l41_hda_spi.c index 77426e96c58f..15345a72b9d1 100644 --- a/sound/pci/hda/cs35l41_hda_spi.c +++ b/sound/pci/hda/cs35l41_hda_spi.c @@ -59,5 +59,6 @@ static struct spi_driver cs35l41_spi_driver = { module_spi_driver(cs35l41_spi_driver); MODULE_DESCRIPTION("HDA CS35L41 driver"); +MODULE_IMPORT_NS(SND_HDA_SCODEC_CS35L41); MODULE_AUTHOR("Lucas Tanure "); MODULE_LICENSE("GPL"); -- cgit v1.2.3-58-ga151 From cd8abf7d04c940c627ceb6f416b2142d3e7b36dd Mon Sep 17 00:00:00 2001 From: Lucas Tanure Date: Mon, 17 Jan 2022 16:08:28 +0000 Subject: ALSA: hda: cs35l41: Add missing default cases Add switch default cases at gpio pins configs Signed-off-by: Lucas Tanure Link: https://lore.kernel.org/r/20220117160830.709403-4-tanureal@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index c4f25e48dcc0..82f982f574a9 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -161,6 +161,9 @@ static void cs35l41_hda_playback_hook(struct device *dev, int action) if (reg_seq->close) ret = regmap_multi_reg_write(reg, reg_seq->close, reg_seq->num_close); break; + default: + ret = -EINVAL; + break; } if (ret) @@ -227,6 +230,8 @@ static int cs35l41_hda_apply_properties(struct cs35l41_hda *cs35l41, internal_boost = true; switch (hw_cfg->gpio1_func) { + case CS35L41_NOT_USED: + break; case CS35l41_VSPK_SWITCH: regmap_update_bits(cs35l41->regmap, CS35L41_GPIO_PAD_CONTROL, CS35L41_GPIO1_CTRL_MASK, 1 << CS35L41_GPIO1_CTRL_SHIFT); @@ -235,13 +240,21 @@ static int cs35l41_hda_apply_properties(struct cs35l41_hda *cs35l41, regmap_update_bits(cs35l41->regmap, CS35L41_GPIO_PAD_CONTROL, CS35L41_GPIO1_CTRL_MASK, 2 << CS35L41_GPIO1_CTRL_SHIFT); break; + default: + dev_err(cs35l41->dev, "Invalid function %d for GPIO1\n", hw_cfg->gpio1_func); + return -EINVAL; } switch (hw_cfg->gpio2_func) { + case CS35L41_NOT_USED: + break; case CS35L41_INTERRUPT: regmap_update_bits(cs35l41->regmap, CS35L41_GPIO_PAD_CONTROL, CS35L41_GPIO2_CTRL_MASK, 2 << CS35L41_GPIO2_CTRL_SHIFT); break; + default: + dev_err(cs35l41->dev, "Invalid function %d for GPIO2\n", hw_cfg->gpio2_func); + return -EINVAL; } if (internal_boost) { -- cgit v1.2.3-58-ga151 From a025df02ce424fa77f6bc6aa195db21677e11274 Mon Sep 17 00:00:00 2001 From: Lucas Tanure Date: Mon, 17 Jan 2022 16:08:29 +0000 Subject: ALSA: hda: cs35l41: Make use of the helper function dev_err_probe() When possible use dev_err_probe help to properly deal with the PROBE_DEFER error, the benefit is that DEFER issue will be logged in the devices_deferred debugfs file. Using dev_err_probe() can reduce code size, and the error value gets printed. Signed-off-by: Lucas Tanure Link: https://lore.kernel.org/r/20220117160830.709403-5-tanureal@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 82f982f574a9..c317b392c3e3 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -429,8 +429,7 @@ int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int i if (ret == -EBUSY) { dev_info(cs35l41->dev, "Reset line busy, assuming shared reset\n"); } else { - if (ret != -EPROBE_DEFER) - dev_err(cs35l41->dev, "Failed to get reset GPIO: %d\n", ret); + dev_err_probe(cs35l41->dev, ret, "Failed to get reset GPIO: %d\n", ret); goto err; } } -- cgit v1.2.3-58-ga151 From 8c286a0f973a81201a0cef72a7ca55eda29fc35c Mon Sep 17 00:00:00 2001 From: Lucas Tanure Date: Mon, 17 Jan 2022 16:08:30 +0000 Subject: ALSA: hda: cs35l41: Tidyup code Clean up and simplify cs35l41_hda_bind function Signed-off-by: Lucas Tanure Link: https://lore.kernel.org/r/20220117160830.709403-6-tanureal@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 99 ++++++++++++++++++++--------------------- sound/pci/hda/cs35l41_hda.h | 2 +- sound/pci/hda/cs35l41_hda_i2c.c | 1 - sound/pci/hda/cs35l41_hda_spi.c | 1 - 4 files changed, 49 insertions(+), 54 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index c317b392c3e3..3f9ddfb4eaf3 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 // -// cs35l41.c -- CS35l41 ALSA HDA audio driver +// CS35l41 ALSA HDA audio driver // // Copyright 2021 Cirrus Logic, Inc. // @@ -17,19 +17,19 @@ #include "cs35l41_hda.h" static const struct reg_sequence cs35l41_hda_config[] = { - { CS35L41_PLL_CLK_CTRL, 0x00000430 }, //3200000Hz, BCLK Input, PLL_REFCLK_EN = 1 - { CS35L41_GLOBAL_CLK_CTRL, 0x00000003 }, //GLOBAL_FS = 48 kHz - { CS35L41_SP_ENABLES, 0x00010000 }, //ASP_RX1_EN = 1 - { CS35L41_SP_RATE_CTRL, 0x00000021 }, //ASP_BCLK_FREQ = 3.072 MHz - { CS35L41_SP_FORMAT, 0x20200200 }, //24 bits, I2S, BCLK Slave, FSYNC Slave - { CS35L41_DAC_PCM1_SRC, 0x00000008 }, //DACPCM1_SRC = ASPRX1 - { CS35L41_AMP_DIG_VOL_CTRL, 0x00000000 }, //AMP_VOL_PCM 0.0 dB - { CS35L41_AMP_GAIN_CTRL, 0x00000084 }, //AMP_GAIN_PCM 4.5 dB - { CS35L41_PWR_CTRL2, 0x00000001 }, //AMP_EN = 1 + { CS35L41_PLL_CLK_CTRL, 0x00000430 }, // 3200000Hz, BCLK Input, PLL_REFCLK_EN = 1 + { CS35L41_GLOBAL_CLK_CTRL, 0x00000003 }, // GLOBAL_FS = 48 kHz + { CS35L41_SP_ENABLES, 0x00010000 }, // ASP_RX1_EN = 1 + { CS35L41_SP_RATE_CTRL, 0x00000021 }, // ASP_BCLK_FREQ = 3.072 MHz + { CS35L41_SP_FORMAT, 0x20200200 }, // 24 bits, I2S, BCLK Slave, FSYNC Slave + { CS35L41_DAC_PCM1_SRC, 0x00000008 }, // DACPCM1_SRC = ASPRX1 + { CS35L41_AMP_DIG_VOL_CTRL, 0x00000000 }, // AMP_VOL_PCM 0.0 dB + { CS35L41_AMP_GAIN_CTRL, 0x00000084 }, // AMP_GAIN_PCM 4.5 dB + { CS35L41_PWR_CTRL2, 0x00000001 }, // AMP_EN = 1 }; static const struct reg_sequence cs35l41_hda_start_bst[] = { - { CS35L41_PWR_CTRL2, 0x00000021 }, //BST_EN = 10, AMP_EN = 1 + { CS35L41_PWR_CTRL2, 0x00000021 }, // BST_EN = 10, AMP_EN = 1 { CS35L41_PWR_CTRL1, 0x00000001, 3000}, // set GLOBAL_EN = 1 }; @@ -60,7 +60,7 @@ static const struct reg_sequence cs35l41_stop_ext_vspk[] = { { 0x00000040, 0x00000055 }, { 0x00000040, 0x000000AA }, { 0x00007438, 0x00585941 }, - { 0x00002014, 0x00000000, 3000}, //set GLOBAL_EN = 0 + { 0x00002014, 0x00000000, 3000}, // set GLOBAL_EN = 0 { 0x0000742C, 0x00000009 }, { 0x00007438, 0x00580941 }, { 0x00011008, 0x00000001 }, @@ -78,7 +78,7 @@ static const struct reg_sequence cs35l41_safe_to_active[] = { { 0x0000742C, 0x0000000F }, { 0x0000742C, 0x00000079 }, { 0x00007438, 0x00585941 }, - { CS35L41_PWR_CTRL1, 0x00000001, 2000 }, //GLOBAL_EN = 1 + { CS35L41_PWR_CTRL1, 0x00000001, 2000 }, // GLOBAL_EN = 1 { 0x0000742C, 0x000000F9 }, { 0x00007438, 0x00580941 }, { 0x00000040, 0x000000CC }, @@ -89,8 +89,8 @@ static const struct reg_sequence cs35l41_active_to_safe[] = { { 0x00000040, 0x00000055 }, { 0x00000040, 0x000000AA }, { 0x00007438, 0x00585941 }, - { CS35L41_AMP_DIG_VOL_CTRL, 0x0000A678 }, //AMP_VOL_PCM Mute - { CS35L41_PWR_CTRL2, 0x00000000 }, //AMP_EN = 0 + { CS35L41_AMP_DIG_VOL_CTRL, 0x0000A678 }, // AMP_VOL_PCM Mute + { CS35L41_PWR_CTRL2, 0x00000000 }, // AMP_EN = 0 { CS35L41_PWR_CTRL1, 0x00000000 }, { 0x0000742C, 0x00000009, 2000 }, { 0x00007438, 0x00580941 }, @@ -168,7 +168,6 @@ static void cs35l41_hda_playback_hook(struct device *dev, int action) if (ret) dev_warn(cs35l41->dev, "Failed to apply multi reg write: %d\n", ret); - } static int cs35l41_hda_channel_map(struct device *dev, unsigned int tx_num, unsigned int *tx_slot, @@ -185,20 +184,19 @@ static int cs35l41_hda_bind(struct device *dev, struct device *master, void *mas struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev); struct hda_component *comps = master_data; - if (comps && cs35l41->index >= 0 && cs35l41->index < HDA_MAX_COMPONENTS) - comps = &comps[cs35l41->index]; - else + if (!comps || cs35l41->index < 0 || cs35l41->index >= HDA_MAX_COMPONENTS) return -EINVAL; - if (!comps->dev) { - comps->dev = dev; - strscpy(comps->name, dev_name(dev), sizeof(comps->name)); - comps->playback_hook = cs35l41_hda_playback_hook; - comps->set_channel_map = cs35l41_hda_channel_map; - return 0; - } + comps = &comps[cs35l41->index]; + if (comps->dev) + return -EBUSY; - return -EBUSY; + comps->dev = dev; + strscpy(comps->name, dev_name(dev), sizeof(comps->name)); + comps->playback_hook = cs35l41_hda_playback_hook; + comps->set_channel_map = cs35l41_hda_channel_map; + + return 0; } static void cs35l41_hda_unbind(struct device *dev, struct device *master, void *master_data) @@ -269,11 +267,7 @@ static int cs35l41_hda_apply_properties(struct cs35l41_hda *cs35l41, cs35l41->reg_seq = &cs35l41_hda_reg_seq_ext_bst; } - ret = cs35l41_hda_channel_map(cs35l41->dev, 0, NULL, 1, (unsigned int *)&hw_cfg->spk_pos); - if (ret) - return ret; - - return 0; + return cs35l41_hda_channel_map(cs35l41->dev, 0, NULL, 1, (unsigned int *)&hw_cfg->spk_pos); } static struct cs35l41_hda_hw_config *cs35l41_hda_read_acpi(struct cs35l41_hda *cs35l41, @@ -282,7 +276,7 @@ static struct cs35l41_hda_hw_config *cs35l41_hda_read_acpi(struct cs35l41_hda *c struct cs35l41_hda_hw_config *hw_cfg; u32 values[HDA_MAX_COMPONENTS]; struct acpi_device *adev; - struct device *acpi_dev; + struct device *physdev; char *property; size_t nval; int i, ret; @@ -293,11 +287,11 @@ static struct cs35l41_hda_hw_config *cs35l41_hda_read_acpi(struct cs35l41_hda *c return ERR_PTR(-ENODEV); } - acpi_dev = get_device(acpi_get_first_physical_node(adev)); + physdev = get_device(acpi_get_first_physical_node(adev)); acpi_dev_put(adev); property = "cirrus,dev-index"; - ret = device_property_count_u32(acpi_dev, property); + ret = device_property_count_u32(physdev, property); if (ret <= 0) goto no_acpi_dsd; @@ -307,7 +301,7 @@ static struct cs35l41_hda_hw_config *cs35l41_hda_read_acpi(struct cs35l41_hda *c } nval = ret; - ret = device_property_read_u32_array(acpi_dev, property, values, nval); + ret = device_property_read_u32_array(physdev, property, values, nval); if (ret) goto err; @@ -324,7 +318,9 @@ static struct cs35l41_hda_hw_config *cs35l41_hda_read_acpi(struct cs35l41_hda *c goto err; } - /* No devm_ version as CLSA0100, in no_acpi_dsd case, can't use devm version */ + /* To use the same release code for all laptop variants we can't use devm_ version of + * gpiod_get here, as CLSA010* don't have a fully functional bios with an _DSD node + */ cs35l41->reset_gpio = fwnode_gpiod_get_index(&adev->fwnode, "reset", cs35l41->index, GPIOD_OUT_LOW, "cs35l41-reset"); @@ -335,46 +331,46 @@ static struct cs35l41_hda_hw_config *cs35l41_hda_read_acpi(struct cs35l41_hda *c } property = "cirrus,speaker-position"; - ret = device_property_read_u32_array(acpi_dev, property, values, nval); + ret = device_property_read_u32_array(physdev, property, values, nval); if (ret) goto err_free; hw_cfg->spk_pos = values[cs35l41->index]; property = "cirrus,gpio1-func"; - ret = device_property_read_u32_array(acpi_dev, property, values, nval); + ret = device_property_read_u32_array(physdev, property, values, nval); if (ret) goto err_free; hw_cfg->gpio1_func = values[cs35l41->index]; property = "cirrus,gpio2-func"; - ret = device_property_read_u32_array(acpi_dev, property, values, nval); + ret = device_property_read_u32_array(physdev, property, values, nval); if (ret) goto err_free; hw_cfg->gpio2_func = values[cs35l41->index]; property = "cirrus,boost-peak-milliamp"; - ret = device_property_read_u32_array(acpi_dev, property, values, nval); + ret = device_property_read_u32_array(physdev, property, values, nval); if (ret == 0) hw_cfg->bst_ipk = values[cs35l41->index]; property = "cirrus,boost-ind-nanohenry"; - ret = device_property_read_u32_array(acpi_dev, property, values, nval); + ret = device_property_read_u32_array(physdev, property, values, nval); if (ret == 0) hw_cfg->bst_ind = values[cs35l41->index]; property = "cirrus,boost-cap-microfarad"; - ret = device_property_read_u32_array(acpi_dev, property, values, nval); + ret = device_property_read_u32_array(physdev, property, values, nval); if (ret == 0) hw_cfg->bst_cap = values[cs35l41->index]; - put_device(acpi_dev); + put_device(physdev); return hw_cfg; err_free: kfree(hw_cfg); err: - put_device(acpi_dev); + put_device(physdev); dev_err(cs35l41->dev, "Failed property %s: %d\n", property, ret); return ERR_PTR(ret); @@ -383,18 +379,18 @@ no_acpi_dsd: /* * Device CLSA0100 doesn't have _DSD so a gpiod_get by the label reset won't work. * And devices created by i2c-multi-instantiate don't have their device struct pointing to - * the correct fwnode, so acpi_dev must be used here + * the correct fwnode, so acpi_dev must be used here. * And devm functions expect that the device requesting the resource has the correct - * fwnode + * fwnode. */ if (strncmp(hid, "CLSA0100", 8) != 0) return ERR_PTR(-EINVAL); /* check I2C address to assign the index */ cs35l41->index = id == 0x40 ? 0 : 1; - cs35l41->reset_gpio = gpiod_get_index(acpi_dev, NULL, 0, GPIOD_OUT_HIGH); + cs35l41->reset_gpio = gpiod_get_index(physdev, NULL, 0, GPIOD_OUT_HIGH); cs35l41->vspk_always_on = true; - put_device(acpi_dev); + put_device(physdev); return NULL; } @@ -449,7 +445,8 @@ int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int i ret = regmap_read(cs35l41->regmap, CS35L41_IRQ1_STATUS3, &int_sts); if (ret || (int_sts & CS35L41_OTP_BOOT_ERR)) { - dev_err(cs35l41->dev, "OTP Boot error\n"); + dev_err(cs35l41->dev, "OTP Boot status %x error: %d\n", + int_sts & CS35L41_OTP_BOOT_ERR, ret); ret = -EIO; goto err; } @@ -501,7 +498,7 @@ int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int i if (cs35l41->reg_seq->probe) { ret = regmap_multi_reg_write(cs35l41->regmap, cs35l41->reg_seq->probe, - cs35l41->reg_seq->num_probe); + cs35l41->reg_seq->num_probe); if (ret) { dev_err(cs35l41->dev, "Fail to apply probe reg patch: %d\n", ret); goto err; diff --git a/sound/pci/hda/cs35l41_hda.h b/sound/pci/hda/cs35l41_hda.h index 76c69a8a22f6..640afc98b686 100644 --- a/sound/pci/hda/cs35l41_hda.h +++ b/sound/pci/hda/cs35l41_hda.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * cs35l41_hda.h -- CS35L41 ALSA HDA audio driver + * CS35L41 ALSA HDA audio driver * * Copyright 2021 Cirrus Logic, Inc. * diff --git a/sound/pci/hda/cs35l41_hda_i2c.c b/sound/pci/hda/cs35l41_hda_i2c.c index eeb387853ee3..c2397dc53e78 100644 --- a/sound/pci/hda/cs35l41_hda_i2c.c +++ b/sound/pci/hda/cs35l41_hda_i2c.c @@ -58,7 +58,6 @@ static struct i2c_driver cs35l41_i2c_driver = { .probe = cs35l41_hda_i2c_probe, .remove = cs35l41_hda_i2c_remove, }; - module_i2c_driver(cs35l41_i2c_driver); MODULE_DESCRIPTION("HDA CS35L41 driver"); diff --git a/sound/pci/hda/cs35l41_hda_spi.c b/sound/pci/hda/cs35l41_hda_spi.c index 15345a72b9d1..36815ab4e461 100644 --- a/sound/pci/hda/cs35l41_hda_spi.c +++ b/sound/pci/hda/cs35l41_hda_spi.c @@ -55,7 +55,6 @@ static struct spi_driver cs35l41_spi_driver = { .probe = cs35l41_hda_spi_probe, .remove = cs35l41_hda_spi_remove, }; - module_spi_driver(cs35l41_spi_driver); MODULE_DESCRIPTION("HDA CS35L41 driver"); -- cgit v1.2.3-58-ga151 From 85c25662d18903874fad585d17fc398a7ba37ab0 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Mon, 17 Jan 2022 23:00:55 +0100 Subject: ALSA: hda: cs35l41: Make cs35l41_hda_remove() return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Up to now cs35l41_hda_remove() returns zero unconditionally. Make it return void instead which makes it easier to see in the callers that there is no error to handle. Also the return value of i2c and spi remove callbacks is ignored anyway. Signed-off-by: Uwe Kleine-König Reviewed-by: Lucas Tanure Link: https://lore.kernel.org/r/20220117220055.120955-1-u.kleine-koenig@pengutronix.de Signed-off-by: Takashi Iwai --- sound/pci/hda/cs35l41_hda.c | 4 +--- sound/pci/hda/cs35l41_hda.h | 2 +- sound/pci/hda/cs35l41_hda_i2c.c | 4 +++- sound/pci/hda/cs35l41_hda_spi.c | 4 +++- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c index 3f9ddfb4eaf3..718595380868 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -525,7 +525,7 @@ err: } EXPORT_SYMBOL_NS_GPL(cs35l41_hda_probe, SND_HDA_SCODEC_CS35L41); -int cs35l41_hda_remove(struct device *dev) +void cs35l41_hda_remove(struct device *dev) { struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev); @@ -534,8 +534,6 @@ int cs35l41_hda_remove(struct device *dev) if (!cs35l41->vspk_always_on) gpiod_set_value_cansleep(cs35l41->reset_gpio, 0); gpiod_put(cs35l41->reset_gpio); - - return 0; } EXPORT_SYMBOL_NS_GPL(cs35l41_hda_remove, SND_HDA_SCODEC_CS35L41); diff --git a/sound/pci/hda/cs35l41_hda.h b/sound/pci/hda/cs35l41_hda.h index 640afc98b686..74951001501c 100644 --- a/sound/pci/hda/cs35l41_hda.h +++ b/sound/pci/hda/cs35l41_hda.h @@ -64,6 +64,6 @@ struct cs35l41_hda { int cs35l41_hda_probe(struct device *dev, const char *device_name, int id, int irq, struct regmap *regmap); -int cs35l41_hda_remove(struct device *dev); +void cs35l41_hda_remove(struct device *dev); #endif /*__CS35L41_HDA_H__*/ diff --git a/sound/pci/hda/cs35l41_hda_i2c.c b/sound/pci/hda/cs35l41_hda_i2c.c index c2397dc53e78..e810b278fb91 100644 --- a/sound/pci/hda/cs35l41_hda_i2c.c +++ b/sound/pci/hda/cs35l41_hda_i2c.c @@ -32,7 +32,9 @@ static int cs35l41_hda_i2c_probe(struct i2c_client *clt, const struct i2c_device static int cs35l41_hda_i2c_remove(struct i2c_client *clt) { - return cs35l41_hda_remove(&clt->dev); + cs35l41_hda_remove(&clt->dev); + + return 0; } static const struct i2c_device_id cs35l41_hda_i2c_id[] = { diff --git a/sound/pci/hda/cs35l41_hda_spi.c b/sound/pci/hda/cs35l41_hda_spi.c index 36815ab4e461..9f8123893cc8 100644 --- a/sound/pci/hda/cs35l41_hda_spi.c +++ b/sound/pci/hda/cs35l41_hda_spi.c @@ -30,7 +30,9 @@ static int cs35l41_hda_spi_probe(struct spi_device *spi) static int cs35l41_hda_spi_remove(struct spi_device *spi) { - return cs35l41_hda_remove(&spi->dev); + cs35l41_hda_remove(&spi->dev); + + return 0; } static const struct spi_device_id cs35l41_hda_spi_id[] = { -- cgit v1.2.3-58-ga151 From fd9f4e62a39f09a7c014d7415c2b9d1390aa0504 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jan 2022 08:04:44 +0100 Subject: block: assign bi_bdev for cloned bios in blk_rq_prep_clone bio_clone_fast() sets the cloned bio to have the same ->bi_bdev as the source bio. This means that when request-based dm called setup_clone(), the cloned bio had its ->bi_bdev pointing to the dm device. After Commit 0b6e522cdc4a ("blk-mq: use ->bi_bdev for I/O accounting") __blk_account_io_start() started using the request's ->bio->bi_bdev for I/O accounting, if it was set. This caused IO going to the underlying devices to use the dm device for their I/O accounting. Set up the proper ->bi_bdev in blk_rq_prep_clone based on the whole device bdev for the queue the request is cloned onto. Fixes: 0b6e522cdc4a ("blk-mq: use ->bi_bdev for I/O accounting") Reported-by: Benjamin Marzinski Signed-off-by: Christoph Hellwig [hch: the commit message is mostly from a different patch from Benjamin] Reviewed-by: Ming Lei Reviewed-by: Benjamin Marzinski Link: https://lore.kernel.org/r/20220118070444.1241739-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index a6d4780580fc..b5e35e63adad 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2976,6 +2976,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; + bio->bi_bdev = rq->q->disk->part0; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; -- cgit v1.2.3-58-ga151 From a8e422af696133003903e440b87f10a8248051b8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 18 Jan 2022 10:18:36 -0800 Subject: xfs: remove unused xfs_ioctl32.h declarations Remove these unused ia32 compat declarations; all the bits involved have either been withdrawn or hoisted to the VFS. Signed-off-by: Darrick J. Wong Reviewed-by: Eric Sandeen --- fs/xfs/xfs_ioctl32.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index fc5a91f3a5e0..c14852362fce 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -142,24 +142,6 @@ typedef struct compat_xfs_fsop_attrmulti_handlereq { _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) #ifdef BROKEN_X86_ALIGNMENT -/* on ia32 l_start is on a 32-bit boundary */ -typedef struct compat_xfs_flock64 { - __s16 l_type; - __s16 l_whence; - __s64 l_start __attribute__((packed)); - /* len == 0 means until end of file */ - __s64 l_len __attribute__((packed)); - __s32 l_sysid; - __u32 l_pid; - __s32 l_pad[4]; /* reserve area */ -} compat_xfs_flock64_t; - -#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) -#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) -#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) -#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) -#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) - typedef struct compat_xfs_fsop_geom_v1 { __u32 blocksize; /* filesystem (data) block size */ __u32 rtextsize; /* realtime extent size */ -- cgit v1.2.3-58-ga151 From 9c494ca4d3a535f9ca11ad6af1813983c1c6cbdd Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 13 Jan 2022 16:28:39 -0800 Subject: x86/gpu: Reserve stolen memory for first integrated Intel GPU "Stolen memory" is memory set aside for use by an Intel integrated GPU. The intel_graphics_quirks() early quirk reserves this memory when it is called for a GPU that appears in the intel_early_ids[] table of integrated GPUs. Previously intel_graphics_quirks() was marked as QFLAG_APPLY_ONCE, so it was called only for the first Intel GPU found. If a discrete GPU happened to be enumerated first, intel_graphics_quirks() was called for it but not for any integrated GPU found later. Therefore, stolen memory for such an integrated GPU was never reserved. For example, this problem occurs in this Alderlake-P (integrated) + DG2 (discrete) topology where the DG2 is found first, but stolen memory is associated with the integrated GPU: - 00:01.0 Bridge `- 03:00.0 DG2 discrete GPU - 00:02.0 Integrated GPU (with stolen memory) Remove the QFLAG_APPLY_ONCE flag and call intel_graphics_quirks() for every Intel GPU. Reserve stolen memory for the first GPU that appears in intel_early_ids[]. [bhelgaas: commit log, add code comment, squash in https://lore.kernel.org/r/20220118190558.2ququ4vdfjuahicm@ldmartin-desk2] Link: https://lore.kernel.org/r/20220114002843.2083382-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org --- arch/x86/kernel/early-quirks.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 391a4e2b8604..8690fab95ae4 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -515,6 +515,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = { .stolen_size = gen9_stolen_size, }; +/* Intel integrated GPUs for which we need to reserve "stolen memory" */ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_I830_IDS(&i830_early_ops), INTEL_I845G_IDS(&i845_early_ops), @@ -591,6 +592,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func) u16 device; int i; + /* + * Reserve "stolen memory" for an integrated GPU. If we've already + * found one, there's nothing to do for other (discrete) GPUs. + */ + if (resource_size(&intel_graphics_stolen_res)) + return; + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { @@ -703,7 +711,7 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, - QFLAG_APPLY_ONCE, intel_graphics_quirks }, + 0, intel_graphics_quirks }, /* * HPET on the current version of the Baytrail platform has accuracy * problems: it will halt in deep idle state - so we disable it. -- cgit v1.2.3-58-ga151 From e4e2787bef7e643511cf0f352deb6bd16e7fa9b4 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 18 Jan 2022 16:36:27 -0600 Subject: smb3: add new defines from protocol specification In the October updates to MS-SMB2 two additional FSCTLs were described. Add the missing defines for these, as well as fix a typo in an earlier define. Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/smbfs_common/smb2pdu.h | 2 +- fs/smbfs_common/smbfsctl.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h index 7ccadcbe684b..38b8fc514860 100644 --- a/fs/smbfs_common/smb2pdu.h +++ b/fs/smbfs_common/smb2pdu.h @@ -449,7 +449,7 @@ struct smb2_netname_neg_context { */ /* Flags */ -#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 +#define SMB2_ACCEPT_TRANSPORT_LEVEL_SECURITY 0x00000001 struct smb2_transport_capabilities_context { __le16 ContextType; /* 6 */ diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h index 926f87cd6af0..d51939c43ad7 100644 --- a/fs/smbfs_common/smbfsctl.h +++ b/fs/smbfs_common/smbfsctl.h @@ -95,8 +95,10 @@ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ #define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C #define FSCTL_GET_REFS_VOLUME_DATA 0x000902D8 /* See MS-FSCC 2.3.24 */ +#define FSCTL_SET_INTEGRITY_INFORMATION_EXT 0x00090380 #define FSCTL_GET_RETRIEVAL_POINTERS_AND_REFCOUNT 0x000903d3 #define FSCTL_GET_RETRIEVAL_POINTER_COUNT 0x0009042b +#define FSCTL_REFS_STREAM_SNAPSHOT_MANAGEMENT 0x00090440 #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ -- cgit v1.2.3-58-ga151 From 2fd5dcb1c8ef96c9f0fa8bda53ca480524b80ae7 Mon Sep 17 00:00:00 2001 From: Hyunchul Lee Date: Thu, 13 Jan 2022 09:51:39 +0900 Subject: ksmbd: smbd: fix missing client's memory region invalidation if the Channel of a SMB2 WRITE request is SMB2_CHANNEL_RDMA_V1_INVALIDTE, a client does not invalidate its memory regions but ksmbd must do it by sending a SMB2 WRITE response with IB_WR_SEND_WITH_INV. But if errors occur while processing a SMB2 READ/WRITE request, ksmbd sends a response with IB_WR_SEND. So a client could use memory regions already in use. Acked-by: Namjae Jeon Signed-off-by: Hyunchul Lee Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 73 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 867ed982f729..15f331dbe17a 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -6124,25 +6124,33 @@ out: return err; } -static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, - struct smb2_read_req *req, void *data_buf, - size_t length) +static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, + struct smb2_buffer_desc_v1 *desc, + __le32 Channel, + __le16 ChannelInfoOffset, + __le16 ChannelInfoLength) { - struct smb2_buffer_desc_v1 *desc = - (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - int err; - if (work->conn->dialect == SMB30_PROT_ID && - req->Channel != SMB2_CHANNEL_RDMA_V1) + Channel != SMB2_CHANNEL_RDMA_V1) return -EINVAL; - if (req->ReadChannelInfoOffset == 0 || - le16_to_cpu(req->ReadChannelInfoLength) < sizeof(*desc)) + if (ChannelInfoOffset == 0 || + le16_to_cpu(ChannelInfoLength) < sizeof(*desc)) return -EINVAL; work->need_invalidate_rkey = - (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); + (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); work->remote_key = le32_to_cpu(desc->token); + return 0; +} + +static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, + struct smb2_read_req *req, void *data_buf, + size_t length) +{ + struct smb2_buffer_desc_v1 *desc = + (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; + int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, le32_to_cpu(desc->token), @@ -6165,7 +6173,7 @@ int smb2_read(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; struct smb2_read_req *req; struct smb2_read_rsp *rsp; - struct ksmbd_file *fp; + struct ksmbd_file *fp = NULL; loff_t offset; size_t length, mincount; ssize_t nbytes = 0, remain_bytes = 0; @@ -6179,6 +6187,18 @@ int smb2_read(struct ksmbd_work *work) return smb2_read_pipe(work); } + if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || + req->Channel == SMB2_CHANNEL_RDMA_V1) { + err = smb2_set_remote_key_for_rdma(work, + (struct smb2_buffer_desc_v1 *) + &req->Buffer[0], + req->Channel, + req->ReadChannelInfoOffset, + req->ReadChannelInfoLength); + if (err) + goto out; + } + fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), le64_to_cpu(req->PersistentFileId)); if (!fp) { @@ -6364,21 +6384,6 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - if (work->conn->dialect == SMB30_PROT_ID && - req->Channel != SMB2_CHANNEL_RDMA_V1) - return -EINVAL; - - if (req->Length != 0 || req->DataOffset != 0) - return -EINVAL; - - if (req->WriteChannelInfoOffset == 0 || - le16_to_cpu(req->WriteChannelInfoLength) < sizeof(*desc)) - return -EINVAL; - - work->need_invalidate_rkey = - (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); - work->remote_key = le32_to_cpu(desc->token); - data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!data_buf) return -ENOMEM; @@ -6425,6 +6430,20 @@ int smb2_write(struct ksmbd_work *work) return smb2_write_pipe(work); } + if (req->Channel == SMB2_CHANNEL_RDMA_V1 || + req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + if (req->Length != 0 || req->DataOffset != 0) + return -EINVAL; + err = smb2_set_remote_key_for_rdma(work, + (struct smb2_buffer_desc_v1 *) + &req->Buffer[0], + req->Channel, + req->WriteChannelInfoOffset, + req->WriteChannelInfoLength); + if (err) + goto out; + } + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, "User does not have write permission\n"); err = -EACCES; -- cgit v1.2.3-58-ga151 From b207602fb04537cb21ac38fabd7577eca2fa05ae Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 15 Jan 2022 14:49:00 +0300 Subject: ksmbd: uninitialized variable in create_socket() The "ksmbd_socket" variable is not initialized on this error path. Cc: stable@vger.kernel.org Fixes: 0626e6641f6b ("cifsd: add server handler for central processing and tranport layers") Signed-off-by: Dan Carpenter Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/transport_tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index c14320e03b69..82a1429bbe12 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -404,7 +404,7 @@ static int create_socket(struct interface *iface) &ksmbd_socket); if (ret) { pr_err("Can't create socket for ipv4: %d\n", ret); - goto out_error; + goto out_clear; } sin.sin_family = PF_INET; @@ -462,6 +462,7 @@ static int create_socket(struct interface *iface) out_error: tcp_destroy_socket(ksmbd_socket); +out_clear: iface->ksmbd_socket = NULL; return ret; } -- cgit v1.2.3-58-ga151 From ac090d9c90b087d6fb714e54b2a6dd1e6c373ed6 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 17 Jan 2022 22:16:01 +0900 Subject: ksmbd: fix guest connection failure with nautilus MS-SMB2 describe session sign like the following. Session.SigningRequired MUST be set to TRUE under the following conditions: - If the SMB2_NEGOTIATE_SIGNING_REQUIRED bit is set in the SecurityMode field of the client request. - If the SMB2_SESSION_FLAG_IS_GUEST bit is not set in the SessionFlags field and Session.IsAnonymous is FALSE and either Connection.ShouldSign or global RequireMessageSigning is TRUE. When trying guest account connection using nautilus, The login failure happened on session setup. ksmbd does not allow this connection when the user is a guest and the connection sign is set. Just do not set session sign instead of error response as described in the specification. And this change improves the guest connection in Nautilus. Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 62 +++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 15f331dbe17a..1866c81c5c99 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -1464,11 +1464,6 @@ static int ntlm_authenticate(struct ksmbd_work *work) } if (user_guest(sess->user)) { - if (conn->sign) { - ksmbd_debug(SMB, "Guest login not allowed when signing enabled\n"); - return -EPERM; - } - rsp->SessionFlags = SMB2_SESSION_FLAG_IS_GUEST_LE; } else { struct authenticate_message *authblob; @@ -1481,38 +1476,39 @@ static int ntlm_authenticate(struct ksmbd_work *work) ksmbd_debug(SMB, "authentication failed\n"); return -EPERM; } + } - /* - * If session state is SMB2_SESSION_VALID, We can assume - * that it is reauthentication. And the user/password - * has been verified, so return it here. - */ - if (sess->state == SMB2_SESSION_VALID) { - if (conn->binding) - goto binding_session; - return 0; - } + /* + * If session state is SMB2_SESSION_VALID, We can assume + * that it is reauthentication. And the user/password + * has been verified, so return it here. + */ + if (sess->state == SMB2_SESSION_VALID) { + if (conn->binding) + goto binding_session; + return 0; + } - if ((conn->sign || server_conf.enforced_signing) || - (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) - sess->sign = true; + if ((rsp->SessionFlags != SMB2_SESSION_FLAG_IS_GUEST_LE && + (conn->sign || server_conf.enforced_signing)) || + (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) + sess->sign = true; - if (smb3_encryption_negotiated(conn) && - !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { - rc = conn->ops->generate_encryptionkey(sess); - if (rc) { - ksmbd_debug(SMB, - "SMB3 encryption key generation failed\n"); - return -EINVAL; - } - sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; - /* - * signing is disable if encryption is enable - * on this session - */ - sess->sign = false; + if (smb3_encryption_negotiated(conn) && + !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { + rc = conn->ops->generate_encryptionkey(sess); + if (rc) { + ksmbd_debug(SMB, + "SMB3 encryption key generation failed\n"); + return -EINVAL; } + sess->enc = true; + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + /* + * signing is disable if encryption is enable + * on this session + */ + sess->sign = false; } binding_session: -- cgit v1.2.3-58-ga151 From 520d9cd267618181901272a79db6154c0b83309c Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Fri, 14 Jan 2022 13:49:13 +0800 Subject: drm/amdgpu: apply vcn harvest quirk This is a following patch to apply the workaround only on those boards with a bad harvest table in ip discovery. Signed-off-by: Guchun Chen Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 32 ++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index be45650250fa..81bfee978b74 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -243,6 +243,30 @@ static inline bool amdgpu_discovery_verify_binary_signature(uint8_t *binary) return (le32_to_cpu(bhdr->binary_signature) == BINARY_SIGNATURE); } +static void amdgpu_discovery_harvest_config_quirk(struct amdgpu_device *adev) +{ + /* + * So far, apply this quirk only on those Navy Flounder boards which + * have a bad harvest table of VCN config. + */ + if ((adev->ip_versions[UVD_HWIP][1] == IP_VERSION(3, 0, 1)) && + (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 3, 2))) { + switch (adev->pdev->revision) { + case 0xC1: + case 0xC2: + case 0xC3: + case 0xC5: + case 0xC7: + case 0xCF: + case 0xDF: + adev->vcn.harvest_config |= AMDGPU_VCN_HARVEST_VCN1; + break; + default: + break; + } + } +} + static int amdgpu_discovery_init(struct amdgpu_device *adev) { struct table_info *info; @@ -548,11 +572,9 @@ void amdgpu_discovery_harvest_ip(struct amdgpu_device *adev) break; } } - /* some IP discovery tables on Navy Flounder don't have this set correctly */ - if ((adev->ip_versions[UVD_HWIP][1] == IP_VERSION(3, 0, 1)) && - (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 3, 2)) && - (adev->pdev->revision != 0xFF)) - adev->vcn.harvest_config |= AMDGPU_VCN_HARVEST_VCN1; + + amdgpu_discovery_harvest_config_quirk(adev); + if (vcn_harvest_count == adev->vcn.num_vcn_inst) { adev->harvest_ip_mask |= AMD_HARVEST_IP_VCN_MASK; adev->harvest_ip_mask |= AMD_HARVEST_IP_JPEG_MASK; -- cgit v1.2.3-58-ga151 From 9a458402fb69bda886aa6cbe067311b6e3d9c52a Mon Sep 17 00:00:00 2001 From: Jingwen Chen Date: Thu, 13 Jan 2022 19:06:59 +0800 Subject: drm/amd/amdgpu: fixing read wrong pf2vf data in SRIOV [Why] This fixes 892deb48269c ("drm/amdgpu: Separate vf2pf work item init from virt data exchange"). we should read pf2vf data based at mman.fw_vram_usage_va after gmc sw_init. commit 892deb48269c breaks this logic. [How] calling amdgpu_virt_exchange_data in amdgpu_virt_init_data_exchange to set the right base in the right sequence. v2: call amdgpu_virt_init_data_exchange after gmc sw_init to make data exchange workqueue run v3: clean up the code logic v4: add some comment and make the code more readable Fixes: 892deb48269c ("drm/amdgpu: Separate vf2pf work item init from virt data exchange") Signed-off-by: Jingwen Chen Reviewed-by: Horace Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c4f3c886be55..ed077de426d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2354,7 +2354,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) } if (amdgpu_sriov_vf(adev)) - amdgpu_virt_exchange_data(adev); + amdgpu_virt_init_data_exchange(adev); r = amdgpu_ib_pool_init(adev); if (r) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 894444ab0032..07bc0f504713 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -625,20 +625,20 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) adev->virt.fw_reserve.p_vf2pf = NULL; adev->virt.vf2pf_update_interval_ms = 0; - if (adev->bios != NULL) { - adev->virt.vf2pf_update_interval_ms = 2000; + if (adev->mman.fw_vram_usage_va != NULL) { + /* go through this logic in ip_init and reset to init workqueue*/ + amdgpu_virt_exchange_data(adev); + INIT_DELAYED_WORK(&adev->virt.vf2pf_work, amdgpu_virt_update_vf2pf_work_item); + schedule_delayed_work(&(adev->virt.vf2pf_work), msecs_to_jiffies(adev->virt.vf2pf_update_interval_ms)); + } else if (adev->bios != NULL) { + /* got through this logic in early init stage to get necessary flags, e.g. rlcg_acc related*/ adev->virt.fw_reserve.p_pf2vf = (struct amd_sriov_msg_pf2vf_info_header *) (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10)); amdgpu_virt_read_pf2vf_data(adev); } - - if (adev->virt.vf2pf_update_interval_ms != 0) { - INIT_DELAYED_WORK(&adev->virt.vf2pf_work, amdgpu_virt_update_vf2pf_work_item); - schedule_delayed_work(&(adev->virt.vf2pf_work), msecs_to_jiffies(adev->virt.vf2pf_update_interval_ms)); - } } @@ -674,12 +674,6 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev) if (adev->virt.ras_init_done) amdgpu_virt_add_bad_page(adev, bp_block_offset, bp_block_size); } - } else if (adev->bios != NULL) { - adev->virt.fw_reserve.p_pf2vf = - (struct amd_sriov_msg_pf2vf_info_header *) - (adev->bios + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10)); - - amdgpu_virt_read_pf2vf_data(adev); } } -- cgit v1.2.3-58-ga151 From 4722f463896cc0ef1a6f1c3cb2e171e949831249 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 17 Jan 2022 10:31:26 +0100 Subject: drm/radeon: fix error handling in radeon_driver_open_kms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The return value was never initialized so the cleanup code executed when it isn't even necessary. Just add proper error handling. Fixes: ab50cb9df889 ("drm/radeon/radeon_kms: Fix a NULL pointer dereference in radeon_driver_open_kms()") Signed-off-by: Christian König Tested-by: Jan Stancek Tested-by: Borislav Petkov Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/radeon_kms.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index e2488559cc9f..11ad210919c8 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -666,18 +666,18 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL); if (unlikely(!fpriv)) { r = -ENOMEM; - goto out_suspend; + goto err_suspend; } if (rdev->accel_working) { vm = &fpriv->vm; r = radeon_vm_init(rdev, vm); if (r) - goto out_fpriv; + goto err_fpriv; r = radeon_bo_reserve(rdev->ring_tmp_bo.bo, false); if (r) - goto out_vm_fini; + goto err_vm_fini; /* map the ib pool buffer read only into * virtual address space */ @@ -685,7 +685,7 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) rdev->ring_tmp_bo.bo); if (!vm->ib_bo_va) { r = -ENOMEM; - goto out_vm_fini; + goto err_vm_fini; } r = radeon_vm_bo_set_addr(rdev, vm->ib_bo_va, @@ -693,19 +693,21 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) RADEON_VM_PAGE_READABLE | RADEON_VM_PAGE_SNOOPED); if (r) - goto out_vm_fini; + goto err_vm_fini; } file_priv->driver_priv = fpriv; } - if (!r) - goto out_suspend; + pm_runtime_mark_last_busy(dev->dev); + pm_runtime_put_autosuspend(dev->dev); + return 0; -out_vm_fini: +err_vm_fini: radeon_vm_fini(rdev, vm); -out_fpriv: +err_fpriv: kfree(fpriv); -out_suspend: + +err_suspend: pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); return r; -- cgit v1.2.3-58-ga151 From ea6e7ceedaf11e1bad3ff21e8624694d696d276b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Jan 2022 19:10:11 -0700 Subject: io-wq: remove useless 'work' argument to __io_worker_busy() We don't use 'work' anymore in the busy logic, remove the dead argument. Signed-off-by: Jens Axboe --- fs/io-wq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 5c4f582d6549..f8a5f172b9eb 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -405,8 +405,7 @@ static void io_wqe_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, - struct io_wq_work *work) +static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) __must_hold(wqe->lock) { if (worker->flags & IO_WORKER_F_FREE) { @@ -556,7 +555,7 @@ get_next: */ work = io_get_next_work(acct, worker); if (work) - __io_worker_busy(wqe, worker, work); + __io_worker_busy(wqe, worker); raw_spin_unlock(&wqe->lock); if (!work) -- cgit v1.2.3-58-ga151 From 081b58204629eff9dd93e7f68ed15c8aa6452a4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Jan 2022 19:13:43 -0700 Subject: io-wq: make io_worker lock a raw spinlock In preparation to nesting it under the wqe lock (which is raw due to being acquired from the scheduler side), change the io_worker lock from a normal spinlock to a raw spinlock. Signed-off-by: Jens Axboe --- fs/io-wq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index f8a5f172b9eb..c369910de793 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -48,7 +48,7 @@ struct io_worker { struct io_wqe *wqe; struct io_wq_work *cur_work; - spinlock_t lock; + raw_spinlock_t lock; struct completion ref_done; @@ -528,9 +528,9 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } - spin_lock(&worker->lock); + raw_spin_lock(&worker->lock); worker->cur_work = work; - spin_unlock(&worker->lock); + raw_spin_unlock(&worker->lock); } static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); @@ -814,7 +814,7 @@ fail: refcount_set(&worker->ref, 1); worker->wqe = wqe; - spin_lock_init(&worker->lock); + raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) @@ -980,13 +980,13 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) * Hold the lock to avoid ->cur_work going out of scope, caller * may dereference the passed in work. */ - spin_lock(&worker->lock); + raw_spin_lock(&worker->lock); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { set_notify_signal(worker->task); match->nr_running++; } - spin_unlock(&worker->lock); + raw_spin_unlock(&worker->lock); return match->nr_running && !match->cancel_all; } -- cgit v1.2.3-58-ga151 From 36e4c58bf044b07204c8c7e6dd7c2384e439921a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Jan 2022 19:18:20 -0700 Subject: io-wq: invoke work cancelation with wqe->lock held io_wqe_cancel_pending_work() grabs it internally, grab it upfront instead. For the running work cancelation, grab the lock around it as well. Signed-off-by: Jens Axboe --- fs/io-wq.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index c369910de793..a92fbdc8bea3 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1038,17 +1038,16 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, { int i; retry: - raw_spin_lock(&wqe->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); if (io_acct_cancel_pending_work(wqe, acct, match)) { + raw_spin_lock(&wqe->lock); if (match->cancel_all) goto retry; - return; + break; } } - raw_spin_unlock(&wqe->lock); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1077,7 +1076,9 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; + raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); + raw_spin_unlock(&wqe->lock); if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; } @@ -1091,7 +1092,9 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; + raw_spin_lock(&wqe->lock); io_wqe_cancel_running_work(wqe, &match); + raw_spin_unlock(&wqe->lock); if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; } @@ -1262,7 +1265,9 @@ static void io_wq_destroy(struct io_wq *wq) .fn = io_wq_work_match_all, .cancel_all = true, }; + raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); + raw_spin_unlock(&wqe->lock); free_cpumask_var(wqe->cpu_mask); kfree(wqe); } -- cgit v1.2.3-58-ga151 From efdf518459b17e18a93c7c9cb622fd3051dabd0c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Jan 2022 19:22:32 -0700 Subject: io-wq: perform both unstarted and started work cancelations in one go Rather than split these into two separate lookups and matches, combine them into one loop. This will become important when we can guarantee that we don't have a window where a pending work item isn't discoverable in either state. Signed-off-by: Jens Axboe --- fs/io-wq.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index a92fbdc8bea3..db150186ce94 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1072,27 +1072,25 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * First check pending list, if we're lucky we can just remove it * from there. CANCEL_OK means that the work is returned as-new, * no completion will be posted for it. - */ - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - raw_spin_lock(&wqe->lock); - io_wqe_cancel_pending_work(wqe, &match); - raw_spin_unlock(&wqe->lock); - if (match.nr_pending && !match.cancel_all) - return IO_WQ_CANCEL_OK; - } - - /* - * Now check if a free (going busy) or busy worker has the work + * + * Then check if a free (going busy) or busy worker has the work * currently running. If we find it there, we'll return CANCEL_RUNNING * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. + * + * Do both of these while holding the wqe->lock, to ensure that + * we'll find a work item regardless of state. */ for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; raw_spin_lock(&wqe->lock); + io_wqe_cancel_pending_work(wqe, &match); + if (match.nr_pending && !match.cancel_all) { + raw_spin_unlock(&wqe->lock); + return IO_WQ_CANCEL_OK; + } + io_wqe_cancel_running_work(wqe, &match); raw_spin_unlock(&wqe->lock); if (match.nr_running && !match.cancel_all) -- cgit v1.2.3-58-ga151 From 361aee450c6e36c8dbab712c94a8a7835bd92e25 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Jan 2022 19:23:51 -0700 Subject: io-wq: add intermediate work step between pending list and active work We have a gap where a worker removes an item from the work list and to when it gets added as the workers active work. In this state, the work item cannot be found by cancelations. This is a small window, but it does exist. Add a temporary pointer to a work item that isn't on the pending work list anymore, but also not the active work. This is needed as we need to drop the wqe lock in between grabbing the work item and marking it as active, to ensure that signal based cancelations are properly ordered. Reported-by: Florian Fischer Link: https://lore.kernel.org/io-uring/20220118151337.fac6cthvbnu7icoc@pasture/ Signed-off-by: Jens Axboe --- fs/io-wq.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index db150186ce94..1efb134c98b7 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -48,6 +48,7 @@ struct io_worker { struct io_wqe *wqe; struct io_wq_work *cur_work; + struct io_wq_work *next_work; raw_spinlock_t lock; struct completion ref_done; @@ -530,6 +531,7 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_lock(&worker->lock); worker->cur_work = work; + worker->next_work = NULL; raw_spin_unlock(&worker->lock); } @@ -554,9 +556,20 @@ get_next: * clear the stalled flag. */ work = io_get_next_work(acct, worker); - if (work) + if (work) { __io_worker_busy(wqe, worker); + /* + * Make sure cancelation can find this, even before + * it becomes the active work. That avoids a window + * where the work has been removed from our general + * work list, but isn't yet discoverable as the + * current work item for this worker. + */ + raw_spin_lock(&worker->lock); + worker->next_work = work; + raw_spin_unlock(&worker->lock); + } raw_spin_unlock(&wqe->lock); if (!work) break; @@ -972,6 +985,19 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } +static bool __io_wq_worker_cancel(struct io_worker *worker, + struct io_cb_cancel_data *match, + struct io_wq_work *work) +{ + if (work && match->fn(work, match->data)) { + work->flags |= IO_WQ_WORK_CANCEL; + set_notify_signal(worker->task); + return true; + } + + return false; +} + static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; @@ -981,11 +1007,9 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) * may dereference the passed in work. */ raw_spin_lock(&worker->lock); - if (worker->cur_work && - match->fn(worker->cur_work, match->data)) { - set_notify_signal(worker->task); + if (__io_wq_worker_cancel(worker, match, worker->cur_work) || + __io_wq_worker_cancel(worker, match, worker->next_work)) match->nr_running++; - } raw_spin_unlock(&worker->lock); return match->nr_running && !match->cancel_all; -- cgit v1.2.3-58-ga151 From ccbf726171b7328f800bc98005132fd77eb1a175 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 18 Jan 2022 19:11:11 -0700 Subject: io_uring: perform poll removal even if async work removal is successful An active work can have poll armed, hence it's not enough to just do the async work removal and return the value if it's different from "not found". Rather than make poll removal special, just fall through to do the remaining type lookups and removals. Reported-by: Florian Fischer Link: https://lore.kernel.org/io-uring/20220118151337.fac6cthvbnu7icoc@pasture/ Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 422d6de48688..e54c4127422e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6386,16 +6386,21 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - if (ret != -ENOENT) - return ret; + /* + * Fall-through even for -EALREADY, as we may have poll armed + * that need unarming. + */ + if (!ret) + return 0; spin_lock(&ctx->completion_lock); + ret = io_poll_cancel(ctx, sqe_addr, false); + if (ret != -ENOENT) + goto out; + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); spin_unlock_irq(&ctx->timeout_lock); - if (ret != -ENOENT) - goto out; - ret = io_poll_cancel(ctx, sqe_addr, false); out: spin_unlock(&ctx->completion_lock); return ret; -- cgit v1.2.3-58-ga151 From 8b59b0a53c840921b625378f137e88adfa87647e Mon Sep 17 00:00:00 2001 From: sparkhuang Date: Wed, 15 Dec 2021 10:08:23 +0100 Subject: ARM: 9170/1: fix panic when kasan and kprobe are enabled arm32 uses software to simulate the instruction replaced by kprobe. some instructions may be simulated by constructing assembly functions. therefore, before executing instruction simulation, it is necessary to construct assembly function execution environment in C language through binding registers. after kasan is enabled, the register binding relationship will be destroyed, resulting in instruction simulation errors and causing kernel panic. the kprobe emulate instruction function is distributed in three files: actions-common.c actions-arm.c actions-thumb.c, so disable KASAN when compiling these files. for example, use kprobe insert on cap_capable+20 after kasan enabled, the cap_capable assembly code is as follows: : e92d47f0 push {r4, r5, r6, r7, r8, r9, sl, lr} e1a05000 mov r5, r0 e280006c add r0, r0, #108 ; 0x6c e1a04001 mov r4, r1 e1a06002 mov r6, r2 e59fa090 ldr sl, [pc, #144] ; ebfc7bf8 bl c03aa4b4 <__asan_load4> e595706c ldr r7, [r5, #108] ; 0x6c e2859014 add r9, r5, #20 ...... The emulate_ldr assembly code after enabling kasan is as follows: c06f1384 : e92d47f0 push {r4, r5, r6, r7, r8, r9, sl, lr} e282803c add r8, r2, #60 ; 0x3c e1a05000 mov r5, r0 e7e37855 ubfx r7, r5, #16, #4 e1a00008 mov r0, r8 e1a09001 mov r9, r1 e1a04002 mov r4, r2 ebf35462 bl c03c6530 <__asan_load4> e357000f cmp r7, #15 e7e36655 ubfx r6, r5, #12, #4 e205a00f and sl, r5, #15 0a000001 beq c06f13bc e0840107 add r0, r4, r7, lsl #2 ebf3545c bl c03c6530 <__asan_load4> e084010a add r0, r4, sl, lsl #2 ebf3545a bl c03c6530 <__asan_load4> e2890010 add r0, r9, #16 ebf35458 bl c03c6530 <__asan_load4> e5990010 ldr r0, [r9, #16] e12fff30 blx r0 e356000f cm r6, #15 1a000014 bne c06f1430 e1a06000 mov r6, r0 e2840040 add r0, r4, #64 ; 0x40 ...... when running in emulate_ldr to simulate the ldr instruction, panic occurred, and the log is as follows: Unable to handle kernel NULL pointer dereference at virtual address 00000090 pgd = ecb46400 [00000090] *pgd=2e0fa003, *pmd=00000000 Internal error: Oops: 206 [#1] SMP ARM PC is at cap_capable+0x14/0xb0 LR is at emulate_ldr+0x50/0xc0 psr: 600d0293 sp : ecd63af8 ip : 00000004 fp : c0a7c30c r10: 00000000 r9 : c30897f4 r8 : ecd63cd4 r7 : 0000000f r6 : 0000000a r5 : e59fa090 r4 : ecd63c98 r3 : c06ae294 r2 : 00000000 r1 : b7611300 r0 : bf4ec008 Flags: nZCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment user Control: 32c5387d Table: 2d546400 DAC: 55555555 Process bash (pid: 1643, stack limit = 0xecd60190) (cap_capable) from (kprobe_handler+0x218/0x340) (kprobe_handler) from (kprobe_trap_handler+0x24/0x48) (kprobe_trap_handler) from (do_undefinstr+0x13c/0x364) (do_undefinstr) from (__und_svc_finish+0x0/0x30) (__und_svc_finish) from (cap_capable+0x18/0xb0) (cap_capable) from (cap_vm_enough_memory+0x38/0x48) (cap_vm_enough_memory) from (security_vm_enough_memory_mm+0x48/0x6c) (security_vm_enough_memory_mm) from (copy_process.constprop.5+0x16b4/0x25c8) (copy_process.constprop.5) from (_do_fork+0xe8/0x55c) (_do_fork) from (SyS_clone+0x1c/0x24) (SyS_clone) from (__sys_trace_return+0x0/0x10) Code: 0050a0e1 6c0080e2 0140a0e1 0260a0e1 (f801f0e7) Fixes: 35aa1df43283 ("ARM kprobes: instruction single-stepping support") Fixes: 421015713b30 ("ARM: 9017/2: Enable KASan for ARM") Signed-off-by: huangshaobo Acked-by: Ard Biesheuvel Signed-off-by: Russell King (Oracle) --- arch/arm/probes/kprobes/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/probes/kprobes/Makefile b/arch/arm/probes/kprobes/Makefile index 14db56f49f0a..6159010dac4a 100644 --- a/arch/arm/probes/kprobes/Makefile +++ b/arch/arm/probes/kprobes/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 +KASAN_SANITIZE_actions-common.o := n +KASAN_SANITIZE_actions-arm.o := n +KASAN_SANITIZE_actions-thumb.o := n obj-$(CONFIG_KPROBES) += core.o actions-common.o checkers-common.o obj-$(CONFIG_ARM_KPROBES_TEST) += test-kprobes.o test-kprobes-objs := test-core.o -- cgit v1.2.3-58-ga151 From 15420269b02a63ed8c1841905d8b8b2403246004 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 18 Jan 2022 13:45:09 +0100 Subject: ARM: 9179/1: uaccess: avoid alignment faults in copy_[from|to]_kernel_nofault The helpers that are used to implement copy_from_kernel_nofault() and copy_to_kernel_nofault() cast a void* to a pointer to a wider type, which may result in alignment faults on ARM if the compiler decides to use double-word or multiple-word load/store instructions. Only configurations that define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y are affected, given that commit 2423de2e6f4d ("ARM: 9115/1: mm/maccess: fix unaligned copy_{from,to}_kernel_nofault") ensures that dst and src are sufficiently aligned otherwise. So use the unaligned accessors for accessing dst and src in cases where they may be misaligned. Cc: # depends on 2423de2e6f4d Fixes: 2df4c9a741a0 ("ARM: 9112/1: uaccess: add __{get,put}_kernel_nofault") Reviewed-by: Arnd Bergmann Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/uaccess.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 36fbc3329252..32dbfd81f42a 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -497,7 +498,10 @@ do { \ } \ default: __err = __get_user_bad(); break; \ } \ - *(type *)(dst) = __val; \ + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) \ + put_unaligned(__val, (type *)(dst)); \ + else \ + *(type *)(dst) = __val; /* aligned by caller */ \ if (__err) \ goto err_label; \ } while (0) @@ -507,7 +511,9 @@ do { \ const type *__pk_ptr = (dst); \ unsigned long __dst = (unsigned long)__pk_ptr; \ int __err = 0; \ - type __val = *(type *)src; \ + type __val = IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ + ? get_unaligned((type *)(src)) \ + : *(type *)(src); /* aligned by caller */ \ switch (sizeof(type)) { \ case 1: __put_user_asm_byte(__val, __dst, __err, ""); break; \ case 2: __put_user_asm_half(__val, __dst, __err, ""); break; \ -- cgit v1.2.3-58-ga151 From 9f80ccda53b9417236945bc7ece4b519037df74d Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 18 Jan 2022 19:32:17 +0100 Subject: ARM: 9180/1: Thumb2: align ALT_UP() sections in modules sufficiently When building for Thumb2, the .alt.smp.init sections that are emitted by the ALT_UP() patching code may not be 32-bit aligned, even though the fixup_smp_on_up() routine expects that. This results in alignment faults at module load time, which need to be fixed up by the fault handler. So let's align those sections explicitly, and prevent this from occurring. Cc: Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/assembler.h | 2 ++ arch/arm/include/asm/processor.h | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 7d23d4bb2168..6fe67963ba5a 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -288,6 +288,7 @@ */ #define ALT_UP(instr...) \ .pushsection ".alt.smp.init", "a" ;\ + .align 2 ;\ .long 9998b - . ;\ 9997: instr ;\ .if . - 9997b == 2 ;\ @@ -299,6 +300,7 @@ .popsection #define ALT_UP_B(label) \ .pushsection ".alt.smp.init", "a" ;\ + .align 2 ;\ .long 9998b - . ;\ W(b) . + (label - 9998b) ;\ .popsection diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 6af68edfa53a..bdc35c0e8dfb 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -96,6 +96,7 @@ unsigned long __get_wchan(struct task_struct *p); #define __ALT_SMP_ASM(smp, up) \ "9998: " smp "\n" \ " .pushsection \".alt.smp.init\", \"a\"\n" \ + " .align 2\n" \ " .long 9998b - .\n" \ " " up "\n" \ " .popsection\n" -- cgit v1.2.3-58-ga151 From 4ea5763fb79ed89b3bdad455ebf3f33416a81624 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 14 Jan 2022 14:33:30 +0100 Subject: HID: uhid: Fix worker destroying device without any protection uhid has to run hid_add_device() from workqueue context while allowing parallel use of the userspace API (which is protected with ->devlock). But hid_add_device() can fail. Currently, that is handled by immediately destroying the associated HID device, without using ->devlock - but if there are concurrent requests from userspace, that's wrong and leads to NULL dereferences and/or memory corruption (via use-after-free). Fix it by leaving the HID device as-is in the worker. We can clean it up later, either in the UHID_DESTROY command handler or in the ->release() handler. Cc: stable@vger.kernel.org Fixes: 67f8ecc550b5 ("HID: uhid: fix timeout when probe races with IO") Signed-off-by: Jann Horn Signed-off-by: Jiri Kosina --- drivers/hid/uhid.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index 8fe3efcb8327..fc06d8bb42e0 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -28,11 +28,22 @@ struct uhid_device { struct mutex devlock; + + /* This flag tracks whether the HID device is usable for commands from + * userspace. The flag is already set before hid_add_device(), which + * runs in workqueue context, to allow hid_add_device() to communicate + * with userspace. + * However, if hid_add_device() fails, the flag is cleared without + * holding devlock. + * We guarantee that if @running changes from true to false while you're + * holding @devlock, it's still fine to access @hid. + */ bool running; __u8 *rd_data; uint rd_size; + /* When this is NULL, userspace may use UHID_CREATE/UHID_CREATE2. */ struct hid_device *hid; struct uhid_event input_buf; @@ -63,9 +74,18 @@ static void uhid_device_add_worker(struct work_struct *work) if (ret) { hid_err(uhid->hid, "Cannot register HID device: error %d\n", ret); - hid_destroy_device(uhid->hid); - uhid->hid = NULL; + /* We used to call hid_destroy_device() here, but that's really + * messy to get right because we have to coordinate with + * concurrent writes from userspace that might be in the middle + * of using uhid->hid. + * Just leave uhid->hid as-is for now, and clean it up when + * userspace tries to close or reinitialize the uhid instance. + * + * However, we do have to clear the ->running flag and do a + * wakeup to make sure userspace knows that the device is gone. + */ uhid->running = false; + wake_up_interruptible(&uhid->report_wait); } } @@ -474,7 +494,7 @@ static int uhid_dev_create2(struct uhid_device *uhid, void *rd_data; int ret; - if (uhid->running) + if (uhid->hid) return -EALREADY; rd_size = ev->u.create2.rd_size; @@ -556,7 +576,7 @@ static int uhid_dev_create(struct uhid_device *uhid, static int uhid_dev_destroy(struct uhid_device *uhid) { - if (!uhid->running) + if (!uhid->hid) return -EINVAL; uhid->running = false; @@ -565,6 +585,7 @@ static int uhid_dev_destroy(struct uhid_device *uhid) cancel_work_sync(&uhid->worker); hid_destroy_device(uhid->hid); + uhid->hid = NULL; kfree(uhid->rd_data); return 0; -- cgit v1.2.3-58-ga151 From c8e7ff41f819b0c31c66c5196933c26c18f7681f Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 14 Jan 2022 14:33:31 +0100 Subject: HID: uhid: Use READ_ONCE()/WRITE_ONCE() for ->running The flag uhid->running can be set to false by uhid_device_add_worker() without holding the uhid->devlock. Mark all reads/writes of the flag that might race with READ_ONCE()/WRITE_ONCE() for clarity and correctness. Signed-off-by: Jann Horn Signed-off-by: Jiri Kosina --- drivers/hid/uhid.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c index fc06d8bb42e0..614adb510dbd 100644 --- a/drivers/hid/uhid.c +++ b/drivers/hid/uhid.c @@ -84,7 +84,7 @@ static void uhid_device_add_worker(struct work_struct *work) * However, we do have to clear the ->running flag and do a * wakeup to make sure userspace knows that the device is gone. */ - uhid->running = false; + WRITE_ONCE(uhid->running, false); wake_up_interruptible(&uhid->report_wait); } } @@ -194,9 +194,9 @@ static int __uhid_report_queue_and_wait(struct uhid_device *uhid, spin_unlock_irqrestore(&uhid->qlock, flags); ret = wait_event_interruptible_timeout(uhid->report_wait, - !uhid->report_running || !uhid->running, + !uhid->report_running || !READ_ONCE(uhid->running), 5 * HZ); - if (!ret || !uhid->running || uhid->report_running) + if (!ret || !READ_ONCE(uhid->running) || uhid->report_running) ret = -EIO; else if (ret < 0) ret = -ERESTARTSYS; @@ -237,7 +237,7 @@ static int uhid_hid_get_report(struct hid_device *hid, unsigned char rnum, struct uhid_event *ev; int ret; - if (!uhid->running) + if (!READ_ONCE(uhid->running)) return -EIO; ev = kzalloc(sizeof(*ev), GFP_KERNEL); @@ -279,7 +279,7 @@ static int uhid_hid_set_report(struct hid_device *hid, unsigned char rnum, struct uhid_event *ev; int ret; - if (!uhid->running || count > UHID_DATA_MAX) + if (!READ_ONCE(uhid->running) || count > UHID_DATA_MAX) return -EIO; ev = kzalloc(sizeof(*ev), GFP_KERNEL); @@ -579,7 +579,7 @@ static int uhid_dev_destroy(struct uhid_device *uhid) if (!uhid->hid) return -EINVAL; - uhid->running = false; + WRITE_ONCE(uhid->running, false); wake_up_interruptible(&uhid->report_wait); cancel_work_sync(&uhid->worker); @@ -593,7 +593,7 @@ static int uhid_dev_destroy(struct uhid_device *uhid) static int uhid_dev_input(struct uhid_device *uhid, struct uhid_event *ev) { - if (!uhid->running) + if (!READ_ONCE(uhid->running)) return -EINVAL; hid_input_report(uhid->hid, HID_INPUT_REPORT, ev->u.input.data, @@ -604,7 +604,7 @@ static int uhid_dev_input(struct uhid_device *uhid, struct uhid_event *ev) static int uhid_dev_input2(struct uhid_device *uhid, struct uhid_event *ev) { - if (!uhid->running) + if (!READ_ONCE(uhid->running)) return -EINVAL; hid_input_report(uhid->hid, HID_INPUT_REPORT, ev->u.input2.data, @@ -616,7 +616,7 @@ static int uhid_dev_input2(struct uhid_device *uhid, struct uhid_event *ev) static int uhid_dev_get_report_reply(struct uhid_device *uhid, struct uhid_event *ev) { - if (!uhid->running) + if (!READ_ONCE(uhid->running)) return -EINVAL; uhid_report_wake_up(uhid, ev->u.get_report_reply.id, ev); @@ -626,7 +626,7 @@ static int uhid_dev_get_report_reply(struct uhid_device *uhid, static int uhid_dev_set_report_reply(struct uhid_device *uhid, struct uhid_event *ev) { - if (!uhid->running) + if (!READ_ONCE(uhid->running)) return -EINVAL; uhid_report_wake_up(uhid, ev->u.set_report_reply.id, ev); -- cgit v1.2.3-58-ga151 From c476d430bfc02115e67a32a268ebd31b8d683698 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 21 Dec 2021 08:52:09 -0400 Subject: dt-bindings: display: Add SPI peripheral schema to SPI based displays With 'unevaluatedProperties' support enabled, several SPI based display binding examples have warnings: Documentation/devicetree/bindings/display/panel/samsung,ld9040.example.dt.yaml: lcd@0: Unevaluated properties are not allowed ('#address-cells', '#size-cells', 'spi-max-frequency', 'spi-cpol', 'spi-cpha' were unexpected) Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency', 'spi-3wire' were unexpected) Documentation/devicetree/bindings/display/panel/ilitek,ili9322.example.dt.yaml: display@0: Unevaluated properties are not allowed ('reg' was unexpected) Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.example.dt.yaml: display@0: Unevaluated properties are not allowed ('spi-max-frequency' was unexpected) Documentation/devicetree/bindings/display/panel/abt,y030xx067a.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency' was unexpected) Documentation/devicetree/bindings/display/panel/sony,acx565akm.example.dt.yaml: panel@2: Unevaluated properties are not allowed ('spi-max-frequency', 'reg' were unexpected) Documentation/devicetree/bindings/display/panel/tpo,td.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency', 'spi-cpol', 'spi-cpha' were unexpected) Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('reg', 'spi-max-frequency', 'spi-cpol', 'spi-cpha' were unexpected) Documentation/devicetree/bindings/display/panel/innolux,ej030na.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency' was unexpected) Documentation/devicetree/bindings/display/panel/sitronix,st7789v.example.dt.yaml: panel@0: Unevaluated properties are not allowed ('spi-max-frequency', 'spi-cpol', 'spi-cpha' were unexpected) Fix all of these by adding a reference to spi-peripheral-props.yaml. With this, the description that the binding must follow spi-controller.yaml is both a bit out of date and redundant, so remove it. Signed-off-by: Rob Herring Reviewed-by: Linus Walleij Acked-by: Paul Cercueil Acked-by: Sam Ravnborg Link: https://lore.kernel.org/r/20211221125209.1195932-1-robh@kernel.org --- .../devicetree/bindings/display/panel/abt,y030xx067a.yaml | 5 +---- .../devicetree/bindings/display/panel/ilitek,ili9322.yaml | 4 +--- .../devicetree/bindings/display/panel/innolux,ej030na.yaml | 5 +---- .../bindings/display/panel/kingdisplay,kd035g6-54nt.yaml | 5 +---- .../devicetree/bindings/display/panel/lgphilips,lb035q02.yaml | 5 +---- .../devicetree/bindings/display/panel/samsung,ld9040.yaml | 7 +------ .../devicetree/bindings/display/panel/samsung,s6e63m0.yaml | 1 + .../devicetree/bindings/display/panel/sitronix,st7789v.yaml | 5 +---- .../devicetree/bindings/display/panel/sony,acx565akm.yaml | 5 +---- Documentation/devicetree/bindings/display/panel/tpo,td.yaml | 5 +---- 10 files changed, 10 insertions(+), 37 deletions(-) diff --git a/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml b/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml index a108029ecfab..acd2f3faa6b9 100644 --- a/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml +++ b/Documentation/devicetree/bindings/display/panel/abt,y030xx067a.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Asia Better Technology 3.0" (320x480 pixels) 24-bit IPS LCD panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Paul Cercueil allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml b/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml index e89c1ea62ffa..7d221ef35443 100644 --- a/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml +++ b/Documentation/devicetree/bindings/display/panel/ilitek,ili9322.yaml @@ -15,11 +15,9 @@ description: | 960 TFT source driver pins and 240 TFT gate driver pins, VCOM, VCOML and VCOMH outputs. - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml b/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml index cda36c04e85c..72788e3e6c59 100644 --- a/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml +++ b/Documentation/devicetree/bindings/display/panel/innolux,ej030na.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Innolux EJ030NA 3.0" (320x480 pixels) 24-bit TFT LCD panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Paul Cercueil allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml b/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml index c45c92a3d41f..2a2756d19681 100644 --- a/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml +++ b/Documentation/devicetree/bindings/display/panel/kingdisplay,kd035g6-54nt.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: King Display KD035G6-54NT 3.5" (320x240 pixels) 24-bit TFT LCD panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Paul Cercueil allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml b/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml index 830e335ddb53..5e4e0e552c2f 100644 --- a/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml +++ b/Documentation/devicetree/bindings/display/panel/lgphilips,lb035q02.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: LG.Philips LB035Q02 Panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Tomi Valkeinen allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml b/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml index 060ee27a4749..d525165d6d63 100644 --- a/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml +++ b/Documentation/devicetree/bindings/display/panel/samsung,ld9040.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Samsung LD9040 AMOLED LCD parallel RGB panel with SPI control bus -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Andrzej Hajda allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: @@ -63,8 +60,6 @@ examples: lcd@0 { compatible = "samsung,ld9040"; - #address-cells = <1>; - #size-cells = <0>; reg = <0>; vdd3-supply = <&ldo7_reg>; diff --git a/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml index ea58df49263a..940f7f88526f 100644 --- a/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml +++ b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.yaml @@ -12,6 +12,7 @@ maintainers: allOf: - $ref: panel-common.yaml# - $ref: /schemas/leds/backlight/common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml b/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml index fa46d151e7b3..9e1d707c2ace 100644 --- a/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml +++ b/Documentation/devicetree/bindings/display/panel/sitronix,st7789v.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Sitronix ST7789V RGB panel with SPI control bus -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Maxime Ripard allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml b/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml index 95d053c548ab..98abdf4ddeac 100644 --- a/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml +++ b/Documentation/devicetree/bindings/display/panel/sony,acx565akm.yaml @@ -6,15 +6,12 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Sony ACX565AKM SDI Panel -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Tomi Valkeinen allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: diff --git a/Documentation/devicetree/bindings/display/panel/tpo,td.yaml b/Documentation/devicetree/bindings/display/panel/tpo,td.yaml index 4aa605613445..f902a9d74141 100644 --- a/Documentation/devicetree/bindings/display/panel/tpo,td.yaml +++ b/Documentation/devicetree/bindings/display/panel/tpo,td.yaml @@ -6,16 +6,13 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Toppoly TD Panels -description: | - The panel must obey the rules for a SPI slave device as specified in - spi/spi-controller.yaml - maintainers: - Marek Belisko - H. Nikolaus Schaller allOf: - $ref: panel-common.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# properties: compatible: -- cgit v1.2.3-58-ga151 From 59449e5dc87e72e9b4a16df115625a93b0112203 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 6 Jan 2022 12:25:14 -0600 Subject: dt-bindings: mmc: arm,pl18x: Make each example a separate entry Each independent example should be a separate entry. This and dropping 'interrupt-parent' allows for 'interrupts' to have different cell sizes. Signed-off-by: Rob Herring Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20220106182518.1435497-6-robh@kernel.org --- Documentation/devicetree/bindings/mmc/arm,pl18x.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mmc/arm,pl18x.yaml b/Documentation/devicetree/bindings/mmc/arm,pl18x.yaml index 47595cb483be..2a64cffbe6ad 100644 --- a/Documentation/devicetree/bindings/mmc/arm,pl18x.yaml +++ b/Documentation/devicetree/bindings/mmc/arm,pl18x.yaml @@ -167,6 +167,9 @@ examples: clock-names = "mclk", "apb_pclk"; }; + - | + #include + mmc@80126000 { compatible = "arm,pl18x", "arm,primecell"; reg = <0x80126000 0x1000>; @@ -188,12 +191,12 @@ examples: vqmmc-supply = <&vmmci>; }; + - | mmc@101f6000 { compatible = "arm,pl18x", "arm,primecell"; reg = <0x101f6000 0x1000>; clocks = <&sdiclk>, <&pclksdi>; clock-names = "mclk", "apb_pclk"; - interrupt-parent = <&vica>; interrupts = <22>; max-frequency = <400000>; bus-width = <4>; @@ -208,6 +211,7 @@ examples: vmmc-supply = <&vmmc_regulator>; }; + - | mmc@52007000 { compatible = "arm,pl18x", "arm,primecell"; arm,primecell-periphid = <0x10153180>; -- cgit v1.2.3-58-ga151 From d9dfab097d90f74dd8d7198aa6e8b87bc15f2122 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Thu, 6 Jan 2022 12:25:15 -0600 Subject: dt-bindings: rtc: st,stm32-rtc: Make each example a separate entry Each independent example should be a separate entry. This allows for 'interrupts' to have different cell sizes. Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220106182518.1435497-7-robh@kernel.org --- Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml b/Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml index 2359f541b770..764717ce1873 100644 --- a/Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml +++ b/Documentation/devicetree/bindings/rtc/st,stm32-rtc.yaml @@ -127,6 +127,7 @@ examples: st,syscfg = <&pwrcfg 0x00 0x100>; }; + - | #include #include rtc@5c004000 { -- cgit v1.2.3-58-ga151 From 8c0ae778e2874f3742bd619000b791d178c187e2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 19 Jan 2022 10:10:50 +0100 Subject: ALSA: core: Simplify snd_power_ref_and_wait() with the standard macro Use wait_event_cmd() macro and simplify snd_power_ref_wait() implementation. This may also cover possible races in the current open code, too. Reviewed-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20220119091050.30125-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/core/init.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/sound/core/init.c b/sound/core/init.c index ac335f5906c6..31ba7024e3ad 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -1111,29 +1111,14 @@ EXPORT_SYMBOL(snd_card_file_remove); */ int snd_power_ref_and_wait(struct snd_card *card) { - wait_queue_entry_t wait; - int result = 0; - snd_power_ref(card); - /* fastpath */ if (snd_power_get_state(card) == SNDRV_CTL_POWER_D0) return 0; - init_waitqueue_entry(&wait, current); - add_wait_queue(&card->power_sleep, &wait); - while (1) { - if (card->shutdown) { - result = -ENODEV; - break; - } - if (snd_power_get_state(card) == SNDRV_CTL_POWER_D0) - break; - snd_power_unref(card); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(30 * HZ); - snd_power_ref(card); - } - remove_wait_queue(&card->power_sleep, &wait); - return result; + wait_event_cmd(card->power_sleep, + card->shutdown || + snd_power_get_state(card) == SNDRV_CTL_POWER_D0, + snd_power_unref(card), snd_power_ref(card)); + return card->shutdown ? -ENODEV : 0; } EXPORT_SYMBOL_GPL(snd_power_ref_and_wait); -- cgit v1.2.3-58-ga151 From 9d5f0c36438eeae7566ca383b2b673179e3cc613 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 18 Jan 2022 09:02:43 -0300 Subject: perf machine: Use path__join() to compose a path instead of snprintf(dir, '/', filename) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its more intention revealing, and if we're interested in the odd cases where this may end up truncating we can do debug checks at one centralized place. Motivation, of all the container builds, fedora rawhide started complaining of: util/machine.c: In function ‘machine__create_modules’: util/machine.c:1419:50: error: ‘%s’ directive output may be truncated writing up to 255 bytes into a region of size between 0 and 4095 [-Werror=format-truncation=] 1419 | snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); | ^~ In file included from /usr/include/stdio.h:894, from util/branch.h:9, from util/callchain.h:8, from util/machine.c:7: In function ‘snprintf’, inlined from ‘maps__set_modules_path_dir’ at util/machine.c:1419:3, inlined from ‘machine__set_modules_path’ at util/machine.c:1473:9, inlined from ‘machine__create_modules’ at util/machine.c:1519:7: /usr/include/bits/stdio2.h:71:10: note: ‘__builtin___snprintf_chk’ output between 2 and 4352 bytes into a destination of size 4096 There are other places where we should use path__join(), but lets get rid of this one first. Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Acked-by: Ian Rogers Link: Link: https://lore.kernel.org/r/YebZKjwgfdOz0lAs@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 3901440aeff9..f70ba56912d4 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -16,6 +16,7 @@ #include "map_symbol.h" #include "branch.h" #include "mem-events.h" +#include "path.h" #include "srcline.h" #include "symbol.h" #include "sort.h" @@ -1416,7 +1417,7 @@ static int maps__set_modules_path_dir(struct maps *maps, const char *dir_name, i struct stat st; /*sshfs might return bad dent->d_type, so we have to stat*/ - snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); + path__join(path, sizeof(path), dir_name, dent->d_name); if (stat(path, &st)) continue; -- cgit v1.2.3-58-ga151 From 7c8a4742c4abe205ec9daf416c9d42fd6b406e8e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 13 Jan 2022 23:30:17 +0000 Subject: KVM: x86/mmu: Fix write-protection of PTs mapped by the TDP MMU When the TDP MMU is write-protection GFNs for page table protection (as opposed to for dirty logging, or due to the HVA not being writable), it checks if the SPTE is already write-protected and if so skips modifying the SPTE and the TLB flush. This behavior is incorrect because it fails to check if the SPTE is write-protected for page table protection, i.e. fails to check that MMU-writable is '0'. If the SPTE was write-protected for dirty logging but not page table protection, the SPTE could locklessly be made writable, and vCPUs could still be running with writable mappings cached in their TLB. Fix this by only skipping setting the SPTE if the SPTE is already write-protected *and* MMU-writable is already clear. Technically, checking only MMU-writable would suffice; a SPTE cannot be writable without MMU-writable being set. But check both to be paranoid and because it arguably yields more readable code. Fixes: 46044f72c382 ("kvm: x86/mmu: Support write protection for nesting in tdp MMU") Cc: stable@vger.kernel.org Signed-off-by: David Matlack Message-Id: <20220113233020.3986005-2-dmatlack@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7b1bc816b7c3..bc9e3553fba2 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, !is_last_spte(iter.old_spte, iter.level)) continue; - if (!is_writable_pte(iter.old_spte)) - break; - new_spte = iter.old_spte & ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); + if (new_spte == iter.old_spte) + break; + tdp_mmu_set_spte(kvm, &iter, new_spte); spte_set = true; } -- cgit v1.2.3-58-ga151 From f082d86ea68559e4bd1ecaffa04981b72281e28f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 13 Jan 2022 23:30:18 +0000 Subject: KVM: x86/mmu: Clear MMU-writable during changed_pte notifier When handling the changed_pte notifier and the new PTE is read-only, clear both the Host-writable and MMU-writable bits in the SPTE. This preserves the invariant that MMU-writable is set if-and-only-if Host-writable is set. No functional change intended. Nothing currently relies on the aforementioned invariant and technically the changed_pte notifier is dead code. Signed-off-by: David Matlack Message-Id: <20220113233020.3986005-3-dmatlack@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/spte.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index 8a7b03207762..f8677404c93c 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -215,6 +215,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn) new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~shadow_host_writable_mask; + new_spte &= ~shadow_mmu_writable_mask; new_spte = mark_spte_for_access_track(new_spte); -- cgit v1.2.3-58-ga151 From 5f16bcac6e280a7dade580d9627f5cf93ef6aa56 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 13 Jan 2022 23:30:19 +0000 Subject: KVM: x86/mmu: Document and enforce MMU-writable and Host-writable invariants SPTEs are tagged with software-only bits to indicate if it is "MMU-writable" and "Host-writable". These bits are used to determine why KVM has marked an SPTE as read-only. Document these bits and their invariants, and enforce the invariants with new WARNs in spte_can_locklessly_be_made_writable() to ensure they are not accidentally violated in the future. Opportunistically move DEFAULT_SPTE_{MMU,HOST}_WRITABLE next to EPT_SPTE_{MMU,HOST}_WRITABLE since the new documentation applies to both. No functional change intended. Signed-off-by: David Matlack Message-Id: <20220113233020.3986005-4-dmatlack@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/spte.h | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index a4af2a42695c..be6a007a4af3 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) -/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ -#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) -#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) - /* * The mask/shift to use for saving the original R/X bits when marking the PTE * as not-present for access tracking purposes. We do not save the W bit as the @@ -78,6 +74,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0); SHADOW_ACC_TRACK_SAVED_BITS_SHIFT) static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK)); +/* + * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits + * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs + * that map guest pages in read-only memslots and read-only VMAs. + * + * Invariants: + * - If Host-writable is clear, PT_WRITABLE_MASK must be clear. + * + * + * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU + * allows writes to the guest page mapped by the SPTE. This bit is cleared when + * the guest page mapped by the SPTE contains a page table that is being + * monitored for shadow paging. In this case the SPTE can only be made writable + * by unsyncing the shadow page under the mmu_lock. + * + * Invariants: + * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear. + * - If MMU-writable is set, Host-writable must be set. + * + * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared + * to track writes for dirty logging. For such SPTEs, KVM will locklessly set + * PT_WRITABLE_MASK upon the next write from the guest and record the write in + * the dirty log (see fast_page_fault()). + */ + +/* Bits 9 and 10 are ignored by all non-EPT PTEs. */ +#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9) +#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10) + /* * Low ignored bits are at a premium for EPT, use high ignored bits, taking care * to not overlap the A/D type mask or the saved access bits of access-tracked @@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, static inline bool spte_can_locklessly_be_made_writable(u64 spte) { - return (spte & shadow_host_writable_mask) && - (spte & shadow_mmu_writable_mask); + if (spte & shadow_mmu_writable_mask) { + WARN_ON_ONCE(!(spte & shadow_host_writable_mask)); + return true; + } + + WARN_ON_ONCE(spte & PT_WRITABLE_MASK); + return false; } static inline u64 get_mmio_spte_generation(u64 spte) -- cgit v1.2.3-58-ga151 From 6ff94f27fd47847d6ecb9302f9d3bd1ca991a17f Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 13 Jan 2022 23:30:20 +0000 Subject: KVM: x86/mmu: Improve TLB flush comment in kvm_mmu_slot_remove_write_access() Rewrite the comment in kvm_mmu_slot_remove_write_access() that explains why it is safe to flush TLBs outside of the MMU lock after write-protecting SPTEs for dirty logging. The current comment is a long run-on sentence that was difficult to understand. In addition it was specific to the shadow MMU (mentioning mmu_spte_update()) when the TDP MMU has to handle this as well. The new comment explains: - Why the TLB flush is necessary at all. - Why it is desirable to do the TLB flush outside of the MMU lock. - Why it is safe to do the TLB flush outside of the MMU lock. No functional change intended. Signed-off-by: David Matlack Message-Id: <20220113233020.3986005-5-dmatlack@google.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1d275e9d76b5..593093b52395 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) continue; flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } @@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, } /* - * We can flush all the TLBs out of the mmu lock without TLB - * corruption since we just change the spte from writable to - * readonly so that we only need to care the case of changing - * spte from present to present (changing the spte from present - * to nonpresent will flush all the TLBs immediately), in other - * words, the only case we care is mmu_spte_update() where we - * have checked Host-writable | MMU-writable instead of - * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK - * anymore. + * Flush TLBs if any SPTEs had to be write-protected to ensure that + * guest writes are reflected in the dirty bitmap before the memslot + * update completes, i.e. before enabling dirty logging is visible to + * userspace. + * + * Perform the TLB flush outside the mmu_lock to reduce the amount of + * time the lock is held. However, this does mean that another CPU can + * now grab mmu_lock and encounter a write-protected SPTE while CPUs + * still have a writable mapping for the associated GFN in their TLB. + * + * This is safe but requires KVM to be careful when making decisions + * based on the write-protection status of an SPTE. Specifically, KVM + * also write-protects SPTEs to monitor changes to guest page tables + * during shadow paging, and must guarantee no CPUs can write to those + * page before the lock is dropped. As mentioned in the previous + * paragraph, a write-protected SPTE is no guarantee that CPU cannot + * perform writes. So to determine if a TLB flush is truly required, KVM + * will clear a separate software-only bit (MMU-writable) and skip the + * flush if-and-only-if this bit was already clear. + * + * See DEFAULT_SPTE_MMU_WRITEABLE for more details. */ if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); -- cgit v1.2.3-58-ga151 From e9737468829c2f6abc0c67e5372f8878dff11653 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 17 Jan 2022 15:45:31 +0800 Subject: KVM: x86/cpuid: Clear XFD for component i if the base feature is missing According to Intel extended feature disable (XFD) spec, the sub-function i (i > 1) of CPUID function 0DH enumerates "details for state component i. ECX[2] enumerates support for XFD support for this state component." If KVM does not report F(XFD) feature (e.g. due to CONFIG_X86_64), then the corresponding XFD support for any state component i should also be removed. Translate this dependency into KVM terms. Fixes: 690a757d610e ("kvm: x86: Add CPUID support for Intel AMX") Signed-off-by: Like Xu Message-Id: <20220117074531.76925-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a7c280d2113b..3902c28fb6cb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -936,6 +936,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) --array->nent; continue; } + + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + entry->ecx &= ~BIT_ULL(2); entry->edx = 0; } break; -- cgit v1.2.3-58-ga151 From 1a1d1dbce6d5477e2bb08ce1ef0d77caa838cc8e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 17 Jan 2022 20:48:17 -0500 Subject: kvm: selftests: conditionally build vm_xsave_req_perm() vm_xsave_req_perm() is currently defined and used by x86_64 only. Make it compiled into vm_create_with_vcpus() only when on x86_64 machines. Otherwise, it would cause linkage errors, e.g. on s390x. Fixes: 415a3c33e8 ("kvm: selftests: Add support for KVM_CAP_XSAVE2") Reported-by: Janis Schoetterl-Glausch Signed-off-by: Wei Wang Tested-by: Janis Schoetterl-Glausch Message-Id: <20220118014817.30910-1-wei.w.wang@intel.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/kvm_util.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 4a645dc77f34..c22a17aac6b0 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -393,10 +393,12 @@ struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, struct kvm_vm *vm; int i; +#ifdef __x86_64__ /* * Permission needs to be requested before KVM_SET_CPUID2. */ vm_xsave_req_perm(); +#endif /* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */ if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES) -- cgit v1.2.3-58-ga151 From 3663c9045f51a7ad635a0785adef07c21b79b560 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sat, 30 Oct 2021 04:51:35 +0000 Subject: cifs: check reconnects for channels of active tcons too With the new multichannel logic, when a channel needs reconnection, the tree connect and other channels can still be active. This fix will handle cases of checking for channel reconnect, when the tcon does not need reconnect. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/connect.c | 34 ++++++++++++++++++++-- fs/cifs/smb2pdu.c | 82 +++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 99 insertions(+), 19 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f84978b76bb6..cace0818d510 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -117,6 +117,7 @@ enum statusEnum { CifsInSessSetup, CifsNeedTcon, CifsInTcon, + CifsNeedFilesInvalidate, CifsInFilesInvalidate }; @@ -923,6 +924,7 @@ struct cifs_chan { */ struct cifs_ses { struct list_head smb_ses_list; + struct list_head rlist; /* reconnect list */ struct list_head tcon_list; struct cifs_tcon *tcon_ipc; struct mutex session_mutex; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index accce1b351c6..804ffcde0391 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -335,6 +335,7 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); + mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } } while (server->tcpStatus == CifsNeedReconnect); @@ -4404,9 +4405,22 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru char *tree; struct dfs_info3_param ref = {0}; + /* only send once per connect */ + spin_lock(&cifs_tcp_ses_lock); + if (tcon->ses->status != CifsGood || + (tcon->tidStatus != CifsNew && + tcon->tidStatus != CifsNeedTcon)) { + spin_unlock(&cifs_tcp_ses_lock); + return 0; + } + tcon->tidStatus = CifsInTcon; + spin_unlock(&cifs_tcp_ses_lock); + tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL); - if (!tree) - return -ENOMEM; + if (!tree) { + rc = -ENOMEM; + goto out; + } if (tcon->ipc) { scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname); @@ -4438,11 +4452,18 @@ out: kfree(tree); cifs_put_tcp_super(sb); + if (rc) { + spin_lock(&cifs_tcp_ses_lock); + tcon->tidStatus = CifsNeedTcon; + spin_unlock(&cifs_tcp_ses_lock); + } + return rc; } #else int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc) { + int rc; const struct smb_version_operations *ops = tcon->ses->server->ops; /* only send once per connect */ @@ -4456,6 +4477,13 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru tcon->tidStatus = CifsInTcon; spin_unlock(&cifs_tcp_ses_lock); - return ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); + if (rc) { + spin_lock(&cifs_tcp_ses_lock); + tcon->tidStatus = CifsNeedTcon; + spin_unlock(&cifs_tcp_ses_lock); + } + + return rc; } #endif diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 625e3e9cb614..41b6dffc84c7 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -289,14 +289,18 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, rc = -EHOSTDOWN; goto failed; } - } - - if (rc || !tcon->need_reconnect) { + } else { mutex_unlock(&ses->session_mutex); goto out; } + mutex_unlock(&ses->session_mutex); skip_sess_setup: + mutex_lock(&ses->session_mutex); + if (!tcon->need_reconnect) { + mutex_unlock(&ses->session_mutex); + goto out; + } cifs_mark_open_files_invalid(tcon); if (tcon->use_persistent) tcon->need_reopen_files = true; @@ -3787,27 +3791,35 @@ void smb2_reconnect_server(struct work_struct *work) { struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, reconnect.work); - struct cifs_ses *ses; + struct TCP_Server_Info *pserver; + struct cifs_ses *ses, *ses2; struct cifs_tcon *tcon, *tcon2; - struct list_head tmp_list; - int tcon_exist = false; + struct list_head tmp_list, tmp_ses_list; + bool tcon_exist = false, ses_exist = false; + bool tcon_selected = false, ses_selected = false; int rc; - int resched = false; + bool resched = false; + /* If server is a channel, select the primary channel */ + pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ - mutex_lock(&server->reconnect_mutex); + mutex_lock(&pserver->reconnect_mutex); INIT_LIST_HEAD(&tmp_list); - cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); + INIT_LIST_HEAD(&tmp_ses_list); + cifs_dbg(FYI, "Reconnecting tcons and channels\n"); spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { + + tcon_selected = ses_selected = false; + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->need_reconnect || tcon->need_reopen_files) { tcon->tc_count++; list_add_tail(&tcon->rlist, &tmp_list); - tcon_exist = true; + tcon_selected = tcon_exist = true; } } /* @@ -3816,7 +3828,17 @@ void smb2_reconnect_server(struct work_struct *work) */ if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) { list_add_tail(&ses->tcon_ipc->rlist, &tmp_list); - tcon_exist = true; + tcon_selected = tcon_exist = true; + ses->ses_count++; + } + /* + * handle the case where channel needs to reconnect + * binding session, but tcon is healthy (some other channel + * is active) + */ + if (!tcon_selected && cifs_chan_needs_reconnect(ses, server)) { + list_add_tail(&ses->rlist, &tmp_ses_list); + ses_selected = ses_exist = true; ses->ses_count++; } } @@ -3824,7 +3846,7 @@ void smb2_reconnect_server(struct work_struct *work) * Get the reference to server struct to be sure that the last call of * cifs_put_tcon() in the loop below won't release the server pointer. */ - if (tcon_exist) + if (tcon_exist || ses_exist) server->srv_count++; spin_unlock(&cifs_tcp_ses_lock); @@ -3842,13 +3864,41 @@ void smb2_reconnect_server(struct work_struct *work) cifs_put_tcon(tcon); } - cifs_dbg(FYI, "Reconnecting tcons finished\n"); + if (!ses_exist) + goto done; + + /* allocate a dummy tcon struct used for reconnect */ + tcon = kzalloc(sizeof(struct cifs_tcon), GFP_KERNEL); + if (!tcon) { + resched = true; + list_del_init(&ses->rlist); + cifs_put_smb_ses(ses); + goto done; + } + + tcon->tidStatus = CifsGood; + tcon->retry = false; + tcon->need_reconnect = false; + + /* now reconnect sessions for necessary channels */ + list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { + tcon->ses = ses; + rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server); + if (rc) + resched = true; + list_del_init(&ses->rlist); + cifs_put_smb_ses(ses); + } + kfree(tcon); + +done: + cifs_dbg(FYI, "Reconnecting tcons and channels finished\n"); if (resched) queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ); - mutex_unlock(&server->reconnect_mutex); + mutex_unlock(&pserver->reconnect_mutex); /* now we can safely release srv struct */ - if (tcon_exist) + if (tcon_exist || ses_exist) cifs_put_tcp_session(server, 1); } -- cgit v1.2.3-58-ga151 From a05885ce13bd5ec9602551e32dfb1a4f26bfa542 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 17 Nov 2021 15:57:22 +0000 Subject: cifs: fix the connection state transitions with multichannel Recent changes to multichannel required some adjustments in the way connection states transitioned during/after reconnect. Also some minor fixes: 1. A pending switch of GlobalMid_Lock to cifs_tcp_ses_lock 2. Relocations of the code that logs reconnect 3. Changed some code in allocate_mid to suit the new scheme Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 40 ++++++++++++++++++++++++++-------------- fs/cifs/sess.c | 1 - fs/cifs/smb2pdu.c | 1 - fs/cifs/transport.c | 14 ++------------ 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 804ffcde0391..9fba2ec56328 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -181,17 +181,16 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, server->maxBuf = 0; server->max_read = 0; - cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); - trace_smb3_reconnect(server->CurrentMid, server->conn_id, server->hostname); /* * before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they * are not used until reconnected. */ - cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect\n", __func__); + cifs_dbg(FYI, "%s: marking necessary sessions and tcons for reconnect\n", __func__); /* If server is a channel, select the primary channel */ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; + spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { spin_lock(&ses->chan_lock); @@ -218,13 +217,8 @@ next_session: } spin_unlock(&cifs_tcp_ses_lock); - /* - * before reconnecting the tcp session, mark the smb session (uid) - * and the tid bad so they are not used until reconnected - */ - cifs_dbg(FYI, "%s: marking sessions and tcons for reconnect and tearing down socket\n", - __func__); /* do not want to be sending data on a socket we are freeing */ + cifs_dbg(FYI, "%s: tearing down socket\n", __func__); mutex_lock(&server->srv_mutex); if (server->ssocket) { cifs_dbg(FYI, "State: 0x%x Flags: 0x%lx\n", server->ssocket->state, @@ -280,7 +274,12 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num wake_up(&server->response_q); return false; } + + cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); + trace_smb3_reconnect(server->CurrentMid, server->conn_id, + server->hostname); server->tcpStatus = CifsNeedReconnect; + spin_unlock(&cifs_tcp_ses_lock); return true; } @@ -634,7 +633,6 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg) if (server->tcpStatus == CifsNeedReconnect) { spin_unlock(&cifs_tcp_ses_lock); - cifs_reconnect(server, false); return -ECONNABORTED; } spin_unlock(&cifs_tcp_ses_lock); @@ -3885,6 +3883,11 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, else rc = -EHOSTDOWN; spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInNegotiate) + server->tcpStatus = CifsNeedNegotiate; + spin_unlock(&cifs_tcp_ses_lock); } return rc; @@ -3931,8 +3934,18 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, server, nls_info); - if (rc) + if (rc) { cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInSessSetup) + server->tcpStatus = CifsNeedSessSetup; + spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (server->tcpStatus == CifsInSessSetup) + server->tcpStatus = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); + } return rc; } @@ -4279,9 +4292,8 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t /* only send once per connect */ spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->status != CifsGood || - (tcon->tidStatus != CifsNew && - tcon->tidStatus != CifsNeedTcon)) { + if (tcon->tidStatus != CifsNew && + tcon->tidStatus != CifsNeedTcon) { spin_unlock(&cifs_tcp_ses_lock); return 0; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 50ca479d7039..e7fddd4a5990 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1054,7 +1054,6 @@ sess_establish_session(struct sess_data *sess_data) /* Even if one channel is active, session is in good state */ spin_lock(&cifs_tcp_ses_lock); - server->tcpStatus = CifsGood; ses->status = CifsGood; spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 41b6dffc84c7..6d8cfe15fbaf 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1393,7 +1393,6 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) /* Even if one channel is active, session is in good state */ spin_lock(&cifs_tcp_ses_lock); - server->tcpStatus = CifsGood; ses->status = CifsGood; spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 93f0e8c1ea23..548cb376942e 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -431,7 +431,8 @@ unmask: * socket so the server throws away the partial SMB */ spin_lock(&cifs_tcp_ses_lock); - server->tcpStatus = CifsNeedReconnect; + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsNeedReconnect; spin_unlock(&cifs_tcp_ses_lock); trace_smb3_partial_send_reconnect(server->CurrentMid, server->conn_id, server->hostname); @@ -729,17 +730,6 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, struct mid_q_entry **ppmidQ) { spin_lock(&cifs_tcp_ses_lock); - if (ses->server->tcpStatus == CifsExiting) { - spin_unlock(&cifs_tcp_ses_lock); - return -ENOENT; - } - - if (ses->server->tcpStatus == CifsNeedReconnect) { - spin_unlock(&cifs_tcp_ses_lock); - cifs_dbg(FYI, "tcp session dead - return to caller to retry\n"); - return -EAGAIN; - } - if (ses->status == CifsNew) { if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) && (in_buf->Command != SMB_COM_NEGOTIATE)) { -- cgit v1.2.3-58-ga151 From 88b024f556fcd5bf1288c6333016f576cfa5f539 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 19 Nov 2021 14:16:57 +0000 Subject: cifs: protect all accesses to chan_* with chan_lock A spin lock called chan_lock was introduced recently. But not all accesses were protected. Doing that with this change. To make sure that a channel is not freed when in use, we need to introduce a ref count. But today, we don't ever free channels. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 4 +++- fs/cifs/sess.c | 10 +++++++--- fs/cifs/smb2pdu.c | 4 +++- fs/cifs/smb2transport.c | 6 ++++++ fs/cifs/transport.c | 3 +++ 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9fba2ec56328..d643f9e16f4e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1831,7 +1831,6 @@ void cifs_put_smb_ses(struct cifs_ses *ses) spin_lock(&ses->chan_lock); chan_count = ses->chan_count; - spin_unlock(&ses->chan_lock); /* close any extra channels */ if (chan_count > 1) { @@ -1848,6 +1847,7 @@ void cifs_put_smb_ses(struct cifs_ses *ses) ses->chans[i].server = NULL; } } + spin_unlock(&ses->chan_lock); sesInfoFree(ses); cifs_put_tcp_session(server, 0); @@ -2123,8 +2123,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) mutex_unlock(&ses->session_mutex); /* each channel uses a different signing key */ + spin_lock(&ses->chan_lock); memcpy(ses->chans[0].signkey, ses->smb3signingkey, sizeof(ses->smb3signingkey)); + spin_unlock(&ses->chan_lock); if (rc) goto get_ses_fail; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index e7fddd4a5990..f7de57f6f047 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -65,6 +65,8 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface) return false; } +/* channel helper functions. assumed that chan_lock is held by caller. */ + unsigned int cifs_ses_get_chan_index(struct cifs_ses *ses, struct TCP_Server_Info *server) @@ -134,10 +136,10 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) left = ses->chan_max - ses->chan_count; if (left <= 0) { + spin_unlock(&ses->chan_lock); cifs_dbg(FYI, "ses already at max_channels (%zu), nothing to open\n", ses->chan_max); - spin_unlock(&ses->chan_lock); return 0; } @@ -369,12 +371,14 @@ void cifs_ses_mark_for_reconnect(struct cifs_ses *ses) { int i; + spin_lock(&cifs_tcp_ses_lock); + spin_lock(&ses->chan_lock); for (i = 0; i < ses->chan_count; i++) { - spin_lock(&cifs_tcp_ses_lock); if (ses->chans[i].server->tcpStatus != CifsExiting) ses->chans[i].server->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); } + spin_unlock(&ses->chan_lock); + spin_unlock(&cifs_tcp_ses_lock); } static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6d8cfe15fbaf..ca8a16896f17 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -244,10 +244,10 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, spin_unlock(&ses->chan_lock); return 0; } + spin_unlock(&ses->chan_lock); cifs_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d", tcon->ses->chans_need_reconnect, tcon->need_reconnect); - spin_unlock(&ses->chan_lock); nls_codepage = load_nls_default(); @@ -3835,11 +3835,13 @@ void smb2_reconnect_server(struct work_struct *work) * binding session, but tcon is healthy (some other channel * is active) */ + spin_lock(&ses->chan_lock); if (!tcon_selected && cifs_chan_needs_reconnect(ses, server)) { list_add_tail(&ses->rlist, &tmp_ses_list); ses_selected = ses_exist = true; ses->ses_count++; } + spin_unlock(&ses->chan_lock); } /* * Get the reference to server struct to be sure that the last call of diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index b70a49b4edc0..2af79093b78b 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -100,6 +100,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) goto out; found: + spin_lock(&ses->chan_lock); if (cifs_chan_needs_reconnect(ses, server) && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { /* @@ -108,6 +109,7 @@ found: * session key */ memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); goto out; } @@ -119,9 +121,11 @@ found: chan = ses->chans + i; if (chan->server == server) { memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); goto out; } } + spin_unlock(&ses->chan_lock); cifs_dbg(VFS, "%s: Could not find channel signing key for session 0x%llx\n", @@ -430,8 +434,10 @@ generate_smb3signingkey(struct cifs_ses *ses, return rc; /* safe to access primary channel, since it will never go away */ + spin_lock(&ses->chan_lock); memcpy(ses->chans[0].signkey, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); rc = generate_key(ses, ptriplet->encryption.label, ptriplet->encryption.context, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 548cb376942e..8540f7c13eae 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1049,7 +1049,10 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses) /* round robin */ index = (uint)atomic_inc_return(&ses->chan_seq); + + spin_lock(&ses->chan_lock); index %= ses->chan_count; + spin_unlock(&ses->chan_lock); return ses->chans[index].server; } -- cgit v1.2.3-58-ga151 From 8a409cda978e212661b8c032e1b08b3b0b0f9d36 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Wed, 5 Jan 2022 02:24:37 +0500 Subject: cifs: remove unused variable ses_selected ses_selected is being declared and set at several places. It is not being used. Remove it. Signed-off-by: Muhammad Usama Anjum Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ca8a16896f17..202ddbafe939 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3795,7 +3795,7 @@ void smb2_reconnect_server(struct work_struct *work) struct cifs_tcon *tcon, *tcon2; struct list_head tmp_list, tmp_ses_list; bool tcon_exist = false, ses_exist = false; - bool tcon_selected = false, ses_selected = false; + bool tcon_selected = false; int rc; bool resched = false; @@ -3812,7 +3812,7 @@ void smb2_reconnect_server(struct work_struct *work) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) { - tcon_selected = ses_selected = false; + tcon_selected = false; list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { if (tcon->need_reconnect || tcon->need_reopen_files) { @@ -3838,7 +3838,7 @@ void smb2_reconnect_server(struct work_struct *work) spin_lock(&ses->chan_lock); if (!tcon_selected && cifs_chan_needs_reconnect(ses, server)) { list_add_tail(&ses->rlist, &tmp_ses_list); - ses_selected = ses_exist = true; + ses_exist = true; ses->ses_count++; } spin_unlock(&ses->chan_lock); -- cgit v1.2.3-58-ga151 From e154cb7b0ab961f9d785ed34c2d7128413e7083d Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sun, 16 Jan 2022 11:19:36 +0000 Subject: cifs: fix the cifs_reconnect path for DFS Recently, the cifs_reconnect code was refactored into two branches for regular vs dfs codepath. Some of my recent changes were missing in the dfs path, namely the code to enable periodic DNS query, and a missing lock. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d643f9e16f4e..4a31a9b3f34f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -338,8 +338,10 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, } } while (server->tcpStatus == CifsNeedReconnect); + spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsNeedNegotiate) mod_delayed_work(cifsiod_wq, &server->echo, 0); + spin_unlock(&cifs_tcp_ses_lock); wake_up(&server->response_q); return rc; @@ -454,6 +456,7 @@ reconnect_dfs_server(struct TCP_Server_Info *server, spin_unlock(&cifs_tcp_ses_lock); cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); + mod_delayed_work(cifsiod_wq, &server->reconnect, 0); } while (server->tcpStatus == CifsNeedReconnect); if (target_hint) -- cgit v1.2.3-58-ga151 From ece0767641740c7eea7aee5a332728e115b00eab Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sun, 16 Jan 2022 13:28:34 +0000 Subject: cifs: remove repeated state change in dfs tree connect cifs_tree_connect checks and sets the tidStatus for the tcon. cifs_tree_connect also calls a dfs specific tree connect function, which also does similar checks. This should not happen. Removing it with this change. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4a31a9b3f34f..6740a7c39df3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4295,16 +4295,6 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t struct dfs_cache_tgt_iterator *tit; bool target_match; - /* only send once per connect */ - spin_lock(&cifs_tcp_ses_lock); - if (tcon->tidStatus != CifsNew && - tcon->tidStatus != CifsNeedTcon) { - spin_unlock(&cifs_tcp_ses_lock); - return 0; - } - tcon->tidStatus = CifsInTcon; - spin_unlock(&cifs_tcp_ses_lock); - extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len); tit = dfs_cache_get_tgt_iterator(tl); -- cgit v1.2.3-58-ga151 From c1604da708d345a1ca1cf6a5537d503b14aa4787 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Sun, 16 Jan 2022 13:38:14 +0000 Subject: cifs: make status checks in version independent callers The status of tcp session, smb session and tcon have the same flow, irrespective of the SMB version used. Hence these status checks and updates should happen in the version independent callers of these commands. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 29 +++++++++++++++++++++++------ fs/cifs/sess.c | 9 --------- fs/cifs/smb2pdu.c | 14 -------------- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6740a7c39df3..0a35503b7b46 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3770,10 +3770,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, if (rc == 0) { bool is_unicode; - spin_lock(&cifs_tcp_ses_lock); - tcon->tidStatus = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - tcon->need_reconnect = false; tcon->tid = smb_buffer_response->Tid; bcc_ptr = pByteArea(smb_buffer_response); bytes_left = get_bcc(smb_buffer_response); @@ -3949,7 +3945,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, spin_lock(&cifs_tcp_ses_lock); if (server->tcpStatus == CifsInSessSetup) server->tcpStatus = CifsGood; + /* Even if one channel is active, session is in good state */ + if (ses->status == CifsInSessSetup) + ses->status = CifsGood; spin_unlock(&cifs_tcp_ses_lock); + + spin_lock(&ses->chan_lock); + cifs_chan_clear_need_reconnect(ses, server); + spin_unlock(&ses->chan_lock); } return rc; @@ -4461,8 +4464,15 @@ out: if (rc) { spin_lock(&cifs_tcp_ses_lock); - tcon->tidStatus = CifsNeedTcon; + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsNeedTcon; + spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsGood; spin_unlock(&cifs_tcp_ses_lock); + tcon->need_reconnect = false; } return rc; @@ -4487,8 +4497,15 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru rc = ops->tree_connect(xid, tcon->ses, tcon->treeName, tcon, nlsc); if (rc) { spin_lock(&cifs_tcp_ses_lock); - tcon->tidStatus = CifsNeedTcon; + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsNeedTcon; spin_unlock(&cifs_tcp_ses_lock); + } else { + spin_lock(&cifs_tcp_ses_lock); + if (tcon->tidStatus == CifsInTcon) + tcon->tidStatus = CifsGood; + spin_unlock(&cifs_tcp_ses_lock); + tcon->need_reconnect = false; } return rc; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index f7de57f6f047..97fba1f28e4b 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1052,15 +1052,6 @@ sess_establish_session(struct sess_data *sess_data) mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "CIFS session established successfully\n"); - spin_lock(&ses->chan_lock); - cifs_chan_clear_need_reconnect(ses, server); - spin_unlock(&ses->chan_lock); - - /* Even if one channel is active, session is in good state */ - spin_lock(&cifs_tcp_ses_lock); - ses->status = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - return 0; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 202ddbafe939..1e670e56b07a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1386,16 +1386,6 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - - spin_lock(&ses->chan_lock); - cifs_chan_clear_need_reconnect(ses, server); - spin_unlock(&ses->chan_lock); - - /* Even if one channel is active, session is in good state */ - spin_lock(&cifs_tcp_ses_lock); - ses->status = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - return rc; } @@ -1923,10 +1913,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->share_flags = le32_to_cpu(rsp->ShareFlags); tcon->capabilities = rsp->Capabilities; /* we keep caps little endian */ tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); - spin_lock(&cifs_tcp_ses_lock); - tcon->tidStatus = CifsGood; - spin_unlock(&cifs_tcp_ses_lock); - tcon->need_reconnect = false; tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); -- cgit v1.2.3-58-ga151 From 47de760655f329ce4b3d3e6276557220956d8c38 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Tue, 18 Jan 2022 09:24:08 +0000 Subject: cifs: update tcpStatus during negotiate and sess setup Till the end of SMB session setup, update tcpStatus and avoid updating session status field. There was a typo in cifs_setup_session, which caused ses->status to be updated instead. This was causing issues during reconnect. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/connect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0a35503b7b46..bcba3324cb4b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3908,7 +3908,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, spin_unlock(&cifs_tcp_ses_lock); return 0; } - ses->status = CifsInSessSetup; + server->tcpStatus = CifsInSessSetup; spin_unlock(&cifs_tcp_ses_lock); spin_lock(&ses->chan_lock); @@ -3946,8 +3946,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (server->tcpStatus == CifsInSessSetup) server->tcpStatus = CifsGood; /* Even if one channel is active, session is in good state */ - if (ses->status == CifsInSessSetup) - ses->status = CifsGood; + ses->status = CifsGood; spin_unlock(&cifs_tcp_ses_lock); spin_lock(&ses->chan_lock); -- cgit v1.2.3-58-ga151 From ba978e83255a759a4a07257a46ca6396a8b81787 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Mon, 17 Jan 2022 07:15:02 +0000 Subject: cifs: cifs_ses_mark_for_reconnect should also update reconnect bits Recent restructuring of cifs_reconnect introduced a helper func named cifs_ses_mark_for_reconnect, which updates the state of tcp session for all the channels of a session for reconnect. However, this does not update the session state and chans_need_reconnect bitmask. This change fixes that. Also, cifs_mark_tcp_sess_for_reconnect should mark set the bitmask for all channels when the whole session is marked for reconnect. Fixed that here too. Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifs_swn.c | 9 +++------ fs/cifs/cifsproto.h | 3 +++ fs/cifs/connect.c | 9 ++++++--- fs/cifs/dfs_cache.c | 2 +- fs/cifs/netmisc.c | 5 +---- fs/cifs/sess.c | 15 --------------- 6 files changed, 14 insertions(+), 29 deletions(-) diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 8f386dd9939e..463ebe34892b 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -396,11 +396,11 @@ static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const ch switch (state) { case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name); - cifs_ses_mark_for_reconnect(swnreg->tcon->ses); + cifs_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_AVAILABLE: cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name); - cifs_ses_mark_for_reconnect(swnreg->tcon->ses); + cifs_reconnect(swnreg->tcon->ses->server, true); break; case CIFS_SWN_RESOURCE_STATE_UNKNOWN: cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name); @@ -498,10 +498,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a goto unlock; } - spin_lock(&cifs_tcp_ses_lock); - if (tcon->ses->server->tcpStatus != CifsExiting) - tcon->ses->server->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); + cifs_reconnect(tcon->ses->server, false); unlock: mutex_unlock(&tcon->ses->server->srv_mutex); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f2029bc46215..d3701295402d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -131,6 +131,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *in_buf , struct smb_hdr *out_buf, int *bytes_returned); +void +cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, + bool mark_smb_session); extern int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session); extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index bcba3324cb4b..453aba3c2ad6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -168,7 +168,7 @@ static void cifs_resolve_server(struct work_struct *work) * @server needs to be previously set to CifsNeedReconnect. * */ -static void +void cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) { @@ -197,7 +197,10 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) goto next_session; - cifs_chan_set_need_reconnect(ses, server); + if (mark_smb_session) + CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses); + else + cifs_chan_set_need_reconnect(ses, server); /* If all channels need reconnect, then tcon needs reconnect */ if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) @@ -4396,7 +4399,7 @@ static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tco */ if (rc && server->current_fullpath != server->origin_fullpath) { server->current_fullpath = server->origin_fullpath; - cifs_ses_mark_for_reconnect(tcon->ses); + cifs_reconnect(tcon->ses->server, true); } dfs_cache_free_tgts(tl); diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index e9b0fa2a9614..dd9643751671 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1355,7 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach } cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__); - cifs_ses_mark_for_reconnect(tcon->ses); + cifs_reconnect(tcon->ses->server, true); } /* Refresh dfs referral of tcon and mark it for reconnect if needed */ diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 43b16b6d108c..ebe236b9d9f5 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -896,10 +896,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) if (class == ERRSRV && code == ERRbaduid) { cifs_dbg(FYI, "Server returned 0x%x, reconnecting session...\n", code); - spin_lock(&cifs_tcp_ses_lock); - if (mid->server->tcpStatus != CifsExiting) - mid->server->tcpStatus = CifsNeedReconnect; - spin_unlock(&cifs_tcp_ses_lock); + cifs_reconnect(mid->server, false); } } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 97fba1f28e4b..15373a377a36 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -366,21 +366,6 @@ out: return rc; } -/* Mark all session channels for reconnect */ -void cifs_ses_mark_for_reconnect(struct cifs_ses *ses) -{ - int i; - - spin_lock(&cifs_tcp_ses_lock); - spin_lock(&ses->chan_lock); - for (i = 0; i < ses->chan_count; i++) { - if (ses->chans[i].server->tcpStatus != CifsExiting) - ses->chans[i].server->tcpStatus = CifsNeedReconnect; - } - spin_unlock(&ses->chan_lock); - spin_unlock(&cifs_tcp_ses_lock); -} - static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, struct TCP_Server_Info *server, SESSION_SETUP_ANDX *pSMB) -- cgit v1.2.3-58-ga151 From 7ff775aca48adc854436b92c060e5eebfffb6a4a Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:26 -0800 Subject: KVM: x86/pmu: Use binary search to check filtered events The PMU event filter may contain up to 300 events. Replace the linear search in reprogram_gp_counter() with a binary search. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-2-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/pmu.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index e632693a2266..2c98f3ee8df4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include "x86.h" #include "cpuid.h" @@ -172,12 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return true; } +static int cmp_u64(const void *a, const void *b) +{ + return *(__u64 *)a - *(__u64 *)b; +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; - int i; bool allow_event = true; if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) @@ -192,16 +198,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu); if (filter) { - for (i = 0; i < filter->nevents; i++) - if (filter->events[i] == - (eventsel & AMD64_RAW_EVENT_MASK_NB)) - break; - if (filter->action == KVM_PMU_EVENT_ALLOW && - i == filter->nevents) - allow_event = false; - if (filter->action == KVM_PMU_EVENT_DENY && - i < filter->nevents) - allow_event = false; + __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB; + + if (bsearch(&key, filter->events, filter->nevents, + sizeof(__u64), cmp_u64)) + allow_event = filter->action == KVM_PMU_EVENT_ALLOW; + else + allow_event = filter->action == KVM_PMU_EVENT_DENY; } if (!allow_event) return; @@ -576,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) /* Ensure nevents can't be changed between the user copies. */ *filter = tmp; + /* + * Sort the in-kernel list so that we can search it with bsearch. + */ + sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL); + mutex_lock(&kvm->lock); filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter, mutex_is_locked(&kvm->lock)); -- cgit v1.2.3-58-ga151 From b33b9c407861985713ca18cc9ea05b7540210ad4 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:27 -0800 Subject: selftests: kvm/x86: Parameterize the CPUID vendor string check Refactor is_intel_cpu() to make it easier to reuse the bulk of the code for other vendors in the future. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-3-jmattson@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index d61e2326dc85..308a1fe687f1 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1253,10 +1253,10 @@ void kvm_x86_state_cleanup(struct kvm_x86_state *state) free(state); } -bool is_intel_cpu(void) +static bool cpu_vendor_string_is(const char *vendor) { + const uint32_t *chunk = (const uint32_t *)vendor; int eax, ebx, ecx, edx; - const uint32_t *chunk; const int leaf = 0; __asm__ __volatile__( @@ -1265,10 +1265,14 @@ bool is_intel_cpu(void) "=c"(ecx), "=d"(edx) : /* input */ "0"(leaf), "2"(0)); - chunk = (const uint32_t *)("GenuineIntel"); return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]); } +bool is_intel_cpu(void) +{ + return cpu_vendor_string_is("GenuineIntel"); +} + uint32_t kvm_get_cpuid_max_basic(void) { return kvm_get_supported_cpuid_entry(0)->eax; -- cgit v1.2.3-58-ga151 From 21066101f42cfd86fdd835b70ce0e36c335f5f4d Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:28 -0800 Subject: selftests: kvm/x86: Introduce is_amd_cpu() Replace the one ad hoc "AuthenticAMD" CPUID vendor string comparison with a new function, is_amd_cpu(). Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-4-jmattson@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/processor.h | 1 + tools/testing/selftests/kvm/lib/x86_64/processor.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index bb013d101c14..e82c44aa029a 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -364,6 +364,7 @@ static inline unsigned long get_xmm(int n) } bool is_intel_cpu(void); +bool is_amd_cpu(void); struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 308a1fe687f1..6bf01ded3776 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1273,6 +1273,14 @@ bool is_intel_cpu(void) return cpu_vendor_string_is("GenuineIntel"); } +/* + * Exclude early K5 samples with a vendor string of "AMDisbetter!" + */ +bool is_amd_cpu(void) +{ + return cpu_vendor_string_is("AuthenticAMD"); +} + uint32_t kvm_get_cpuid_max_basic(void) { return kvm_get_supported_cpuid_entry(0)->eax; @@ -1508,10 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui return cpuid; } -#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 -#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 -#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 - static inline unsigned x86_family(unsigned int eax) { unsigned int x86; @@ -1533,11 +1537,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1; /* Avoid reserved HyperTransport region on AMD processors. */ - eax = ecx = 0; - cpuid(&eax, &ebx, &ecx, &edx); - if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx || - ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx || - edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + if (!is_amd_cpu()) return max_gfn; /* On parts with <40 physical address bits, the area is fully hidden */ -- cgit v1.2.3-58-ga151 From 398f9240f90f4168f5882180723f743f7b682049 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:29 -0800 Subject: selftests: kvm/x86: Export x86_family() for use outside of processor.c Move this static inline function to processor.h, so that it can be used in individual tests, as needed. Opportunistically replace the bare 'unsigned' with 'unsigned int.' Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-5-jmattson@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/processor.h | 12 ++++++++++++ tools/testing/selftests/kvm/lib/x86_64/processor.c | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index e82c44aa029a..4eb73959ce05 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -366,6 +366,18 @@ static inline unsigned long get_xmm(int n) bool is_intel_cpu(void); bool is_amd_cpu(void); +static inline unsigned int x86_family(unsigned int eax) +{ + unsigned int x86; + + x86 = (eax >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (eax >> 20) & 0xff; + + return x86; +} + struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 6bf01ded3776..59dcfe1967cc 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1516,18 +1516,6 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui return cpuid; } -static inline unsigned x86_family(unsigned int eax) -{ - unsigned int x86; - - x86 = (eax >> 8) & 0xf; - - if (x86 == 0xf) - x86 += (eax >> 20) & 0xff; - - return x86; -} - unsigned long vm_compute_max_gfn(struct kvm_vm *vm) { const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */ -- cgit v1.2.3-58-ga151 From 2ba9047424fc7243c63ac57f5fdfa754aa895e3c Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:30 -0800 Subject: selftests: kvm/x86: Introduce x86_model() Extract the x86 model number from CPUID.01H:EAX. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-6-jmattson@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/include/x86_64/processor.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 4eb73959ce05..122447827954 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -378,6 +378,11 @@ static inline unsigned int x86_family(unsigned int eax) return x86; } +static inline unsigned int x86_model(unsigned int eax) +{ + return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f); +} + struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid); void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state); -- cgit v1.2.3-58-ga151 From bef9a701f3ebfb60da259b04778d24128505a96c Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 14 Jan 2022 21:24:31 -0800 Subject: selftests: kvm/x86: Add test for KVM_SET_PMU_EVENT_FILTER Verify that the PMU event filter works as expected. Note that the virtual PMU doesn't work as expected on AMD Zen CPUs (an intercepted rdmsr is counted as a retired branch instruction), but the PMU event filter does work. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Message-Id: <20220115052431.447232-7-jmattson@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/x86_64/pmu_event_filter_test.c | 438 +++++++++++++++++++++ 3 files changed, 440 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 20b4c921c9a2..9fe19f412f44 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -22,6 +22,7 @@ /x86_64/mmio_warning_test /x86_64/mmu_role_test /x86_64/platform_info_test +/x86_64/pmu_event_filter_test /x86_64/set_boot_cpu_id /x86_64/set_sregs_test /x86_64/sev_migrate_tests diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ec78a8692899..7fbc80e3ecdf 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -56,6 +56,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/mmu_role_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test +TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c new file mode 100644 index 000000000000..aa104946e6e0 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_SET_PMU_EVENT_FILTER. + * + * Copyright (C) 2022, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verifies the expected behavior of allow lists and deny lists for + * virtual PMU events. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +/* + * In lieu of copying perf_event.h into tools... + */ +#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) +#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) + +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_counters:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +union cpuid10_ebx { + struct { + unsigned int no_unhalted_core_cycles:1; + unsigned int no_instructions_retired:1; + unsigned int no_unhalted_reference_cycles:1; + unsigned int no_llc_reference:1; + unsigned int no_llc_misses:1; + unsigned int no_branch_instruction_retired:1; + unsigned int no_branch_misses_retired:1; + } split; + unsigned int full; +}; + +/* End of stuff taken from perf_event.h. */ + +/* Oddly, this isn't in perf_event.h. */ +#define ARCH_PERFMON_BRANCHES_RETIRED 5 + +#define VCPU_ID 0 +#define NUM_BRANCHES 42 + +/* + * This is how the event selector and unit mask are stored in an AMD + * core performance event-select register. Intel's format is similar, + * but the event selector is only 8 bits. + */ +#define EVENT(select, umask) ((select & 0xf00UL) << 24 | (select & 0xff) | \ + (umask & 0xff) << 8) + +/* + * "Branch instructions retired", from the Intel SDM, volume 3, + * "Pre-defined Architectural Performance Events." + */ + +#define INTEL_BR_RETIRED EVENT(0xc4, 0) + +/* + * "Retired branch instructions", from Processor Programming Reference + * (PPR) for AMD Family 17h Model 01h, Revision B1 Processors, + * Preliminary Processor Programming Reference (PPR) for AMD Family + * 17h Model 31h, Revision B0 Processors, and Preliminary Processor + * Programming Reference (PPR) for AMD Family 19h Model 01h, Revision + * B1 Processors Volume 1 of 2. + */ + +#define AMD_ZEN_BR_RETIRED EVENT(0xc2, 0) + +/* + * This event list comprises Intel's eight architectural events plus + * AMD's "retired branch instructions" for Zen[123] (and possibly + * other AMD CPUs). + */ +static const uint64_t event_list[] = { + EVENT(0x3c, 0), + EVENT(0xc0, 0), + EVENT(0x3c, 1), + EVENT(0x2e, 0x4f), + EVENT(0x2e, 0x41), + EVENT(0xc4, 0), + EVENT(0xc5, 0), + EVENT(0xa4, 1), + AMD_ZEN_BR_RETIRED, +}; + +/* + * If we encounter a #GP during the guest PMU sanity check, then the guest + * PMU is not functional. Inform the hypervisor via GUEST_SYNC(0). + */ +static void guest_gp_handler(struct ex_regs *regs) +{ + GUEST_SYNC(0); +} + +/* + * Check that we can write a new value to the given MSR and read it back. + * The caller should provide a non-empty set of bits that are safe to flip. + * + * Return on success. GUEST_SYNC(0) on error. + */ +static void check_msr(uint32_t msr, uint64_t bits_to_flip) +{ + uint64_t v = rdmsr(msr) ^ bits_to_flip; + + wrmsr(msr, v); + if (rdmsr(msr) != v) + GUEST_SYNC(0); + + v ^= bits_to_flip; + wrmsr(msr, v); + if (rdmsr(msr) != v) + GUEST_SYNC(0); +} + +static void intel_guest_code(void) +{ + check_msr(MSR_CORE_PERF_GLOBAL_CTRL, 1); + check_msr(MSR_P6_EVNTSEL0, 0xffff); + check_msr(MSR_IA32_PMC0, 0xffff); + GUEST_SYNC(1); + + for (;;) { + uint64_t br0, br1; + + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0); + wrmsr(MSR_P6_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | INTEL_BR_RETIRED); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 1); + br0 = rdmsr(MSR_IA32_PMC0); + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); + br1 = rdmsr(MSR_IA32_PMC0); + GUEST_SYNC(br1 - br0); + } +} + +/* + * To avoid needing a check for CPUID.80000001:ECX.PerfCtrExtCore[bit 23], + * this code uses the always-available, legacy K7 PMU MSRs, which alias to + * the first four of the six extended core PMU MSRs. + */ +static void amd_guest_code(void) +{ + check_msr(MSR_K7_EVNTSEL0, 0xffff); + check_msr(MSR_K7_PERFCTR0, 0xffff); + GUEST_SYNC(1); + + for (;;) { + uint64_t br0, br1; + + wrmsr(MSR_K7_EVNTSEL0, 0); + wrmsr(MSR_K7_EVNTSEL0, ARCH_PERFMON_EVENTSEL_ENABLE | + ARCH_PERFMON_EVENTSEL_OS | AMD_ZEN_BR_RETIRED); + br0 = rdmsr(MSR_K7_PERFCTR0); + __asm__ __volatile__("loop ." : "+c"((int){NUM_BRANCHES})); + br1 = rdmsr(MSR_K7_PERFCTR0); + GUEST_SYNC(br1 - br0); + } +} + +/* + * Run the VM to the next GUEST_SYNC(value), and return the value passed + * to the sync. Any other exit from the guest is fatal. + */ +static uint64_t run_vm_to_sync(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct ucall uc; + + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Exit_reason other than KVM_EXIT_IO: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + get_ucall(vm, VCPU_ID, &uc); + TEST_ASSERT(uc.cmd == UCALL_SYNC, + "Received ucall other than UCALL_SYNC: %lu", uc.cmd); + return uc.args[1]; +} + +/* + * In a nested environment or if the vPMU is disabled, the guest PMU + * might not work as architected (accessing the PMU MSRs may raise + * #GP, or writes could simply be discarded). In those situations, + * there is no point in running these tests. The guest code will perform + * a sanity check and then GUEST_SYNC(success). In the case of failure, + * the behavior of the guest on resumption is undefined. + */ +static bool sanity_check_pmu(struct kvm_vm *vm) +{ + bool success; + + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + success = run_vm_to_sync(vm); + vm_install_exception_handler(vm, GP_VECTOR, NULL); + + return success; +} + +static struct kvm_pmu_event_filter *make_pmu_event_filter(uint32_t nevents) +{ + struct kvm_pmu_event_filter *f; + int size = sizeof(*f) + nevents * sizeof(f->events[0]); + + f = malloc(size); + TEST_ASSERT(f, "Out of memory"); + memset(f, 0, size); + f->nevents = nevents; + return f; +} + +static struct kvm_pmu_event_filter *event_filter(uint32_t action) +{ + struct kvm_pmu_event_filter *f; + int i; + + f = make_pmu_event_filter(ARRAY_SIZE(event_list)); + f->action = action; + for (i = 0; i < ARRAY_SIZE(event_list); i++) + f->events[i] = event_list[i]; + + return f; +} + +/* + * Remove the first occurrence of 'event' (if any) from the filter's + * event list. + */ +static struct kvm_pmu_event_filter *remove_event(struct kvm_pmu_event_filter *f, + uint64_t event) +{ + bool found = false; + int i; + + for (i = 0; i < f->nevents; i++) { + if (found) + f->events[i - 1] = f->events[i]; + else + found = f->events[i] == event; + } + if (found) + f->nevents--; + return f; +} + +static void test_without_filter(struct kvm_vm *vm) +{ + uint64_t count = run_vm_to_sync(vm); + + if (count != NUM_BRANCHES) + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", + __func__, count, NUM_BRANCHES); + TEST_ASSERT(count, "Allowed PMU event is not counting"); +} + +static uint64_t test_with_filter(struct kvm_vm *vm, + struct kvm_pmu_event_filter *f) +{ + vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, (void *)f); + return run_vm_to_sync(vm); +} + +static void test_member_deny_list(struct kvm_vm *vm) +{ + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY); + uint64_t count = test_with_filter(vm, f); + + free(f); + if (count) + pr_info("%s: Branch instructions retired = %lu (expected 0)\n", + __func__, count); + TEST_ASSERT(!count, "Disallowed PMU Event is counting"); +} + +static void test_member_allow_list(struct kvm_vm *vm) +{ + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW); + uint64_t count = test_with_filter(vm, f); + + free(f); + if (count != NUM_BRANCHES) + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", + __func__, count, NUM_BRANCHES); + TEST_ASSERT(count, "Allowed PMU event is not counting"); +} + +static void test_not_member_deny_list(struct kvm_vm *vm) +{ + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_DENY); + uint64_t count; + + remove_event(f, INTEL_BR_RETIRED); + remove_event(f, AMD_ZEN_BR_RETIRED); + count = test_with_filter(vm, f); + free(f); + if (count != NUM_BRANCHES) + pr_info("%s: Branch instructions retired = %lu (expected %u)\n", + __func__, count, NUM_BRANCHES); + TEST_ASSERT(count, "Allowed PMU event is not counting"); +} + +static void test_not_member_allow_list(struct kvm_vm *vm) +{ + struct kvm_pmu_event_filter *f = event_filter(KVM_PMU_EVENT_ALLOW); + uint64_t count; + + remove_event(f, INTEL_BR_RETIRED); + remove_event(f, AMD_ZEN_BR_RETIRED); + count = test_with_filter(vm, f); + free(f); + if (count) + pr_info("%s: Branch instructions retired = %lu (expected 0)\n", + __func__, count); + TEST_ASSERT(!count, "Disallowed PMU Event is counting"); +} + +/* + * Check for a non-zero PMU version, at least one general-purpose + * counter per logical processor, an EBX bit vector of length greater + * than 5, and EBX[5] clear. + */ +static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry) +{ + union cpuid10_eax eax = { .full = entry->eax }; + union cpuid10_ebx ebx = { .full = entry->ebx }; + + return eax.split.version_id && eax.split.num_counters > 0 && + eax.split.mask_length > ARCH_PERFMON_BRANCHES_RETIRED && + !ebx.split.no_branch_instruction_retired; +} + +/* + * Note that CPUID leaf 0xa is Intel-specific. This leaf should be + * clear on AMD hardware. + */ +static bool use_intel_pmu(void) +{ + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid2 *cpuid; + + cpuid = kvm_get_supported_cpuid(); + entry = kvm_get_supported_cpuid_index(0xa, 0); + return is_intel_cpu() && entry && check_intel_pmu_leaf(entry); +} + +static bool is_zen1(uint32_t eax) +{ + return x86_family(eax) == 0x17 && x86_model(eax) <= 0x0f; +} + +static bool is_zen2(uint32_t eax) +{ + return x86_family(eax) == 0x17 && + x86_model(eax) >= 0x30 && x86_model(eax) <= 0x3f; +} + +static bool is_zen3(uint32_t eax) +{ + return x86_family(eax) == 0x19 && x86_model(eax) <= 0x0f; +} + +/* + * Determining AMD support for a PMU event requires consulting the AMD + * PPR for the CPU or reference material derived therefrom. The AMD + * test code herein has been verified to work on Zen1, Zen2, and Zen3. + * + * Feel free to add more AMD CPUs that are documented to support event + * select 0xc2 umask 0 as "retired branch instructions." + */ +static bool use_amd_pmu(void) +{ + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid2 *cpuid; + + cpuid = kvm_get_supported_cpuid(); + entry = kvm_get_supported_cpuid_index(1, 0); + return is_amd_cpu() && entry && + (is_zen1(entry->eax) || + is_zen2(entry->eax) || + is_zen3(entry->eax)); +} + +int main(int argc, char *argv[]) +{ + void (*guest_code)(void) = NULL; + struct kvm_vm *vm; + int r; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + r = kvm_check_cap(KVM_CAP_PMU_EVENT_FILTER); + if (!r) { + print_skip("KVM_CAP_PMU_EVENT_FILTER not supported"); + exit(KSFT_SKIP); + } + + if (use_intel_pmu()) + guest_code = intel_guest_code; + else if (use_amd_pmu()) + guest_code = amd_guest_code; + + if (!guest_code) { + print_skip("Don't know how to test this guest PMU"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + if (!sanity_check_pmu(vm)) { + print_skip("Guest PMU is not functional"); + exit(KSFT_SKIP); + } + + test_without_filter(vm); + test_member_deny_list(vm); + test_member_allow_list(vm); + test_not_member_deny_list(vm); + test_not_member_allow_list(vm); + + kvm_vm_free(vm); + + return 0; +} -- cgit v1.2.3-58-ga151 From fc4fad79fc3d8841562e2a85808079da5b4835f6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 28 Dec 2021 23:24:36 +0000 Subject: KVM: VMX: Reject KVM_RUN if emulation is required with pending exception Reject KVM_RUN if emulation is required (because VMX is running without unrestricted guest) and an exception is pending, as KVM doesn't support emulating exceptions except when emulating real mode via vm86. The vCPU is hosed either way, but letting KVM_RUN proceed triggers a WARN due to the impossible condition. Alternatively, the WARN could be removed, but then userspace and/or KVM bugs would result in the vCPU silently running in a bad state, which isn't very friendly to users. Originally, the bug was hit by syzkaller with a nested guest as that doesn't require kvm_intel.unrestricted_guest=0. That particular flavor is likely fixed by commit cd0e615c49e5 ("KVM: nVMX: Synthesize TRIPLE_FAULT for L2 if emulation is required"), but it's trivial to trigger the WARN with a non-nested guest, and userspace can likely force bad state via ioctls() for a nested guest as well. Checking for the impossible condition needs to be deferred until KVM_RUN because KVM can't force specific ordering between ioctls. E.g. clearing exception.pending in KVM_SET_SREGS doesn't prevent userspace from setting it in KVM_SET_VCPU_EVENTS, and disallowing KVM_SET_VCPU_EVENTS with emulation_required would prevent userspace from queuing an exception and then stuffing sregs. Note, if KVM were to try and detect/prevent the condition prior to KVM_RUN, handle_invalid_guest_state() and/or handle_emulation_failure() would need to be modified to clear the pending exception prior to exiting to userspace. ------------[ cut here ]------------ WARNING: CPU: 6 PID: 137812 at arch/x86/kvm/vmx/vmx.c:1623 vmx_queue_exception+0x14f/0x160 [kvm_intel] CPU: 6 PID: 137812 Comm: vmx_invalid_nes Not tainted 5.15.2-7cc36c3e14ae-pop #279 Hardware name: ASUS Q87M-E/Q87M-E, BIOS 1102 03/03/2014 RIP: 0010:vmx_queue_exception+0x14f/0x160 [kvm_intel] Code: <0f> 0b e9 fd fe ff ff 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 RSP: 0018:ffffa45c83577d38 EFLAGS: 00010202 RAX: 0000000000000003 RBX: 0000000080000006 RCX: 0000000000000006 RDX: 0000000000000000 RSI: 0000000000010002 RDI: ffff9916af734000 RBP: ffff9916af734000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000006 R13: 0000000000000000 R14: ffff9916af734038 R15: 0000000000000000 FS: 00007f1e1a47c740(0000) GS:ffff99188fb80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1e1a6a8008 CR3: 000000026f83b005 CR4: 00000000001726e0 Call Trace: kvm_arch_vcpu_ioctl_run+0x13a2/0x1f20 [kvm] kvm_vcpu_ioctl+0x279/0x690 [kvm] __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Reported-by: syzbot+82112403ace4cbd780d8@syzkaller.appspotmail.com Signed-off-by: Sean Christopherson Message-Id: <20211228232437.1875318-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 22 ++++++++++++++++++++-- arch/x86/kvm/x86.c | 12 +++++++++--- 5 files changed, 37 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index f658bb4dbb74..65c3f1857d6e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush) KVM_X86_OP_NULL(tlb_remote_flush_with_range) KVM_X86_OP(tlb_flush_gva) KVM_X86_OP(tlb_flush_guest) +KVM_X86_OP(vcpu_pre_run) KVM_X86_OP(run) KVM_X86_OP_NULL(handle_exit) KVM_X86_OP_NULL(skip_emulated_instruction) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 89d1fdb39c46..6a8fa26ef98c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1380,6 +1380,7 @@ struct kvm_x86_ops { */ void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d8cac84fb2dc..bf0c3a67d836 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3829,6 +3829,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(vcpu); } +static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + return 1; +} + static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && @@ -4658,6 +4663,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .tlb_flush_gva = svm_flush_tlb_gva, .tlb_flush_guest = svm_flush_tlb, + .vcpu_pre_run = svm_vcpu_pre_run, .run = svm_vcpu_run, .handle_exit = handle_exit, .skip_emulated_instruction = skip_emulated_instruction, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 15e30602782b..41aaa37d9eb8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5426,6 +5426,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) return 1; } +static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + return vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending; +} + static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5445,8 +5453,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (!kvm_emulate_instruction(vcpu, 0)) return 0; - if (vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending) { + if (vmx_emulation_required_with_pending_exception(vcpu)) { kvm_prepare_emulation_failure_exit(vcpu); return 0; } @@ -5468,6 +5475,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) return 1; } +static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) +{ + if (vmx_emulation_required_with_pending_exception(vcpu)) { + kvm_prepare_emulation_failure_exit(vcpu); + return 0; + } + + return 1; +} + static void grow_ple_window(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7708,6 +7725,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_guest = vmx_flush_tlb_guest, + .vcpu_pre_run = vmx_vcpu_pre_run, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, .skip_emulated_instruction = vmx_skip_emulated_instruction, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 52df8e6eaa57..49ff85e966d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10393,10 +10393,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - if (kvm_run->immediate_exit) + if (kvm_run->immediate_exit) { r = -EINTR; - else - r = vcpu_run(vcpu); + goto out; + } + + r = static_call(kvm_x86_vcpu_pre_run)(vcpu); + if (r <= 0) + goto out; + + r = vcpu_run(vcpu); out: kvm_put_guest_fpu(vcpu); -- cgit v1.2.3-58-ga151 From e337f7e063641ca4d040c8210d4bd790b81effb0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 28 Dec 2021 23:24:37 +0000 Subject: KVM: selftests: Add a test to force emulation with a pending exception Add a VMX specific test to verify that KVM doesn't explode if userspace attempts KVM_RUN when emulation is required with a pending exception. KVM VMX's emulation support for !unrestricted_guest punts exceptions to userspace instead of attempting to synthesize the exception with all the correct state (and stack switching, etc...). Punting is acceptable as there's never been a request to support injecting exceptions when emulating due to invalid state, but KVM has historically assumed that userspace will do the right thing and either clear the exception or kill the guest. Deliberately do the opposite and attempt to re-enter the guest with a pending exception and emulation required to verify KVM continues to punt the combination to userspace, e.g. doesn't explode, WARN, etc... Signed-off-by: Sean Christopherson Message-Id: <20211228232437.1875318-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../vmx_exception_with_invalid_guest_state.c | 139 +++++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 9fe19f412f44..c2165b137a50 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -37,6 +37,7 @@ /x86_64/vmx_apic_access_test /x86_64/vmx_close_while_nested_test /x86_64/vmx_dirty_log_test +/x86_64/vmx_exception_with_invalid_guest_state /x86_64/vmx_invalid_nested_guest_state /x86_64/vmx_preemption_timer_test /x86_64/vmx_set_nested_state_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 7fbc80e3ecdf..81ebf99d6ff0 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -70,6 +70,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test diff --git a/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c new file mode 100644 index 000000000000..27a850f3d7ce --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/vmx_exception_with_invalid_guest_state.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#include +#include +#include +#include + +#include "kselftest.h" + +#define VCPU_ID 0 + +static struct kvm_vm *vm; + +static void guest_ud_handler(struct ex_regs *regs) +{ + /* Loop on the ud2 until guest state is made invalid. */ +} + +static void guest_code(void) +{ + asm volatile("ud2"); +} + +static void __run_vcpu_with_invalid_state(void) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + + vcpu_run(vm, VCPU_ID); + + TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR, + "Expected KVM_EXIT_INTERNAL_ERROR, got %d (%s)\n", + run->exit_reason, exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->emulation_failure.suberror == KVM_INTERNAL_ERROR_EMULATION, + "Expected emulation failure, got %d\n", + run->emulation_failure.suberror); +} + +static void run_vcpu_with_invalid_state(void) +{ + /* + * Always run twice to verify KVM handles the case where _KVM_ queues + * an exception with invalid state and then exits to userspace, i.e. + * that KVM doesn't explode if userspace ignores the initial error. + */ + __run_vcpu_with_invalid_state(); + __run_vcpu_with_invalid_state(); +} + +static void set_timer(void) +{ + struct itimerval timer; + + timer.it_value.tv_sec = 0; + timer.it_value.tv_usec = 200; + timer.it_interval = timer.it_value; + ASSERT_EQ(setitimer(ITIMER_REAL, &timer, NULL), 0); +} + +static void set_or_clear_invalid_guest_state(bool set) +{ + static struct kvm_sregs sregs; + + if (!sregs.cr0) + vcpu_sregs_get(vm, VCPU_ID, &sregs); + sregs.tr.unusable = !!set; + vcpu_sregs_set(vm, VCPU_ID, &sregs); +} + +static void set_invalid_guest_state(void) +{ + set_or_clear_invalid_guest_state(true); +} + +static void clear_invalid_guest_state(void) +{ + set_or_clear_invalid_guest_state(false); +} + +static void sigalrm_handler(int sig) +{ + struct kvm_vcpu_events events; + + TEST_ASSERT(sig == SIGALRM, "Unexpected signal = %d", sig); + + vcpu_events_get(vm, VCPU_ID, &events); + + /* + * If an exception is pending, attempt KVM_RUN with invalid guest, + * otherwise rearm the timer and keep doing so until the timer fires + * between KVM queueing an exception and re-entering the guest. + */ + if (events.exception.pending) { + set_invalid_guest_state(); + run_vcpu_with_invalid_state(); + } else { + set_timer(); + } +} + +int main(int argc, char *argv[]) +{ + if (!is_intel_cpu() || vm_is_unrestricted_guest(NULL)) { + print_skip("Must be run with kvm_intel.unrestricted_guest=0"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, (void *)guest_code); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + + /* + * Stuff invalid guest state for L2 by making TR unusuable. The next + * KVM_RUN should induce a TRIPLE_FAULT in L2 as KVM doesn't support + * emulating invalid guest state for L2. + */ + set_invalid_guest_state(); + run_vcpu_with_invalid_state(); + + /* + * Verify KVM also handles the case where userspace gains control while + * an exception is pending and stuffs invalid state. Run with valid + * guest state and a timer firing every 200us, and attempt to enter the + * guest with invalid state when the handler interrupts KVM with an + * exception pending. + */ + clear_invalid_guest_state(); + TEST_ASSERT(signal(SIGALRM, sigalrm_handler) != SIG_ERR, + "Failed to register SIGALRM handler, errno = %d (%s)", + errno, strerror(errno)); + + set_timer(); + run_vcpu_with_invalid_state(); +} -- cgit v1.2.3-58-ga151 From e09fccb5435d7b9ab3fd5dfeada8ae40cfa56e08 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 13 Jan 2022 13:29:24 +0100 Subject: KVM: avoid warning on s390 in mark_page_dirty Avoid warnings on s390 like [ 1801.980931] CPU: 12 PID: 117600 Comm: kworker/12:0 Tainted: G E 5.17.0-20220113.rc0.git0.32ce2abb03cf.300.fc35.s390x+next #1 [ 1801.980938] Workqueue: events irqfd_inject [kvm] [...] [ 1801.981057] Call Trace: [ 1801.981060] [<000003ff805f0f5c>] mark_page_dirty_in_slot+0xa4/0xb0 [kvm] [ 1801.981083] [<000003ff8060e9fe>] adapter_indicators_set+0xde/0x268 [kvm] [ 1801.981104] [<000003ff80613c24>] set_adapter_int+0x64/0xd8 [kvm] [ 1801.981124] [<000003ff805fb9aa>] kvm_set_irq+0xc2/0x130 [kvm] [ 1801.981144] [<000003ff805f8d86>] irqfd_inject+0x76/0xa0 [kvm] [ 1801.981164] [<0000000175e56906>] process_one_work+0x1fe/0x470 [ 1801.981173] [<0000000175e570a4>] worker_thread+0x64/0x498 [ 1801.981176] [<0000000175e5ef2c>] kthread+0x10c/0x110 [ 1801.981180] [<0000000175de73c8>] __ret_from_fork+0x40/0x58 [ 1801.981185] [<000000017698440a>] ret_from_fork+0xa/0x40 when writing to a guest from an irqfd worker as long as we do not have the dirty ring. Signed-off-by: Christian Borntraeger Reluctantly-acked-by: David Woodhouse Message-Id: <20220113122924.740496-1-borntraeger@linux.ibm.com> Fixes: 2efd61a608b0 ("KVM: Warn if mark_page_dirty() is called without an active vCPU") Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6e8e9d36f382..dc6d1823a9d9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3163,8 +3163,10 @@ void mark_page_dirty_in_slot(struct kvm *kvm, { struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); +#ifdef CONFIG_HAVE_KVM_DIRTY_RING if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm)) return; +#endif if (memslot && kvm_slot_dirty_track_enabled(memslot)) { unsigned long rel_gfn = gfn - memslot->base_gfn; -- cgit v1.2.3-58-ga151 From d76fb40637fc0e84b27bf431cd72cf8fe3f813ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:14 +0000 Subject: KVM: VMX: Handle PI descriptor updates during vcpu_put/load Move the posted interrupt pre/post_block logic into vcpu_put/load respectively, using the kvm_vcpu_is_blocking() to determining whether or not the wakeup handler needs to be set (and unset). This avoids updating the PI descriptor if halt-polling is successful, reduces the number of touchpoints for updating the descriptor, and eliminates the confusing behavior of intentionally leaving a "stale" PI.NDST when a blocking vCPU is scheduled back in after preemption. The downside is that KVM will do the PID update twice if the vCPU is preempted after prepare_to_rcuwait() but before schedule(), but that's a rare case (and non-existent on !PREEMPT kernels). The notable wart is the need to send a self-IPI on the wakeup vector if an outstanding notification is pending after configuring the wakeup vector. Ideally, KVM would just do a kvm_vcpu_wake_up() in this case, but the scheduler doesn't support waking a task from its preemption notifier callback, i.e. while the task is right in the middle of being scheduled out. Note, setting the wakeup vector before halt-polling is not necessary: once the pending IRQ will be recorded in the PIR, kvm_vcpu_has_events() will detect this (via kvm_cpu_get_interrupt(), kvm_apic_get_interrupt(), apic_has_interrupt_for_ppr() and finally vmx_sync_pir_to_irr()) and terminate the polling. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 150 ++++++++++++++++++----------------------- arch/x86/kvm/vmx/posted_intr.h | 8 ++- arch/x86/kvm/vmx/vmx.c | 5 -- 3 files changed, 70 insertions(+), 93 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index a82a15aa1982..63e2399f353a 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -52,6 +52,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct pi_desc old, new; + unsigned long flags; unsigned int dest; /* @@ -62,23 +63,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!enable_apicv || !lapic_in_kernel(vcpu)) return; - /* Nothing to do if PI.SN and PI.NDST both have the desired value. */ - if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) + /* + * If the vCPU wasn't on the wakeup list and wasn't migrated, then the + * full update can be skipped as neither the vector nor the destination + * needs to be changed. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { + /* + * Clear SN if it was set due to being preempted. Again, do + * this even if there is no assigned device for simplicity. + */ + if (pi_test_and_clear_sn(pi_desc)) + goto after_clear_sn; return; + } + + local_irq_save(flags); /* - * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change - * PI.NDST: pi_post_block is the one expected to change PID.NDST and the - * wakeup handler expects the vCPU to be on the blocked_vcpu_list that - * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up - * correctly. + * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup + * list of the _previous_ pCPU, which will not be the same as the + * current pCPU if the task was migrated. */ - if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { - pi_clear_sn(pi_desc); - goto after_clear_sn; + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { + raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + list_del(&vcpu->blocked_vcpu_list); + raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); } - /* The full case. Set the new destination and clear SN. */ dest = cpu_physical_id(cpu); if (!x2apic_mode) dest = (dest << 8) & 0xFF00; @@ -86,10 +98,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) do { old.control = new.control = READ_ONCE(pi_desc->control); + /* + * Clear SN (as above) and refresh the destination APIC ID to + * handle task migration (@cpu != vcpu->cpu). + */ new.ndst = dest; new.sn = 0; + + /* + * Restore the notification vector; in the blocking case, the + * descriptor was modified on "put" to use the wakeup vector. + */ + new.nv = POSTED_INTR_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); + local_irq_restore(flags); + after_clear_sn: /* @@ -111,83 +135,24 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) irq_remapping_cap(IRQ_POSTING_CAP); } -void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - - if (!vmx_can_use_vtd_pi(vcpu->kvm)) - return; - - /* Set SN when the vCPU is preempted */ - if (vcpu->preempted) - pi_set_sn(pi_desc); -} - -static void __pi_post_block(struct kvm_vcpu *vcpu) -{ - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - struct pi_desc old, new; - unsigned int dest; - - /* - * Remove the vCPU from the wakeup list of the _previous_ pCPU, which - * will not be the same as the current pCPU if the task was migrated. - */ - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - list_del(&vcpu->blocked_vcpu_list); - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); - - dest = cpu_physical_id(vcpu->cpu); - if (!x2apic_mode) - dest = (dest << 8) & 0xFF00; - - WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR, - "Wakeup handler not enabled while the vCPU was blocking"); - - do { - old.control = new.control = READ_ONCE(pi_desc->control); - - new.ndst = dest; - - /* set 'NV' to 'notification vector' */ - new.nv = POSTED_INTR_VECTOR; - } while (pi_try_set_control(pi_desc, old.control, new.control)); - - vcpu->pre_pcpu = -1; -} - /* - * This routine does the following things for vCPU which is going - * to be blocked if VT-d PI is enabled. - * - Store the vCPU to the wakeup list, so when interrupts happen - * we can find the right vCPU to wake up. - * - Change the Posted-interrupt descriptor as below: - * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR - * - If 'ON' is set during this process, which means at least one - * interrupt is posted for this vCPU, we cannot block it, in - * this case, return 1, otherwise, return 0. - * + * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set + * WAKEUP as the notification vector in the PI descriptor. */ -int pi_pre_block(struct kvm_vcpu *vcpu) +static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { - struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; unsigned long flags; - if (!vmx_can_use_vtd_pi(vcpu->kvm) || - vmx_interrupt_blocked(vcpu)) - return 0; - local_irq_save(flags); - vcpu->pre_pcpu = vcpu->cpu; raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); list_add_tail(&vcpu->blocked_vcpu_list, &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - WARN(pi_desc->sn == 1, - "Posted Interrupt Suppress Notification set before blocking"); + WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); do { old.control = new.control = READ_ONCE(pi_desc->control); @@ -196,24 +161,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu) new.nv = POSTED_INTR_WAKEUP_VECTOR; } while (pi_try_set_control(pi_desc, old.control, new.control)); - /* We should not block the vCPU if an interrupt is posted for it. */ - if (pi_test_on(pi_desc)) - __pi_post_block(vcpu); + /* + * Send a wakeup IPI to this CPU if an interrupt may have been posted + * before the notification vector was updated, in which case the IRQ + * will arrive on the non-wakeup vector. An IPI is needed as calling + * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not + * enabled until it is safe to call try_to_wake_up() on the task being + * scheduled out). + */ + if (pi_test_on(&new)) + apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); local_irq_restore(flags); - return (vcpu->pre_pcpu == -1); } -void pi_post_block(struct kvm_vcpu *vcpu) +void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { - unsigned long flags; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (vcpu->pre_pcpu == -1) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; - local_irq_save(flags); - __pi_post_block(vcpu); - local_irq_restore(flags); + if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) + pi_enable_wakeup_handler(vcpu); + + /* + * Set SN when the vCPU is preempted. Note, the vCPU can both be seen + * as blocking and preempted, e.g. if it's preempted between setting + * its wait state and manually scheduling out. + */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } /* @@ -254,7 +232,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) * Bail out of the block loop if the VM has an assigned * device, but the blocking vCPU didn't reconfigure the * PI.NV to the wakeup vector, i.e. the assigned device - * came along after the initial check in pi_pre_block(). + * came along after the initial check in vmx_vcpu_pi_put(). */ void vmx_pi_start_assignment(struct kvm *kvm) { diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 36ae035f14aa..eb14e76b84ef 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) +{ + return test_and_clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); @@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu); void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu); -int pi_pre_block(struct kvm_vcpu *vcpu); -void pi_post_block(struct kvm_vcpu *vcpu); void pi_wakeup_handler(void); void __init pi_init_cpu(int cpu); bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 41aaa37d9eb8..3a9a49a87b9d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7566,9 +7566,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) static int vmx_pre_block(struct kvm_vcpu *vcpu) { - if (pi_pre_block(vcpu)) - return 1; - if (kvm_lapic_hv_timer_in_use(vcpu)) kvm_lapic_switch_to_sw_timer(vcpu); @@ -7579,8 +7576,6 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) { if (kvm_x86_ops.set_hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); - - pi_post_block(vcpu); } static void vmx_setup_mce(struct kvm_vcpu *vcpu) -- cgit v1.2.3-58-ga151 From e6eec09b7bc7869a49ac0ff376415bad40030ade Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:15 +0000 Subject: KVM: Drop unused kvm_vcpu.pre_pcpu field Remove kvm_vcpu.pre_pcpu as it no longer has any users. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 1 - 2 files changed, 2 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3c47b146851a..5c3c67b6318f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -309,7 +309,6 @@ struct kvm_vcpu { u64 requests; unsigned long guest_debug; - int pre_pcpu; struct list_head blocked_vcpu_list; struct mutex mutex; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index dc6d1823a9d9..c301cd8d583e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -427,7 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) #endif kvm_async_pf_vcpu_init(vcpu); - vcpu->pre_pcpu = -1; INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); kvm_vcpu_set_in_spin_loop(vcpu, false); -- cgit v1.2.3-58-ga151 From 12a8eee5686ef3ea7d8db90cd664f11e4a39e349 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:16 +0000 Subject: KVM: Move x86 VMX's posted interrupt list_head to vcpu_vmx Move the seemingly generic block_vcpu_list from kvm_vcpu to vcpu_vmx, and rename the list and all associated variables to clarify that it tracks the set of vCPU that need to be poked on a posted interrupt to the wakeup vector. The list is not used to track _all_ vCPUs that are blocking, and the term "blocked" can be misleading as it may refer to a blocking condition in the host or the guest, where as the PI wakeup case is specifically for the vCPUs that are actively blocking from within the guest. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 39 ++++++++++++++++++++------------------- arch/x86/kvm/vmx/vmx.c | 2 ++ arch/x86/kvm/vmx/vmx.h | 3 +++ include/linux/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 2 -- 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 63e2399f353a..901eea44cf24 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -19,7 +19,7 @@ * wake the target vCPUs. vCPUs are removed from the list and the notification * vector is reset when the vCPU is scheduled in. */ -static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); /* * Protect the per-CPU list with a per-CPU spinlock to handle task migration. * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the @@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will * occur if a wakeup IRQ arrives and attempts to acquire the lock. */ -static DEFINE_PER_CPU(raw_spinlock_t, blocked_vcpu_on_cpu_lock); +static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { @@ -51,6 +51,7 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new) void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; unsigned long flags; unsigned int dest; @@ -86,9 +87,9 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - list_del(&vcpu->blocked_vcpu_list); - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_del(&vmx->pi_wakeup_list); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); } dest = cpu_physical_id(cpu); @@ -142,15 +143,16 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm) static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); struct pi_desc old, new; unsigned long flags; local_irq_save(flags); - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); - list_add_tail(&vcpu->blocked_vcpu_list, - &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu)); - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu)); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); + list_add_tail(&vmx->pi_wakeup_list, + &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); @@ -199,24 +201,23 @@ void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) */ void pi_wakeup_handler(void) { - struct kvm_vcpu *vcpu; int cpu = smp_processor_id(); + struct vcpu_vmx *vmx; - raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), - blocked_vcpu_list) { - struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); + list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu), + pi_wakeup_list) { - if (pi_test_on(pi_desc)) - kvm_vcpu_kick(vcpu); + if (pi_test_on(&vmx->pi_desc)) + kvm_vcpu_kick(&vmx->vcpu); } - raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } void __init pi_init_cpu(int cpu) { - INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); - raw_spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); + raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3a9a49a87b9d..1840898bb184 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6943,6 +6943,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); vmx = to_vmx(vcpu); + INIT_LIST_HEAD(&vmx->pi_wakeup_list); + err = -ENOMEM; vmx->vpid = allocate_vpid(); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index f8fc7441baea..7f2c82e7f38f 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -317,6 +317,9 @@ struct vcpu_vmx { /* Posted interrupt descriptor */ struct pi_desc pi_desc; + /* Used if this vCPU is waiting for PI notification wakeup. */ + struct list_head pi_wakeup_list; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c3c67b6318f..f079820f52b5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -309,8 +309,6 @@ struct kvm_vcpu { u64 requests; unsigned long guest_debug; - struct list_head blocked_vcpu_list; - struct mutex mutex; struct kvm_run *run; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c301cd8d583e..5a1164483e6c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -427,8 +427,6 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) #endif kvm_async_pf_vcpu_init(vcpu); - INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); - kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); vcpu->preempted = false; -- cgit v1.2.3-58-ga151 From 98c25ead5eda5e9d41abe57839ad3e8caf19500c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:17 +0000 Subject: KVM: VMX: Move preemption timer <=> hrtimer dance to common x86 Handle the switch to/from the hypervisor/software timer when a vCPU is blocking in common x86 instead of in VMX. Even though VMX is the only user of a hypervisor timer, the logic and all functions involved are generic x86 (unless future CPUs do something completely different and implement a hypervisor timer that runs regardless of mode). Handling the switch in common x86 will allow for the elimination of the pre/post_blocks hooks, and also lets KVM switch back to the hypervisor timer if and only if it was in use (without additional params). Add a comment explaining why the switch cannot be deferred to kvm_sched_out() or kvm_vcpu_block(). Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 6 +----- arch/x86/kvm/x86.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1840898bb184..70be166833ac 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7568,16 +7568,12 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) static int vmx_pre_block(struct kvm_vcpu *vcpu) { - if (kvm_lapic_hv_timer_in_use(vcpu)) - kvm_lapic_switch_to_sw_timer(vcpu); - return 0; } static void vmx_post_block(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops.set_hv_timer) - kvm_lapic_switch_to_hv_timer(vcpu); + } static void vmx_setup_mce(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 49ff85e966d7..943706a09e6e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10146,8 +10146,21 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { + bool hv_timer; + if (!kvm_arch_vcpu_runnable(vcpu) && (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { + /* + * Switch to the software timer before halt-polling/blocking as + * the guest's timer may be a break event for the vCPU, and the + * hypervisor timer runs only when the CPU is in guest mode. + * Switch before halt-polling so that KVM recognizes an expired + * timer before blocking. + */ + hv_timer = kvm_lapic_hv_timer_in_use(vcpu); + if (hv_timer) + kvm_lapic_switch_to_sw_timer(vcpu); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) kvm_vcpu_halt(vcpu); @@ -10155,6 +10168,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + if (kvm_x86_ops.post_block) static_call(kvm_x86_post_block)(vcpu); @@ -10349,6 +10365,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) r = -EINTR; goto out; } + /* + * It should be impossible for the hypervisor timer to be in + * use before KVM has ever run the vCPU. + */ + WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); kvm_vcpu_block(vcpu); if (kvm_apic_accept_events(vcpu) < 0) { r = 0; -- cgit v1.2.3-58-ga151 From b6d42baddf85310eb2c455cf78af2580773c4ff0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:18 +0000 Subject: KVM: x86: Unexport LAPIC's switch_to_{hv,sw}_timer() helpers Unexport switch_to_{hv,sw}_timer() now that common x86 handles the transitions. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-9-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c5028e6b0f96..baca9fa37a91 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { restart_apic_timer(vcpu->arch.apic); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) { @@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) start_sw_timer(apic); preempt_enable(); } -EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu) { -- cgit v1.2.3-58-ga151 From c3e8abf0f3536a46a235b0533149c2b2c2bbac27 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:19 +0000 Subject: KVM: x86: Remove defunct pre_block/post_block kvm_x86_ops hooks Drop kvm_x86_ops' pre/post_block() now that all implementations are nops. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-10-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm-x86-ops.h | 2 -- arch/x86/include/asm/kvm_host.h | 12 ------------ arch/x86/kvm/vmx/vmx.c | 13 ------------- arch/x86/kvm/x86.c | 6 +----- 4 files changed, 1 insertion(+), 32 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 65c3f1857d6e..631d5040b31e 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -99,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff) KVM_X86_OP_NULL(request_immediate_exit) KVM_X86_OP(sched_in) KVM_X86_OP_NULL(update_cpu_dirty_logging) -KVM_X86_OP_NULL(pre_block) -KVM_X86_OP_NULL(post_block) KVM_X86_OP_NULL(vcpu_blocking) KVM_X86_OP_NULL(vcpu_unblocking) KVM_X86_OP_NULL(update_pi_irte) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6a8fa26ef98c..682ad02a4e58 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1454,18 +1454,6 @@ struct kvm_x86_ops { const struct kvm_pmu_ops *pmu_ops; const struct kvm_x86_nested_ops *nested_ops; - /* - * Architecture specific hooks for vCPU blocking due to - * HLT instruction. - * Returns for .pre_block(): - * - 0 means continue to block the vCPU. - * - 1 means we cannot block the vCPU since some event - * happens during this period, such as, 'ON' bit in - * posted-interrupts descriptor is set. - */ - int (*pre_block)(struct kvm_vcpu *vcpu); - void (*post_block)(struct kvm_vcpu *vcpu); - void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 70be166833ac..9ab98e1bd65a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7566,16 +7566,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); } -static int vmx_pre_block(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static void vmx_post_block(struct kvm_vcpu *vcpu) -{ - -} - static void vmx_setup_mce(struct kvm_vcpu *vcpu) { if (vcpu->arch.mcg_cap & MCG_LMCE_P) @@ -7777,9 +7767,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .cpu_dirty_log_size = PML_ENTITY_NUM, .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - .pmu_ops = &intel_pmu_ops, .nested_ops = &vmx_nested_ops, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 943706a09e6e..4b9728a53de6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10148,8 +10148,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { bool hv_timer; - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) { + if (!kvm_arch_vcpu_runnable(vcpu)) { /* * Switch to the software timer before halt-polling/blocking as * the guest's timer may be a break event for the vCPU, and the @@ -10171,9 +10170,6 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) if (hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); - if (kvm_x86_ops.post_block) - static_call(kvm_x86_post_block)(vcpu); - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } -- cgit v1.2.3-58-ga151 From 31f251d4ddfa464c6dd92ee873b9b223e992a085 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:20 +0000 Subject: KVM: SVM: Signal AVIC doorbell iff vCPU is in guest mode Signal the AVIC doorbell iff the vCPU is running in the guest. If the vCPU is not IN_GUEST_MODE, it's guaranteed to pick up any pending IRQs on the next VMRUN, which unconditionally processes the vIRR. Add comments to document the logic. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-11-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 0e5b49294086..40039477bebb 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -672,9 +672,22 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) return -1; kvm_lapic_set_irr(vec, vcpu->arch.apic); + + /* + * Pairs with the smp_mb_*() after setting vcpu->guest_mode in + * vcpu_enter_guest() to ensure the write to the vIRR is ordered before + * the read of guest_mode, which guarantees that either VMRUN will see + * and process the new vIRR entry, or that the below code will signal + * the doorbell if the vCPU is already running in the guest. + */ smp_mb__after_atomic(); - if (avic_vcpu_is_running(vcpu)) { + /* + * Signal the doorbell to tell hardware to inject the IRQ if the vCPU + * is in the guest. If the vCPU is not in the guest, hardware will + * automatically process AVIC interrupts at VMRUN. + */ + if (vcpu->mode == IN_GUEST_MODE) { int cpu = READ_ONCE(vcpu->cpu); /* @@ -688,8 +701,13 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) if (cpu != get_cpu()) wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); put_cpu(); - } else + } else { + /* + * Wake the vCPU if it was blocking. KVM will then detect the + * pending IRQ when checking if the vCPU has a wake event. + */ kvm_vcpu_wake_up(vcpu); + } return 0; } -- cgit v1.2.3-58-ga151 From 202470d536b2cad22fa859f3e01202571c49ded9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:21 +0000 Subject: KVM: SVM: Don't bother checking for "running" AVIC when kicking for IPIs Drop the avic_vcpu_is_running() check when waking vCPUs in response to a VM-Exit due to incomplete IPI delivery. The check isn't wrong per se, but it's not 100% accurate in the sense that it doesn't guarantee that the vCPU was one of the vCPUs that didn't receive the IPI. The check isn't required for correctness as blocking == !running in this context. From a performance perspective, waking a live task is not expensive as the only moderately costly operation is a locked operation to temporarily disable preemption. And if that is indeed a performance issue, kvm_vcpu_is_blocking() would be a better check than poking into the AVIC. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-12-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 15 +++++++++------ arch/x86/kvm/svm/svm.h | 11 ----------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 40039477bebb..2d8278167c0f 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -295,13 +295,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, struct kvm_vcpu *vcpu; unsigned long i; + /* + * Wake any target vCPUs that are blocking, i.e. waiting for a wake + * event. There's no need to signal doorbells, as hardware has handled + * vCPUs that were in guest at the time of the IPI, and vCPUs that have + * since entered the guest will have processed pending IRQs at VMRUN. + */ kvm_for_each_vcpu(i, vcpu, kvm) { - bool m = kvm_apic_match_dest(vcpu, source, - icrl & APIC_SHORT_MASK, - GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK); - - if (m && !avic_vcpu_is_running(vcpu)) + if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & APIC_DEST_MASK)) kvm_vcpu_wake_up(vcpu); } } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6d29421dd4bf..fd0685b2ce55 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -573,17 +573,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 *entry = svm->avic_physical_id_cache; - - if (!entry) - return false; - - return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); -} - int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); -- cgit v1.2.3-58-ga151 From e422b88969489e67fd1a87a6ef4ef5c30bb53edb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:22 +0000 Subject: KVM: SVM: Remove unnecessary APICv/AVIC update in vCPU unblocking path Remove handling of KVM_REQ_APICV_UPDATE from svm_vcpu_unblocking(), it's no longer needed as it was made obsolete by commit df7e4827c549 ("KVM: SVM: call avic_vcpu_load/avic_vcpu_put when enabling/disabling AVIC"). Prior to that commit, the manual check was necessary to ensure the AVIC stuff was updated by avic_set_running() when a request to enable APICv became pending while the vCPU was blocking, as the request handling itself would not do the update. But, as evidenced by the commit, that logic was flawed and subject to various races. Now that svm_refresh_apicv_exec_ctrl() does avic_vcpu_load/put() in response to an APICv status change, drop the manual check in the unblocking path. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-13-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 2d8278167c0f..7efe034c44c1 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1040,7 +1040,5 @@ void svm_vcpu_blocking(struct kvm_vcpu *vcpu) void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) { - if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) - kvm_vcpu_update_apicv(vcpu); avic_set_running(vcpu, true); } -- cgit v1.2.3-58-ga151 From af52f5aa5c1b46809834b728a13a1af5aab451e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:23 +0000 Subject: KVM: SVM: Use kvm_vcpu_is_blocking() in AVIC load to handle preemption Use kvm_vcpu_is_blocking() to determine whether or not the vCPU should be marked running during avic_vcpu_load(). Drop avic_is_running, which really should have been named "vcpu_is_not_blocking", as it tracked if the vCPU was blocking, not if it was actually running, e.g. it was set during svm_create_vcpu() when the vCPU was obviously not running. This is technically a teeny tiny functional change, as the vCPU will be marked IsRunning=1 on being reloaded if the vCPU is preempted between svm_vcpu_blocking() and prepare_to_rcuwait(). But that's a benign change as the vCPU will be marked IsRunning=0 when KVM voluntarily schedules out the vCPU. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-14-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 14 +++++++++----- arch/x86/kvm/svm/svm.c | 6 ------ arch/x86/kvm/svm/svm.h | 1 - 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 7efe034c44c1..368813320121 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -975,6 +975,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; /* ID = 0xff (broadcast), ID > 0xff (reserved) */ + bool is_blocking = kvm_vcpu_is_blocking(vcpu); int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -992,12 +993,17 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (svm->avic_is_running) + + /* + * Don't mark the vCPU as running if its blocking, i.e. if the vCPU is + * preempted after svm_vcpu_blocking() but before KVM voluntarily + * schedules out the vCPU. + */ + if (!is_blocking) entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, - svm->avic_is_running); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, !is_blocking); } void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1018,11 +1024,9 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) */ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) { - struct vcpu_svm *svm = to_svm(vcpu); int cpu = get_cpu(); WARN_ON(cpu != vcpu->cpu); - svm->avic_is_running = is_run; if (kvm_vcpu_apicv_active(vcpu)) { if (is_run) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bf0c3a67d836..d21ef0381d29 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1440,12 +1440,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) if (err) goto error_free_vmsa_page; - /* We initialize this flag to true to make sure that the is_running - * bit would be set the first time the vcpu is loaded. - */ - if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) - svm->avic_is_running = true; - svm->msrpm = svm_vcpu_alloc_msrpm(); if (!svm->msrpm) { err = -ENOMEM; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index fd0685b2ce55..5178f6b245e1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -225,7 +225,6 @@ struct vcpu_svm { u32 dfr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; - bool avic_is_running; /* * Per-vcpu list of struct amd_svm_iommu_ir: -- cgit v1.2.3-58-ga151 From 782f64558de7bef84b90ea812deb38f0e53a8c7a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:24 +0000 Subject: KVM: SVM: Skip AVIC and IRTE updates when loading blocking vCPU Don't bother updating the Physical APIC table or IRTE when loading a vCPU that is blocking, i.e. won't be marked IsRun{ning}=1, as the pCPU is queried if and only if IsRunning is '1'. If the vCPU was migrated, the new pCPU will be picked up when avic_vcpu_load() is called by svm_vcpu_unblocking(). Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-15-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 368813320121..b7353e11da2e 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -975,7 +975,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; /* ID = 0xff (broadcast), ID > 0xff (reserved) */ - bool is_blocking = kvm_vcpu_is_blocking(vcpu); int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); @@ -986,24 +985,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) return; + /* + * No need to update anything if the vCPU is blocking, i.e. if the vCPU + * is being scheduled in after being preempted. The CPU entries in the + * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. + * If the vCPU was migrated, its new CPU value will be stuffed when the + * vCPU unblocks. + */ + if (kvm_vcpu_is_blocking(vcpu)) + return; + entry = READ_ONCE(*(svm->avic_physical_id_cache)); WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); - - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - - /* - * Don't mark the vCPU as running if its blocking, i.e. if the vCPU is - * preempted after svm_vcpu_blocking() but before KVM voluntarily - * schedules out the vCPU. - */ - if (!is_blocking) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; + entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); - avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, !is_blocking); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); } void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1012,8 +1012,12 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); entry = READ_ONCE(*(svm->avic_physical_id_cache)); - if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) - avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + + /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ + if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) + return; + + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); -- cgit v1.2.3-58-ga151 From 0f65a9d337676b966316db17374fbef910ab8e4a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:26 +0000 Subject: KVM: VMX: Don't do full kick when triggering posted interrupt "fails" Replace the full "kick" with just the "wake" in the fallback path when triggering a virtual interrupt via a posted interrupt fails because the guest is not IN_GUEST_MODE. If the guest transitions into guest mode between the check and the kick, then it's guaranteed to see the pending interrupt as KVM syncs the PIR to IRR (and onto GUEST_RVI) after setting IN_GUEST_MODE. Kicking the guest in this case is nothing more than an unnecessary VM-Exit (and host IRQ). Opportunistically update comments to explain the various ordering rules and barriers at play. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-17-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 9ab98e1bd65a..ac3e943ec2a4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3998,7 +3998,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, /* the PIR and ON have been set by L1. */ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_wake_up(vcpu); return 0; } return -1; @@ -4036,7 +4036,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) - kvm_vcpu_kick(vcpu); + kvm_vcpu_wake_up(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4b9728a53de6..55518b7d3b96 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9978,10 +9978,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_mb__after_srcu_read_unlock(); /* - * This handles the case where a posted interrupt was - * notified with kvm_vcpu_kick. Assigned devices can - * use the POSTED_INTR_VECTOR even if APICv is disabled, - * so do it even if APICv is disabled on this vCPU. + * Process pending posted interrupts to handle the case where the + * notification IRQ arrived in the host, or was never sent (because the + * target vCPU wasn't running). Do this regardless of the vCPU's APICv + * status, KVM doesn't update assigned devices when APICv is inhibited, + * i.e. they can post interrupts even if APICv is temporarily disabled. */ if (kvm_lapic_enabled(vcpu)) static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); -- cgit v1.2.3-58-ga151 From 296aa26644d088d8ccf0d62b0a93443f7188d5e5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:28 +0000 Subject: KVM: VMX: Pass desired vector instead of bool for triggering posted IRQ Refactor the posted interrupt helper to take the desired notification vector instead of a bool so that the callers are self-documenting. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-19-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ac3e943ec2a4..063c21559aa1 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3932,11 +3932,9 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, - bool nested) + int pi_vec) { #ifdef CONFIG_SMP - int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; - if (vcpu->mode == IN_GUEST_MODE) { /* * The vector of interrupt to be delivered to vcpu had @@ -3997,7 +3995,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, smp_mb__after_atomic(); /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR)) kvm_vcpu_wake_up(vcpu); return 0; } @@ -4035,7 +4033,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR)) kvm_vcpu_wake_up(vcpu); return 0; -- cgit v1.2.3-58-ga151 From ccf8d687542f6a7288b79727bec1cc084b3771b3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:29 +0000 Subject: KVM: VMX: Fold fallback path into triggering posted IRQ helper Move the fallback "wake_up" path into the helper to trigger posted interrupt helper now that the nested and non-nested paths are identical. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-20-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 063c21559aa1..a02a28ce7cc3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3931,7 +3931,7 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) pt_update_intercept_for_msr(vcpu); } -static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, +static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, int pi_vec) { #ifdef CONFIG_SMP @@ -3962,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, */ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); - return true; + return; } #endif - return false; + /* + * The vCPU isn't in the guest; wake the vCPU in case it is blocking, + * otherwise do nothing as KVM will grab the highest priority pending + * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). + */ + kvm_vcpu_wake_up(vcpu); } static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, @@ -3995,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, smp_mb__after_atomic(); /* the PIR and ON have been set by L1. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR)) - kvm_vcpu_wake_up(vcpu); + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); return 0; } return -1; @@ -4033,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. */ - if (!kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR)) - kvm_vcpu_wake_up(vcpu); - + kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); return 0; } -- cgit v1.2.3-58-ga151 From 635e6357f948d57bc98af8d37eb81896333822e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:30 +0000 Subject: KVM: VMX: Don't do full kick when handling posted interrupt wakeup When waking vCPUs in the posted interrupt wakeup handling, do exactly that and no more. There is no need to kick the vCPU as the wakeup handler just needs to get the vCPU task running, and if it's in the guest then it's definitely running. Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Message-Id: <20211208015236.1616697-21-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 901eea44cf24..aa1fe9085d77 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -209,7 +209,7 @@ void pi_wakeup_handler(void) pi_wakeup_list) { if (pi_test_on(&vmx->pi_desc)) - kvm_vcpu_kick(&vmx->vcpu); + kvm_vcpu_wake_up(&vmx->vcpu); } raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } -- cgit v1.2.3-58-ga151 From 935a7333958e91b5d0c1b0ebc75a5cefdbb34dd5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:31 +0000 Subject: KVM: SVM: Drop AVIC's intermediate avic_set_running() helper Drop avic_set_running() in favor of calling avic_vcpu_{load,put}() directly, and modify the block+put path to use preempt_disable/enable() instead of get/put_cpu(), as it doesn't actually care about the current pCPU associated with the vCPU. Opportunistically add lockdep assertions as being preempted in avic_vcpu_put() would lead to consuming stale data, even though doing so _in the current code base_ would not be fatal. Add a much needed comment explaining why svm_vcpu_blocking() needs to unload the AVIC and update the IRTE _before_ the vCPU starts blocking. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 56 +++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index b7353e11da2e..522f859408cb 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -978,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + /* * Since the host physical APIC id is 8 bits, * we can support host APIC ID upto 255. @@ -1011,6 +1013,8 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); + lockdep_assert_preemption_disabled(); + entry = READ_ONCE(*(svm->avic_physical_id_cache)); /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ @@ -1023,30 +1027,42 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -/* - * This function is called during VCPU halt/unhalt. - */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) -{ - int cpu = get_cpu(); - - WARN_ON(cpu != vcpu->cpu); - - if (kvm_vcpu_apicv_active(vcpu)) { - if (is_run) - avic_vcpu_load(vcpu, cpu); - else - avic_vcpu_put(vcpu); - } - put_cpu(); -} - void svm_vcpu_blocking(struct kvm_vcpu *vcpu) { - avic_set_running(vcpu, false); + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + preempt_disable(); + + /* + * Unload the AVIC when the vCPU is about to block, _before_ + * the vCPU actually blocks. + * + * Any IRQs that arrive before IsRunning=0 will not cause an + * incomplete IPI vmexit on the source, therefore vIRR will also + * be checked by kvm_vcpu_check_block() before blocking. The + * memory barrier implicit in set_current_state orders writing + * IsRunning=0 before reading the vIRR. The processor needs a + * matching memory barrier on interrupt delivery between writing + * IRR and reading IsRunning; the lack of this barrier might be + * the cause of errata #1235). + */ + avic_vcpu_put(vcpu); + + preempt_enable(); } void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) { - avic_set_running(vcpu, true); + int cpu; + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + + cpu = get_cpu(); + WARN_ON(cpu != vcpu->cpu); + + avic_vcpu_load(vcpu, cpu); + + put_cpu(); } -- cgit v1.2.3-58-ga151 From 54744e17f031cbc5c5b995b1e275df1520c8a739 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:32 +0000 Subject: KVM: SVM: Move svm_hardware_setup() and its helpers below svm_x86_ops Move svm_hardware_setup() below svm_x86_ops so that KVM can modify ops during setup, e.g. the vcpu_(un)blocking hooks can be nullified if AVIC is disabled or unsupported. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-23-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 467 +++++++++++++++++++++++++------------------------ 1 file changed, 234 insertions(+), 233 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d21ef0381d29..9b15d59e6344 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -869,47 +869,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) } } -/* - * The default MMIO mask is a single bit (excluding the present bit), - * which could conflict with the memory encryption bit. Check for - * memory encryption support and override the default MMIO mask if - * memory encryption is enabled. - */ -static __init void svm_adjust_mmio_mask(void) -{ - unsigned int enc_bit, mask_bit; - u64 msr, mask; - - /* If there is no memory encryption support, use existing mask */ - if (cpuid_eax(0x80000000) < 0x8000001f) - return; - - /* If memory encryption is not enabled, use existing mask */ - rdmsrl(MSR_AMD64_SYSCFG, msr); - if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) - return; - - enc_bit = cpuid_ebx(0x8000001f) & 0x3f; - mask_bit = boot_cpu_data.x86_phys_bits; - - /* Increment the mask bit if it is the same as the encryption bit */ - if (enc_bit == mask_bit) - mask_bit++; - - /* - * If the mask bit location is below 52, then some bits above the - * physical addressing limit will always be reserved, so use the - * rsvd_bits() function to generate the mask. This mask, along with - * the present bit, will be used to generate a page fault with - * PFER.RSV = 1. - * - * If the mask bit location is 52 (or above), then clear the mask. - */ - mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; - - kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); -} - static void svm_hardware_teardown(void) { int cpu; @@ -924,198 +883,6 @@ static void svm_hardware_teardown(void) iopm_base = 0; } -static __init void svm_set_cpu_caps(void) -{ - kvm_set_cpu_caps(); - - supported_xss = 0; - - /* CPUID 0x80000001 and 0x8000000A (SVM features) */ - if (nested) { - kvm_cpu_cap_set(X86_FEATURE_SVM); - - if (nrips) - kvm_cpu_cap_set(X86_FEATURE_NRIPS); - - if (npt_enabled) - kvm_cpu_cap_set(X86_FEATURE_NPT); - - if (tsc_scaling) - kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); - - /* Nested VM can receive #VMEXIT instead of triggering #GP */ - kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); - } - - /* CPUID 0x80000008 */ - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - - /* AMD PMU PERFCTR_CORE CPUID */ - if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); - - /* CPUID 0x8000001F (SME/SEV features) */ - sev_set_cpu_caps(); -} - -static __init int svm_hardware_setup(void) -{ - int cpu; - struct page *iopm_pages; - void *iopm_va; - int r; - unsigned int order = get_order(IOPM_SIZE); - - /* - * NX is required for shadow paging and for NPT if the NX huge pages - * mitigation is enabled. - */ - if (!boot_cpu_has(X86_FEATURE_NX)) { - pr_err_ratelimited("NX (Execute Disable) not supported\n"); - return -EOPNOTSUPP; - } - kvm_enable_efer_bits(EFER_NX); - - iopm_pages = alloc_pages(GFP_KERNEL, order); - - if (!iopm_pages) - return -ENOMEM; - - iopm_va = page_address(iopm_pages); - memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); - iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; - - init_msrpm_offsets(); - - supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - - if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) - kvm_enable_efer_bits(EFER_FFXSR); - - if (tsc_scaling) { - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - tsc_scaling = false; - } else { - pr_info("TSC scaling supported\n"); - kvm_has_tsc_control = true; - kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; - kvm_tsc_scaling_ratio_frac_bits = 32; - } - } - - tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); - - /* Check for pause filtering support */ - if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { - pause_filter_count = 0; - pause_filter_thresh = 0; - } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { - pause_filter_thresh = 0; - } - - if (nested) { - printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); - } - - /* - * KVM's MMU doesn't support using 2-level paging for itself, and thus - * NPT isn't supported if the host is using 2-level paging since host - * CR4 is unchanged on VMRUN. - */ - if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) - npt_enabled = false; - - if (!boot_cpu_has(X86_FEATURE_NPT)) - npt_enabled = false; - - /* Force VM NPT level equal to the host's paging level */ - kvm_configure_mmu(npt_enabled, get_npt_level(), - get_npt_level(), PG_LEVEL_1G); - pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); - - /* Note, SEV setup consumes npt_enabled. */ - sev_hardware_setup(); - - svm_hv_hardware_setup(); - - svm_adjust_mmio_mask(); - - for_each_possible_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err; - } - - if (nrips) { - if (!boot_cpu_has(X86_FEATURE_NRIPS)) - nrips = false; - } - - enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); - - if (enable_apicv) { - pr_info("AVIC enabled\n"); - - amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); - } - - if (vls) { - if (!npt_enabled || - !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || - !IS_ENABLED(CONFIG_X86_64)) { - vls = false; - } else { - pr_info("Virtual VMLOAD VMSAVE supported\n"); - } - } - - if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) - svm_gp_erratum_intercept = false; - - if (vgif) { - if (!boot_cpu_has(X86_FEATURE_VGIF)) - vgif = false; - else - pr_info("Virtual GIF supported\n"); - } - - if (lbrv) { - if (!boot_cpu_has(X86_FEATURE_LBRV)) - lbrv = false; - else - pr_info("LBR virtualization supported\n"); - } - - if (!enable_pmu) - pr_info("PMU virtualization is disabled\n"); - - svm_set_cpu_caps(); - - /* - * It seems that on AMD processors PTE's accessed bit is - * being set by the CPU hardware before the NPF vmexit. - * This is not expected behaviour and our tests fail because - * of it. - * A workaround here is to disable support for - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. - * In this case userspace can know if there is support using - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle - * it - * If future AMD CPU models change the behaviour described above, - * this variable can be changed accordingly - */ - allow_smaller_maxphyaddr = !npt_enabled; - - return 0; - -err: - svm_hardware_teardown(); - return r; -} - static void init_seg(struct vmcb_seg *seg) { seg->selector = 0; @@ -4738,6 +4505,240 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, }; +/* + * The default MMIO mask is a single bit (excluding the present bit), + * which could conflict with the memory encryption bit. Check for + * memory encryption support and override the default MMIO mask if + * memory encryption is enabled. + */ +static __init void svm_adjust_mmio_mask(void) +{ + unsigned int enc_bit, mask_bit; + u64 msr, mask; + + /* If there is no memory encryption support, use existing mask */ + if (cpuid_eax(0x80000000) < 0x8000001f) + return; + + /* If memory encryption is not enabled, use existing mask */ + rdmsrl(MSR_AMD64_SYSCFG, msr); + if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) + return; + + enc_bit = cpuid_ebx(0x8000001f) & 0x3f; + mask_bit = boot_cpu_data.x86_phys_bits; + + /* Increment the mask bit if it is the same as the encryption bit */ + if (enc_bit == mask_bit) + mask_bit++; + + /* + * If the mask bit location is below 52, then some bits above the + * physical addressing limit will always be reserved, so use the + * rsvd_bits() function to generate the mask. This mask, along with + * the present bit, will be used to generate a page fault with + * PFER.RSV = 1. + * + * If the mask bit location is 52 (or above), then clear the mask. + */ + mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; + + kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); +} + +static __init void svm_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + supported_xss = 0; + + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ + if (nested) { + kvm_cpu_cap_set(X86_FEATURE_SVM); + + if (nrips) + kvm_cpu_cap_set(X86_FEATURE_NRIPS); + + if (npt_enabled) + kvm_cpu_cap_set(X86_FEATURE_NPT); + + if (tsc_scaling) + kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); + + /* Nested VM can receive #VMEXIT instead of triggering #GP */ + kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); + } + + /* CPUID 0x80000008 */ + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* AMD PMU PERFCTR_CORE CPUID */ + if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + + /* CPUID 0x8000001F (SME/SEV features) */ + sev_set_cpu_caps(); +} + +static __init int svm_hardware_setup(void) +{ + int cpu; + struct page *iopm_pages; + void *iopm_va; + int r; + unsigned int order = get_order(IOPM_SIZE); + + /* + * NX is required for shadow paging and for NPT if the NX huge pages + * mitigation is enabled. + */ + if (!boot_cpu_has(X86_FEATURE_NX)) { + pr_err_ratelimited("NX (Execute Disable) not supported\n"); + return -EOPNOTSUPP; + } + kvm_enable_efer_bits(EFER_NX); + + iopm_pages = alloc_pages(GFP_KERNEL, order); + + if (!iopm_pages) + return -ENOMEM; + + iopm_va = page_address(iopm_pages); + memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); + iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + + init_msrpm_offsets(); + + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + + if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) + kvm_enable_efer_bits(EFER_FFXSR); + + if (tsc_scaling) { + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + tsc_scaling = false; + } else { + pr_info("TSC scaling supported\n"); + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; + } + } + + tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); + + /* Check for pause filtering support */ + if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { + pause_filter_count = 0; + pause_filter_thresh = 0; + } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { + pause_filter_thresh = 0; + } + + if (nested) { + printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); + } + + /* + * KVM's MMU doesn't support using 2-level paging for itself, and thus + * NPT isn't supported if the host is using 2-level paging since host + * CR4 is unchanged on VMRUN. + */ + if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) + npt_enabled = false; + + if (!boot_cpu_has(X86_FEATURE_NPT)) + npt_enabled = false; + + /* Force VM NPT level equal to the host's paging level */ + kvm_configure_mmu(npt_enabled, get_npt_level(), + get_npt_level(), PG_LEVEL_1G); + pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); + + /* Note, SEV setup consumes npt_enabled. */ + sev_hardware_setup(); + + svm_hv_hardware_setup(); + + svm_adjust_mmio_mask(); + + for_each_possible_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } + + if (nrips) { + if (!boot_cpu_has(X86_FEATURE_NRIPS)) + nrips = false; + } + + enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); + + if (enable_apicv) { + pr_info("AVIC enabled\n"); + + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } + + if (vls) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || + !IS_ENABLED(CONFIG_X86_64)) { + vls = false; + } else { + pr_info("Virtual VMLOAD VMSAVE supported\n"); + } + } + + if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) + svm_gp_erratum_intercept = false; + + if (vgif) { + if (!boot_cpu_has(X86_FEATURE_VGIF)) + vgif = false; + else + pr_info("Virtual GIF supported\n"); + } + + if (lbrv) { + if (!boot_cpu_has(X86_FEATURE_LBRV)) + lbrv = false; + else + pr_info("LBR virtualization supported\n"); + } + + if (!enable_pmu) + pr_info("PMU virtualization is disabled\n"); + + svm_set_cpu_caps(); + + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + + return 0; + +err: + svm_hardware_teardown(); + return r; +} + + static struct kvm_x86_init_ops svm_init_ops __initdata = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, -- cgit v1.2.3-58-ga151 From a3c19d5beaad25fcaa703b251c72c3a22fc09100 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 8 Dec 2021 01:52:33 +0000 Subject: KVM: SVM: Nullify vcpu_(un)blocking() hooks if AVIC is disabled Nullify svm_x86_ops.vcpu_(un)blocking if AVIC/APICv is disabled as the hooks are necessary only to clear the vCPU's IsRunning entry in the Physical APIC and to update IRTE entries if the VM has a pass-through device attached. Opportunistically rename the helpers to clarify their AVIC relationship. Signed-off-by: Sean Christopherson Message-Id: <20211208015236.1616697-24-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 4 ++-- arch/x86/kvm/svm/svm.c | 7 +++++-- arch/x86/kvm/svm/svm.h | 4 ++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 522f859408cb..90364d02f22a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -1027,7 +1027,7 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } -void svm_vcpu_blocking(struct kvm_vcpu *vcpu) +void avic_vcpu_blocking(struct kvm_vcpu *vcpu) { if (!kvm_vcpu_apicv_active(vcpu)) return; @@ -1052,7 +1052,7 @@ void svm_vcpu_blocking(struct kvm_vcpu *vcpu) preempt_enable(); } -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) { int cpu; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9b15d59e6344..6d31d357a83b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4391,8 +4391,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .vcpu_blocking = svm_vcpu_blocking, - .vcpu_unblocking = svm_vcpu_unblocking, + .vcpu_blocking = avic_vcpu_blocking, + .vcpu_unblocking = avic_vcpu_unblocking, .update_exception_bitmap = svm_update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, @@ -4682,6 +4682,9 @@ static __init int svm_hardware_setup(void) pr_info("AVIC enabled\n"); amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } else { + svm_x86_ops.vcpu_blocking = NULL; + svm_x86_ops.vcpu_unblocking = NULL; } if (vls) { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5178f6b245e1..47ef8f4a9358 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -592,8 +592,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec); bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); -void svm_vcpu_blocking(struct kvm_vcpu *vcpu); -void svm_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_vcpu_blocking(struct kvm_vcpu *vcpu); +void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); /* sev.c */ -- cgit v1.2.3-58-ga151 From d5ad5b1c04c85f01850e88231cad7dfbc9e1d30c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Tue, 18 Jan 2022 17:20:52 +0500 Subject: selftests: kvm: add amx_test to .gitignore amx_test's binary should be present in the .gitignore file for the git to ignore it. Fixes: bf70636d9443 ("selftest: kvm: Add amx selftest") Signed-off-by: Muhammad Usama Anjum Message-Id: <20220118122053.1941915-1-usama.anjum@collabora.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index c2165b137a50..dce7de7755e6 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -8,6 +8,7 @@ /s390x/memop /s390x/resets /s390x/sync_regs_test +/x86_64/amx_test /x86_64/cpuid_test /x86_64/cr4_cpuid_sync_test /x86_64/debug_regs -- cgit v1.2.3-58-ga151 From 6b34cd8e175bfbf4f3f01b6d19eae18245e1a8cc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 17 Jan 2022 16:28:29 +0000 Subject: btrfs: fix too long loop when defragging a 1 byte file When attempting to defrag a file with a single byte, we can end up in a too long loop, which is nearly infinite because at btrfs_defrag_file() we end up with the variable last_byte assigned with a value of 18446744073709551615 (which is (u64)-1). The problem comes from the fact we end up doing: last_byte = round_up(last_byte, fs_info->sectorsize) - 1; So if last_byte was assigned 0, which is i_size - 1, we underflow and end up with the value 18446744073709551615. This is trivial to reproduce and the following script triggers it: $ cat test.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT echo -n "X" > $MNT/foobar btrfs filesystem defragment $MNT/foobar umount $MNT So fix this by not decrementing last_byte by 1 before doing the sector size round up. Also, to make it easier to follow, make the round up right after computing last_byte. Reported-by: Anthony Ruhier Fixes: 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") Link: https://lore.kernel.org/linux-btrfs/0a269612-e43f-da22-c5bc-b34b1b56ebe8@mailbox.org/ CC: stable@vger.kernel.org # 5.16 Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a5bd6926f7ff..6ad2bc2e5af3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1518,12 +1518,16 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (range->start + range->len > range->start) { /* Got a specific range */ - last_byte = min(isize, range->start + range->len) - 1; + last_byte = min(isize, range->start + range->len); } else { /* Defrag until file end */ - last_byte = isize - 1; + last_byte = isize; } + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + /* * If we were not given a ra, allocate a readahead context. As * readahead is just an optimization, defrag will work without it so @@ -1536,10 +1540,6 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, file_ra_state_init(ra, inode->i_mapping); } - /* Align the range */ - cur = round_down(range->start, fs_info->sectorsize); - last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - while (cur < last_byte) { u64 cluster_end; -- cgit v1.2.3-58-ga151 From b767c2fc787e992daeadfff40d61c05f66c82da0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 18 Jan 2022 13:43:31 +0000 Subject: btrfs: allow defrag to be interruptible During defrag, at btrfs_defrag_file(), we have this loop that iterates over a file range in steps no larger than 256K subranges. If the range is too long, there's no way to interrupt it. So make the loop check in each iteration if there's signal pending, and if there is, break and return -AGAIN to userspace. Before kernel 5.16, we used to allow defrag to be cancelled through a signal, but that was lost with commit 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()"). This change adds back the possibility to cancel a defrag with a signal and keeps the same semantics, returning -EAGAIN to user space (and not the usually more expected -EINTR). This is also motivated by a recent bug on 5.16 where defragging a 1 byte file resulted in iterating from file range 0 to (u64)-1, as hitting the bug triggered a too long loop, basically requiring one to reboot the machine, as it was not possible to cancel defrag. Fixes: 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") CC: stable@vger.kernel.org # 5.16 Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6ad2bc2e5af3..6391be7409d8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1546,6 +1546,11 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, /* The cluster size 256K should always be page aligned */ BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + if (btrfs_defrag_cancelled(fs_info)) { + ret = -EAGAIN; + break; + } + /* We want the cluster end at page boundary when possible */ cluster_end = (((cur >> PAGE_SHIFT) + (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; -- cgit v1.2.3-58-ga151 From 70431bfd825d9cd5d93412c0456f253ecad6c415 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 17 Nov 2020 15:56:59 +0000 Subject: cifs: Support fscache indexing rewrite Change the cifs filesystem to take account of the changes to fscache's indexing rewrite and reenable caching in cifs. The following changes have been made: (1) The fscache_netfs struct is no more, and there's no need to register the filesystem as a whole. (2) The session cookie is now an fscache_volume cookie, allocated with fscache_acquire_volume(). That takes three parameters: a string representing the "volume" in the index, a string naming the cache to use (or NULL) and a u64 that conveys coherency metadata for the volume. For cifs, I've made it render the volume name string as: "cifs,," where the sharename has '/' characters replaced with ';'. This probably needs rethinking a bit as the total name could exceed the maximum filename component length. Further, the coherency data is currently just set to 0. It needs something else doing with it - I wonder if it would suffice simply to sum the resource_id, vol_create_time and vol_serial_number or maybe hash them. (3) The fscache_cookie_def is no more and needed information is passed directly to fscache_acquire_cookie(). The cache no longer calls back into the filesystem, but rather metadata changes are indicated at other times. fscache_acquire_cookie() is passed the same keying and coherency information as before. (4) The functions to set/reset cookies are removed and fscache_use_cookie() and fscache_unuse_cookie() are used instead. fscache_use_cookie() is passed a flag to indicate if the cookie is opened for writing. fscache_unuse_cookie() is passed updates for the metadata if we changed it (ie. if the file was opened for writing). These are called when the file is opened or closed. (5) cifs_setattr_*() are made to call fscache_resize() to change the size of the cache object. (6) The functions to read and write data are stubbed out pending a conversion to use netfslib. Changes ======= ver #8: - Abstract cache invalidation into a helper function. - Fix some checkpatch warnings[3]. ver #7: - Removed the accidentally added-back call to get the super cookie in cifs_root_iget(). - Fixed the right call to cifs_fscache_get_super_cookie() to take account of the "-o fsc" mount flag. ver #6: - Moved the change of gfpflags_allow_blocking() to current_is_kswapd() for cifs here. - Fixed one of the error paths in cifs_atomic_open() to jump around the call to use the cookie. - Fixed an additional successful return in the middle of cifs_open() to use the cookie on the way out. - Only get a volume cookie (and thus inode cookies) when "-o fsc" is supplied to mount. ver #5: - Fixed a couple of bits of cookie handling[2]: - The cookie should be released in cifs_evict_inode(), not cifsFileInfo_put_final(). The cookie needs to persist beyond file closure so that writepages will be able to write to it. - fscache_use_cookie() needs to be called in cifs_atomic_open() as it is for cifs_open(). ver #4: - Fixed the use of sizeof with memset. - tcon->vol_create_time is __le64 so doesn't need cpu_to_le64(). ver #3: - Canonicalise the cifs coherency data to make the cache portable. - Set volume coherency data. ver #2: - Use gfpflags_allow_blocking() rather than using flag directly. - Upgraded to -rc4 to allow for upstream changes[1]. - fscache_acquire_volume() now returns errors. Signed-off-by: David Howells Acked-by: Jeff Layton cc: Steve French cc: Shyam Prasad N cc: linux-cifs@vger.kernel.org cc: linux-cachefs@redhat.com Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=23b55d673d7527b093cd97b7c217c82e70cd1af0 [1] Link: https://lore.kernel.org/r/3419813.1641592362@warthog.procyon.org.uk/ [2] Link: https://lore.kernel.org/r/CAH2r5muTanw9pJqzAHd01d9A8keeChkzGsCEH6=0rHutVLAF-A@mail.gmail.com/ [3] Link: https://lore.kernel.org/r/163819671009.215744.11230627184193298714.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/163906982979.143852.10672081929614953210.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/163967187187.1823006.247415138444991444.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/164021579335.640689.2681324337038770579.stgit@warthog.procyon.org.uk/ # v4 Link: https://lore.kernel.org/r/3462849.1641593783@warthog.procyon.org.uk/ # v5 Link: https://lore.kernel.org/r/1318953.1642024578@warthog.procyon.org.uk/ # v6 Signed-off-by: Steve French --- fs/cifs/Kconfig | 2 +- fs/cifs/Makefile | 2 +- fs/cifs/cache.c | 105 ----------------- fs/cifs/cifsfs.c | 19 +-- fs/cifs/cifsglob.h | 5 +- fs/cifs/connect.c | 15 +-- fs/cifs/dir.c | 5 + fs/cifs/file.c | 66 +++++++---- fs/cifs/fscache.c | 333 +++++++++++++---------------------------------------- fs/cifs/fscache.h | 128 ++++++++------------ fs/cifs/inode.c | 19 +-- 11 files changed, 199 insertions(+), 500 deletions(-) delete mode 100644 fs/cifs/cache.c diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 346ae8716deb..3b7e3b9e4fd2 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -188,7 +188,7 @@ config CIFS_SMB_DIRECT config CIFS_FSCACHE bool "Provide CIFS client caching support" - depends on CIFS=m && FSCACHE_OLD_API || CIFS=y && FSCACHE_OLD_API=y + depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y help Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data to be cached locally on disk through the general filesystem cache diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 87fcacdf3de7..cc8fdcb35b71 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -25,7 +25,7 @@ cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o -cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cache.o +cifs-$(CONFIG_CIFS_FSCACHE) += fscache.o cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c deleted file mode 100644 index 8be57aaedab6..000000000000 --- a/fs/cifs/cache.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1 -/* - * CIFS filesystem cache index structure definitions - * - * Copyright (c) 2010 Novell, Inc. - * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> - * - */ -#include "fscache.h" -#include "cifs_debug.h" - -/* - * CIFS filesystem definition for FS-Cache - */ -struct fscache_netfs cifs_fscache_netfs = { - .name = "cifs", - .version = 0, -}; - -/* - * Register CIFS for caching with FS-Cache - */ -int cifs_fscache_register(void) -{ - return fscache_register_netfs(&cifs_fscache_netfs); -} - -/* - * Unregister CIFS for caching - */ -void cifs_fscache_unregister(void) -{ - fscache_unregister_netfs(&cifs_fscache_netfs); -} - -/* - * Server object for FS-Cache - */ -const struct fscache_cookie_def cifs_fscache_server_index_def = { - .name = "CIFS.server", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -static enum -fscache_checkaux cifs_fscache_super_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct cifs_fscache_super_auxdata auxdata; - const struct cifs_tcon *tcon = cookie_netfs_data; - - if (datalen != sizeof(auxdata)) - return FSCACHE_CHECKAUX_OBSOLETE; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - auxdata.vol_create_time = tcon->vol_create_time; - auxdata.vol_serial_number = tcon->vol_serial_number; - - if (memcmp(data, &auxdata, datalen) != 0) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -/* - * Superblock object for FS-Cache - */ -const struct fscache_cookie_def cifs_fscache_super_index_def = { - .name = "CIFS.super", - .type = FSCACHE_COOKIE_TYPE_INDEX, - .check_aux = cifs_fscache_super_check_aux, -}; - -static enum -fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, - const void *data, - uint16_t datalen, - loff_t object_size) -{ - struct cifs_fscache_inode_auxdata auxdata; - struct cifsInodeInfo *cifsi = cookie_netfs_data; - - if (datalen != sizeof(auxdata)) - return FSCACHE_CHECKAUX_OBSOLETE; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - - if (memcmp(data, &auxdata, datalen) != 0) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -const struct fscache_cookie_def cifs_fscache_inode_object_def = { - .name = "CIFS.uniqueid", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = cifs_fscache_inode_check_aux, -}; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 36b2e0cb9736..199edac0cb59 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -397,6 +397,9 @@ static void cifs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); + if (inode->i_state & I_PINNING_FSCACHE_WB) + cifs_fscache_unuse_inode_cookie(inode, true); + cifs_fscache_release_inode_cookie(inode); clear_inode(inode); } @@ -721,6 +724,12 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) } #endif +static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + fscache_unpin_writeback(wbc, cifs_inode_cookie(inode)); + return 0; +} + static int cifs_drop_inode(struct inode *inode) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -733,6 +742,7 @@ static int cifs_drop_inode(struct inode *inode) static const struct super_operations cifs_super_ops = { .statfs = cifs_statfs, .alloc_inode = cifs_alloc_inode, + .write_inode = cifs_write_inode, .free_inode = cifs_free_inode, .drop_inode = cifs_drop_inode, .evict_inode = cifs_evict_inode, @@ -1625,13 +1635,9 @@ init_cifs(void) goto out_destroy_cifsoplockd_wq; } - rc = cifs_fscache_register(); - if (rc) - goto out_destroy_deferredclose_wq; - rc = cifs_init_inodecache(); if (rc) - goto out_unreg_fscache; + goto out_destroy_deferredclose_wq; rc = cifs_init_mids(); if (rc) @@ -1693,8 +1699,6 @@ out_destroy_mids: cifs_destroy_mids(); out_destroy_inodecache: cifs_destroy_inodecache(); -out_unreg_fscache: - cifs_fscache_unregister(); out_destroy_deferredclose_wq: destroy_workqueue(deferredclose_wq); out_destroy_cifsoplockd_wq: @@ -1730,7 +1734,6 @@ exit_cifs(void) cifs_destroy_request_bufs(); cifs_destroy_mids(); cifs_destroy_inodecache(); - cifs_fscache_unregister(); destroy_workqueue(deferredclose_wq); destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cace0818d510..48b343d03430 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -668,9 +668,6 @@ struct TCP_Server_Info { unsigned int total_read; /* total amount of data read in this pass */ atomic_t in_send; /* requests trying to send */ atomic_t num_waiters; /* blocked waiting to get in sendrecv */ -#ifdef CONFIG_CIFS_FSCACHE - struct fscache_cookie *fscache; /* client index cache cookie */ -#endif #ifdef CONFIG_CIFS_STATS2 atomic_t num_cmds[NUMBER_OF_SMB2_COMMANDS]; /* total requests by cmd */ atomic_t smb2slowcmd[NUMBER_OF_SMB2_COMMANDS]; /* count resps > 1 sec */ @@ -1112,7 +1109,7 @@ struct cifs_tcon { __u32 max_bytes_copy; #ifdef CONFIG_CIFS_FSCACHE u64 resource_id; /* server resource id */ - struct fscache_cookie *fscache; /* cookie for share */ + struct fscache_volume *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ struct cached_fid crfid; /* Cached root fid */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 453aba3c2ad6..11a22a30ee14 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1444,10 +1444,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) cifs_crypto_secmech_release(server); - /* fscache server cookies are based on primary channel only */ - if (!CIFS_SERVER_IS_CHAN(server)) - cifs_fscache_release_client_cookie(server); - kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; @@ -1609,14 +1605,6 @@ smbd_connected: list_add(&tcp_ses->tcp_ses_list, &cifs_tcp_ses_list); spin_unlock(&cifs_tcp_ses_lock); - /* fscache server cookies are based on primary channel only */ - if (!CIFS_SERVER_IS_CHAN(tcp_ses)) - cifs_fscache_get_client_cookie(tcp_ses); -#ifdef CONFIG_CIFS_FSCACHE - else - tcp_ses->fscache = tcp_ses->primary_server->fscache; -#endif /* CONFIG_CIFS_FSCACHE */ - /* queue echo request delayed work */ queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); @@ -3128,7 +3116,8 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx) * Inside cifs_fscache_get_super_cookie it checks * that we do not get super cookie twice. */ - cifs_fscache_get_super_cookie(tcon); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + cifs_fscache_get_super_cookie(tcon); out: mnt_ctx->server = server; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 6e8e7cc26ae2..ce9b22aecfba 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -22,6 +22,7 @@ #include "cifs_unicode.h" #include "fs_context.h" #include "cifs_ioctl.h" +#include "fscache.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -507,8 +508,12 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, server->ops->close(xid, tcon, &fid); cifs_del_pending_open(&open); rc = -ENOMEM; + goto out; } + fscache_use_cookie(cifs_inode_cookie(file_inode(file)), + file->f_mode & FMODE_WRITE); + out: cifs_put_tlink(tlink); out_free_xid: diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9fee3af83a73..59334be9ed3b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -376,8 +376,6 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) struct cifsLockInfo *li, *tmp; struct super_block *sb = inode->i_sb; - cifs_fscache_release_inode_cookie(inode); - /* * Delete any outstanding lock records. We'll lose them when the file * is closed anyway. @@ -570,7 +568,7 @@ int cifs_open(struct inode *inode, struct file *file) spin_lock(&CIFS_I(inode)->deferred_lock); cifs_del_deferred_close(cfile); spin_unlock(&CIFS_I(inode)->deferred_lock); - goto out; + goto use_cache; } else { _cifsFileInfo_put(cfile, true, false); } @@ -632,8 +630,6 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } - cifs_fscache_set_inode_cookie(inode, file); - if ((oplock & CIFS_CREATE_ACTION) && !posix_open_ok && tcon->unix_ext) { /* * Time to set mode which we can not set earlier due to @@ -652,6 +648,15 @@ int cifs_open(struct inode *inode, struct file *file) cfile->pid); } +use_cache: + fscache_use_cookie(cifs_inode_cookie(file_inode(file)), + file->f_mode & FMODE_WRITE); + if (file->f_flags & O_DIRECT && + (!((file->f_flags & O_ACCMODE) != O_RDONLY) || + file->f_flags & O_APPEND)) + cifs_invalidate_cache(file_inode(file), + FSCACHE_INVAL_DIO_WRITE); + out: free_dentry_path(page); free_xid(xid); @@ -876,6 +881,8 @@ int cifs_close(struct inode *inode, struct file *file) struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_deferred_close *dclose; + cifs_fscache_unuse_inode_cookie(inode, file->f_mode & FMODE_WRITE); + if (file->private_data != NULL) { cfile = file->private_data; file->private_data = NULL; @@ -886,7 +893,6 @@ int cifs_close(struct inode *inode, struct file *file) dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { inode->i_ctime = inode->i_mtime = current_time(inode); - cifs_fscache_update_inode_cookie(inode); } spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); @@ -4198,10 +4204,12 @@ static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; - struct file *file = vmf->vma->vm_file; - struct inode *inode = file_inode(file); - cifs_fscache_wait_on_page_write(inode, page); +#ifdef CONFIG_CIFS_FSCACHE + if (PageFsCache(page) && + wait_on_page_fscache_killable(page) < 0) + return VM_FAULT_RETRY; +#endif lock_page(page); return VM_FAULT_LOCKED; @@ -4275,8 +4283,6 @@ cifs_readv_complete(struct work_struct *work) if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page); - else - cifs_fscache_uncache_page(rdata->mapping->host, page); got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); @@ -4593,11 +4599,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, kref_put(&rdata->refcount, cifs_readdata_release); } - /* Any pages that have been shown to fscache but didn't get added to - * the pagecache must be uncached before they get returned to the - * allocator. - */ - cifs_fscache_readpages_cancel(mapping->host, page_list); free_xid(xid); return rc; } @@ -4801,17 +4802,19 @@ static int cifs_release_page(struct page *page, gfp_t gfp) { if (PagePrivate(page)) return 0; - - return cifs_fscache_release_page(page, gfp); + if (PageFsCache(page)) { + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + wait_on_page_fscache(page); + } + fscache_note_page_release(cifs_inode_cookie(page->mapping->host)); + return true; } static void cifs_invalidate_page(struct page *page, unsigned int offset, unsigned int length) { - struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host); - - if (offset == 0 && length == PAGE_SIZE) - cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); + wait_on_page_fscache(page); } static int cifs_launder_page(struct page *page) @@ -4831,7 +4834,7 @@ static int cifs_launder_page(struct page *page) if (clear_page_dirty_for_io(page)) rc = cifs_writepage_locked(page, &wbc); - cifs_fscache_invalidate_page(page, page->mapping->host); + wait_on_page_fscache(page); return rc; } @@ -4988,6 +4991,19 @@ static void cifs_swap_deactivate(struct file *file) /* do we need to unpin (or unlock) the file */ } +/* + * Mark a page as having been made dirty and thus needing writeback. We also + * need to pin the cache object to write back to. + */ +#ifdef CONFIG_CIFS_FSCACHE +static int cifs_set_page_dirty(struct page *page) +{ + return fscache_set_page_dirty(page, cifs_inode_cookie(page->mapping->host)); +} +#else +#define cifs_set_page_dirty __set_page_dirty_nobuffers +#endif + const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, .readpages = cifs_readpages, @@ -4995,7 +5011,7 @@ const struct address_space_operations cifs_addr_ops = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = cifs_set_page_dirty, .releasepage = cifs_release_page, .direct_IO = cifs_direct_io, .invalidatepage = cifs_invalidate_page, @@ -5020,7 +5036,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .writepages = cifs_writepages, .write_begin = cifs_write_begin, .write_end = cifs_write_end, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = cifs_set_page_dirty, .releasepage = cifs_release_page, .invalidatepage = cifs_invalidate_page, .launder_page = cifs_launder_page, diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 003c5f1f4dfb..efaac4d5ff55 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -12,250 +12,136 @@ #include "cifs_fs_sb.h" #include "cifsproto.h" -/* - * Key layout of CIFS server cache index object - */ -struct cifs_server_key { - __u64 conn_id; -} __packed; - -/* - * Get a cookie for a server object keyed by {IPaddress,port,family} tuple - */ -void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) -{ - struct cifs_server_key key; - - /* - * Check if cookie was already initialized so don't reinitialize it. - * In the future, as we integrate with newer fscache features, - * we may want to instead add a check if cookie has changed - */ - if (server->fscache) - return; - - memset(&key, 0, sizeof(key)); - key.conn_id = server->conn_id; - - server->fscache = - fscache_acquire_cookie(cifs_fscache_netfs.primary_index, - &cifs_fscache_server_index_def, - &key, sizeof(key), - NULL, 0, - server, 0, true); - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, server, server->fscache); -} - -void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) +static void cifs_fscache_fill_volume_coherency( + struct cifs_tcon *tcon, + struct cifs_fscache_volume_coherency_data *cd) { - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, server, server->fscache); - fscache_relinquish_cookie(server->fscache, NULL, false); - server->fscache = NULL; + memset(cd, 0, sizeof(*cd)); + cd->resource_id = cpu_to_le64(tcon->resource_id); + cd->vol_create_time = tcon->vol_create_time; + cd->vol_serial_number = cpu_to_le32(tcon->vol_serial_number); } -void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) +int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) { + struct cifs_fscache_volume_coherency_data cd; struct TCP_Server_Info *server = tcon->ses->server; + struct fscache_volume *vcookie; + const struct sockaddr *sa = (struct sockaddr *)&server->dstaddr; + size_t slen, i; char *sharename; - struct cifs_fscache_super_auxdata auxdata; + char *key; + int ret = -ENOMEM; - /* - * Check if cookie was already initialized so don't reinitialize it. - * In the future, as we integrate with newer fscache features, - * we may want to instead add a check if cookie has changed - */ - if (tcon->fscache) - return; + tcon->fscache = NULL; + switch (sa->sa_family) { + case AF_INET: + case AF_INET6: + break; + default: + cifs_dbg(VFS, "Unknown network family '%d'\n", sa->sa_family); + return -EINVAL; + } + + memset(&key, 0, sizeof(key)); sharename = extract_sharename(tcon->treeName); if (IS_ERR(sharename)) { cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__); - tcon->fscache = NULL; - return; + return -EINVAL; } - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - auxdata.vol_create_time = tcon->vol_create_time; - auxdata.vol_serial_number = tcon->vol_serial_number; + slen = strlen(sharename); + for (i = 0; i < slen; i++) + if (sharename[i] == '/') + sharename[i] = ';'; + + key = kasprintf(GFP_KERNEL, "cifs,%pISpc,%s", sa, sharename); + if (!key) + goto out; + + cifs_fscache_fill_volume_coherency(tcon, &cd); + vcookie = fscache_acquire_volume(key, + NULL, /* preferred_cache */ + &cd, sizeof(cd)); + cifs_dbg(FYI, "%s: (%s/0x%p)\n", __func__, key, vcookie); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + ret = PTR_ERR(vcookie); + goto out_2; + } + pr_err("Cache volume key already in use (%s)\n", key); + vcookie = NULL; + } - tcon->fscache = - fscache_acquire_cookie(server->fscache, - &cifs_fscache_super_index_def, - sharename, strlen(sharename), - &auxdata, sizeof(auxdata), - tcon, 0, true); + tcon->fscache = vcookie; + ret = 0; +out_2: + kfree(key); +out: kfree(sharename); - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, server->fscache, tcon->fscache); + return ret; } void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { - struct cifs_fscache_super_auxdata auxdata; - - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.resource_id = tcon->resource_id; - auxdata.vol_create_time = tcon->vol_create_time; - auxdata.vol_serial_number = tcon->vol_serial_number; + struct cifs_fscache_volume_coherency_data cd; cifs_dbg(FYI, "%s: (0x%p)\n", __func__, tcon->fscache); - fscache_relinquish_cookie(tcon->fscache, &auxdata, false); - tcon->fscache = NULL; -} - -static void cifs_fscache_acquire_inode_cookie(struct cifsInodeInfo *cifsi, - struct cifs_tcon *tcon) -{ - struct cifs_fscache_inode_auxdata auxdata; - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - - cifsi->fscache = - fscache_acquire_cookie(tcon->fscache, - &cifs_fscache_inode_object_def, - &cifsi->uniqueid, sizeof(cifsi->uniqueid), - &auxdata, sizeof(auxdata), - cifsi, cifsi->vfs_inode.i_size, true); + cifs_fscache_fill_volume_coherency(tcon, &cd); + fscache_relinquish_volume(tcon->fscache, &cd, false); + tcon->fscache = NULL; } -static void cifs_fscache_enable_inode_cookie(struct inode *inode) +void cifs_fscache_get_inode_cookie(struct inode *inode) { + struct cifs_fscache_inode_coherency_data cd; struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - if (cifsi->fscache) - return; - - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)) - return; - - cifs_fscache_acquire_inode_cookie(cifsi, tcon); + cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); - cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n", - __func__, tcon->fscache, cifsi->fscache); + cifsi->fscache = + fscache_acquire_cookie(tcon->fscache, 0, + &cifsi->uniqueid, sizeof(cifsi->uniqueid), + &cd, sizeof(cd), + i_size_read(&cifsi->vfs_inode)); } -void cifs_fscache_release_inode_cookie(struct inode *inode) +void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) { - struct cifs_fscache_inode_auxdata auxdata; - struct cifsInodeInfo *cifsi = CIFS_I(inode); - - if (cifsi->fscache) { - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; + if (update) { + struct cifs_fscache_inode_coherency_data cd; + loff_t i_size = i_size_read(inode); - cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - /* fscache_relinquish_cookie does not seem to update auxdata */ - fscache_update_cookie(cifsi->fscache, &auxdata); - fscache_relinquish_cookie(cifsi->fscache, &auxdata, false); - cifsi->fscache = NULL; + cifs_fscache_fill_coherency(inode, &cd); + fscache_unuse_cookie(cifs_inode_cookie(inode), &cd, &i_size); + } else { + fscache_unuse_cookie(cifs_inode_cookie(inode), NULL, NULL); } } -void cifs_fscache_update_inode_cookie(struct inode *inode) +void cifs_fscache_release_inode_cookie(struct inode *inode) { - struct cifs_fscache_inode_auxdata auxdata; struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.eof = cifsi->server_eof; - auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; - auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; - auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; - auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; - cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - fscache_update_cookie(cifsi->fscache, &auxdata); - } -} - -void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) -{ - cifs_fscache_enable_inode_cookie(inode); -} - -void cifs_fscache_reset_inode_cookie(struct inode *inode) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - struct fscache_cookie *old = cifsi->fscache; - - if (cifsi->fscache) { - /* retire the current fscache cache and get a new one */ - fscache_relinquish_cookie(cifsi->fscache, NULL, true); - - cifs_fscache_acquire_inode_cookie(cifsi, tcon); - cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n", - __func__, cifsi->fscache, old); + fscache_relinquish_cookie(cifsi->fscache, false); + cifsi->fscache = NULL; } } -int cifs_fscache_release_page(struct page *page, gfp_t gfp) -{ - if (PageFsCache(page)) { - struct inode *inode = page->mapping->host; - struct cifsInodeInfo *cifsi = CIFS_I(inode); - - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", - __func__, page, cifsi->fscache); - if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) - return 0; - } - - return 1; -} - -static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, - int error) -{ - cifs_dbg(FYI, "%s: (0x%p/%d)\n", __func__, page, error); - if (!error) - SetPageUptodate(page); - unlock_page(page); -} - /* * Retrieve a page from FS-Cache */ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { - int ret; - cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n", __func__, CIFS_I(inode)->fscache, page, inode); - ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, - cifs_readpage_from_fscache_complete, - NULL, - GFP_KERNEL); - switch (ret) { - - case 0: /* page found in fscache, read submitted */ - cifs_dbg(FYI, "%s: submitted\n", __func__); - return ret; - case -ENOBUFS: /* page won't be cached */ - case -ENODATA: /* page not in cache */ - cifs_dbg(FYI, "%s: %d\n", __func__, ret); - return 1; - - default: - cifs_dbg(VFS, "unknown error ret = %d\n", ret); - } - return ret; + return -ENOBUFS; // Needs conversion to using netfslib } /* @@ -266,78 +152,19 @@ int __cifs_readpages_from_fscache(struct inode *inode, struct list_head *pages, unsigned *nr_pages) { - int ret; - cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n", __func__, CIFS_I(inode)->fscache, *nr_pages, inode); - ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, - pages, nr_pages, - cifs_readpage_from_fscache_complete, - NULL, - mapping_gfp_mask(mapping)); - switch (ret) { - case 0: /* read submitted to the cache for all pages */ - cifs_dbg(FYI, "%s: submitted\n", __func__); - return ret; - - case -ENOBUFS: /* some pages are not cached and can't be */ - case -ENODATA: /* some pages are not cached */ - cifs_dbg(FYI, "%s: no page\n", __func__); - return 1; - - default: - cifs_dbg(FYI, "unknown error ret = %d\n", ret); - } - - return ret; + return -ENOBUFS; // Needs conversion to using netfslib } void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { struct cifsInodeInfo *cifsi = CIFS_I(inode); - int ret; WARN_ON(!cifsi->fscache); cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", __func__, cifsi->fscache, page, inode); - ret = fscache_write_page(cifsi->fscache, page, - cifsi->vfs_inode.i_size, GFP_KERNEL); - if (ret != 0) - fscache_uncache_page(cifsi->fscache, page); -} - -void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) -{ - cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n", - __func__, CIFS_I(inode)->fscache, inode); - fscache_readpages_cancel(CIFS_I(inode)->fscache, pages); -} - -void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct fscache_cookie *cookie = cifsi->fscache; - - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); - fscache_wait_on_page_write(cookie, page); - fscache_uncache_page(cookie, page); -} - -void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct fscache_cookie *cookie = cifsi->fscache; - - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); - fscache_wait_on_page_write(cookie, page); -} - -void __cifs_fscache_uncache_page(struct inode *inode, struct page *page) -{ - struct cifsInodeInfo *cifsi = CIFS_I(inode); - struct fscache_cookie *cookie = cifsi->fscache; - cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); - fscache_uncache_page(cookie, page); + // Needs conversion to using netfslib } diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 9baa1d0f22bd..c6ca49ac33d4 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -13,84 +13,71 @@ #include "cifsglob.h" -#ifdef CONFIG_CIFS_FSCACHE - /* - * Auxiliary data attached to CIFS superblock within the cache + * Coherency data attached to CIFS volume within the cache */ -struct cifs_fscache_super_auxdata { - u64 resource_id; /* unique server resource id */ +struct cifs_fscache_volume_coherency_data { + __le64 resource_id; /* unique server resource id */ __le64 vol_create_time; - u32 vol_serial_number; + __le32 vol_serial_number; } __packed; /* - * Auxiliary data attached to CIFS inode within the cache + * Coherency data attached to CIFS inode within the cache. */ -struct cifs_fscache_inode_auxdata { - u64 last_write_time_sec; - u64 last_change_time_sec; - u32 last_write_time_nsec; - u32 last_change_time_nsec; - u64 eof; +struct cifs_fscache_inode_coherency_data { + __le64 last_write_time_sec; + __le64 last_change_time_sec; + __le32 last_write_time_nsec; + __le32 last_change_time_nsec; }; -/* - * cache.c - */ -extern struct fscache_netfs cifs_fscache_netfs; -extern const struct fscache_cookie_def cifs_fscache_server_index_def; -extern const struct fscache_cookie_def cifs_fscache_super_index_def; -extern const struct fscache_cookie_def cifs_fscache_inode_object_def; - -extern int cifs_fscache_register(void); -extern void cifs_fscache_unregister(void); +#ifdef CONFIG_CIFS_FSCACHE /* * fscache.c */ -extern void cifs_fscache_get_client_cookie(struct TCP_Server_Info *); -extern void cifs_fscache_release_client_cookie(struct TCP_Server_Info *); -extern void cifs_fscache_get_super_cookie(struct cifs_tcon *); +extern int cifs_fscache_get_super_cookie(struct cifs_tcon *); extern void cifs_fscache_release_super_cookie(struct cifs_tcon *); +extern void cifs_fscache_get_inode_cookie(struct inode *inode); extern void cifs_fscache_release_inode_cookie(struct inode *); -extern void cifs_fscache_update_inode_cookie(struct inode *inode); -extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); -extern void cifs_fscache_reset_inode_cookie(struct inode *); +extern void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update); + +static inline +void cifs_fscache_fill_coherency(struct inode *inode, + struct cifs_fscache_inode_coherency_data *cd) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + + memset(cd, 0, sizeof(*cd)); + cd->last_write_time_sec = cpu_to_le64(cifsi->vfs_inode.i_mtime.tv_sec); + cd->last_write_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_mtime.tv_nsec); + cd->last_change_time_sec = cpu_to_le64(cifsi->vfs_inode.i_ctime.tv_sec); + cd->last_change_time_nsec = cpu_to_le32(cifsi->vfs_inode.i_ctime.tv_nsec); +} + -extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); -extern void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page); -extern void __cifs_fscache_uncache_page(struct inode *inode, struct page *page); extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); extern int __cifs_readpage_from_fscache(struct inode *, struct page *); extern int __cifs_readpages_from_fscache(struct inode *, struct address_space *, struct list_head *, unsigned *); -extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *); - extern void __cifs_readpage_to_fscache(struct inode *, struct page *); -static inline void cifs_fscache_invalidate_page(struct page *page, - struct inode *inode) +static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { - if (PageFsCache(page)) - __cifs_fscache_invalidate_page(page, inode); + return CIFS_I(inode)->fscache; } -static inline void cifs_fscache_wait_on_page_write(struct inode *inode, - struct page *page) +static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) { - if (PageFsCache(page)) - __cifs_fscache_wait_on_page_write(inode, page); -} + struct cifs_fscache_inode_coherency_data cd; -static inline void cifs_fscache_uncache_page(struct inode *inode, - struct page *page) -{ - if (PageFsCache(page)) - __cifs_fscache_uncache_page(inode, page); + cifs_fscache_fill_coherency(inode, &cd); + fscache_invalidate(cifs_inode_cookie(inode), &cd, + i_size_read(inode), flags); } static inline int cifs_readpage_from_fscache(struct inode *inode, @@ -120,41 +107,21 @@ static inline void cifs_readpage_to_fscache(struct inode *inode, __cifs_readpage_to_fscache(inode, page); } -static inline void cifs_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) +#else /* CONFIG_CIFS_FSCACHE */ +static inline +void cifs_fscache_fill_coherency(struct inode *inode, + struct cifs_fscache_inode_coherency_data *cd) { - if (CIFS_I(inode)->fscache) - return __cifs_fscache_readpages_cancel(inode, pages); } -#else /* CONFIG_CIFS_FSCACHE */ -static inline int cifs_fscache_register(void) { return 0; } -static inline void cifs_fscache_unregister(void) {} - -static inline void -cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) {} -static inline void -cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) {} -static inline void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) {} -static inline void -cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} +static inline int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) { return 0; } +static inline void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} +static inline void cifs_fscache_get_inode_cookie(struct inode *inode) {} static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} -static inline void cifs_fscache_update_inode_cookie(struct inode *inode) {} -static inline void cifs_fscache_set_inode_cookie(struct inode *inode, - struct file *filp) {} -static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} -static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) -{ - return 1; /* May release page */ -} - -static inline void cifs_fscache_invalidate_page(struct page *page, - struct inode *inode) {} -static inline void cifs_fscache_wait_on_page_write(struct inode *inode, - struct page *page) {} -static inline void cifs_fscache_uncache_page(struct inode *inode, - struct page *page) {} +static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) {} +static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; } +static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {} static inline int cifs_readpage_from_fscache(struct inode *inode, struct page *page) @@ -173,11 +140,6 @@ static inline int cifs_readpages_from_fscache(struct inode *inode, static inline void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} -static inline void cifs_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) -{ -} - #endif /* CONFIG_CIFS_FSCACHE */ #endif /* _CIFS_FSCACHE_H */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index baa197edd8c5..7d8b3ceb2af3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1304,10 +1304,7 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; -#ifdef CONFIG_CIFS_FSCACHE - /* initialize per-inode cache cookie pointer */ - CIFS_I(inode)->fscache = NULL; -#endif + cifs_fscache_get_inode_cookie(inode); unlock_new_inode(inode); } } @@ -1376,6 +1373,7 @@ iget_no_retry: iget_failed(inode); inode = ERR_PTR(rc); } + out: kfree(path); free_xid(xid); @@ -2263,6 +2261,8 @@ cifs_dentry_needs_reval(struct dentry *dentry) int cifs_invalidate_mapping(struct inode *inode) { + struct cifs_fscache_inode_coherency_data cd; + struct cifsInodeInfo *cifsi = CIFS_I(inode); int rc = 0; if (inode->i_mapping && inode->i_mapping->nrpages != 0) { @@ -2272,7 +2272,8 @@ cifs_invalidate_mapping(struct inode *inode) __func__, inode); } - cifs_fscache_reset_inode_cookie(inode); + cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd); + fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0); return rc; } @@ -2777,8 +2778,10 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) goto out; if ((attrs->ia_valid & ATTR_SIZE) && - attrs->ia_size != i_size_read(inode)) + attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); + } setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); @@ -2973,8 +2976,10 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) goto cifs_setattr_exit; if ((attrs->ia_valid & ATTR_SIZE) && - attrs->ia_size != i_size_read(inode)) + attrs->ia_size != i_size_read(inode)) { truncate_setsize(inode, attrs->ia_size); + fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size); + } setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); -- cgit v1.2.3-58-ga151 From 484167da77739a8d0e225008c48e697fd3f781ae Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 18 Jan 2022 15:19:04 +0800 Subject: btrfs: defrag: fix wrong number of defragged sectors [BUG] There are users using autodefrag mount option reporting obvious increase in IO: > If I compare the write average (in total, I don't have it per process) > when taking idle periods on the same machine: > Linux 5.16: > without autodefrag: ~ 10KiB/s > with autodefrag: between 1 and 2MiB/s. > > Linux 5.15: > with autodefrag:~ 10KiB/s (around the same as without > autodefrag on 5.16) [CAUSE] When autodefrag mount option is enabled, btrfs_defrag_file() will be called with @max_sectors = BTRFS_DEFRAG_BATCH (1024) to limit how many sectors we can defrag in one try. And then use the number of sectors defragged to determine if we need to re-defrag. But commit b18c3ab2343d ("btrfs: defrag: introduce helper to defrag one cluster") uses wrong unit to increase @sectors_defragged, which should be in unit of sector, not byte. This means, if we have defragged any sector, then @sectors_defragged will be >= sectorsize (normally 4096), which is larger than BTRFS_DEFRAG_BATCH. This makes the @max_sectors check in defrag_one_cluster() to underflow, rendering the whole @max_sectors check useless. Thus causing way more IO for autodefrag mount options, as now there is no limit on how many sectors can really be defragged. [FIX] Fix the problems by: - Use sector as unit when increasing @sectors_defragged - Include @sectors_defragged > @max_sectors case to break the loop - Add extra comment on the return value of btrfs_defrag_file() Reported-by: Anthony Ruhier Fixes: b18c3ab2343d ("btrfs: defrag: introduce helper to defrag one cluster") Link: https://lore.kernel.org/linux-btrfs/0a269612-e43f-da22-c5bc-b34b1b56ebe8@mailbox.org/ CC: stable@vger.kernel.org # 5.16 Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6391be7409d8..7029ee75e301 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1442,8 +1442,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, list_for_each_entry(entry, &target_list, list) { u32 range_len = entry->len; - /* Reached the limit */ - if (max_sectors && max_sectors == *sectors_defragged) + /* Reached or beyond the limit */ + if (max_sectors && *sectors_defragged >= max_sectors) break; if (max_sectors) @@ -1465,7 +1465,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode, extent_thresh, newer_than, do_compress); if (ret < 0) break; - *sectors_defragged += range_len; + *sectors_defragged += range_len >> + inode->root->fs_info->sectorsize_bits; } out: list_for_each_entry_safe(entry, tmp, &target_list, list) { @@ -1484,6 +1485,9 @@ out: * @newer_than: minimum transid to defrag * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode * will be defragged. + * + * Return <0 for error. + * Return >=0 for the number of sectors defragged. */ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, -- cgit v1.2.3-58-ga151 From c080b4144b9dd3b7af838a194ffad3204ca15166 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 18 Jan 2022 19:53:52 +0800 Subject: btrfs: defrag: properly update range->start for autodefrag [BUG] After commit 7b508037d4ca ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") autodefrag no longer properly re-defrag the file from previously finished location. [CAUSE] The recent refactoring of defrag only focuses on defrag ioctl subpage support, doesn't take autodefrag into consideration. There are two problems involved which prevents autodefrag to restart its scan: - No range.start update Previously when one defrag target is found, range->start will be updated to indicate where next search should start from. But now btrfs_defrag_file() doesn't update it anymore, making all autodefrag to rescan from file offset 0. This would also make autodefrag to mark the same range dirty again and again, causing extra IO. - No proper quick exit for defrag_one_cluster() Currently if we reached or exceed @max_sectors limit, we just exit defrag_one_cluster(), and let next defrag_one_cluster() call to do a quick exit. This makes @cur increase, thus no way to properly know which range is defragged and which range is skipped. [FIX] The fix involves two modifications: - Update range->start to next cluster start This is a little different from the old behavior. Previously range->start is updated to the next defrag target. But in the end, the behavior should still be pretty much the same, as now we skip to next defrag target inside btrfs_defrag_file(). Thus if auto-defrag determines to re-scan, then we still do the skip, just at a different timing. - Make defrag_one_cluster() to return >0 to indicate a quick exit So that btrfs_defrag_file() can also do a quick exit, without increasing @cur to the range end, and re-use @cur to update @range->start. - Add comment for btrfs_defrag_file() to mention the range->start update Currently only autodefrag utilize this behavior, as defrag ioctl won't set @max_to_defrag parameter, thus unless interrupted it will always try to defrag the whole range. Reported-by: Filipe Manana Fixes: 7b508037d4ca ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") Link: https://lore.kernel.org/linux-btrfs/0a269612-e43f-da22-c5bc-b34b1b56ebe8@mailbox.org/ CC: stable@vger.kernel.org # 5.16 Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7029ee75e301..6d5d6d2c4ad1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1443,8 +1443,10 @@ static int defrag_one_cluster(struct btrfs_inode *inode, u32 range_len = entry->len; /* Reached or beyond the limit */ - if (max_sectors && *sectors_defragged >= max_sectors) + if (max_sectors && *sectors_defragged >= max_sectors) { + ret = 1; break; + } if (max_sectors) range_len = min_t(u32, range_len, @@ -1487,7 +1489,10 @@ out: * will be defragged. * * Return <0 for error. - * Return >=0 for the number of sectors defragged. + * Return >=0 for the number of sectors defragged, and range->start will be updated + * to indicate the file offset where next defrag should be started at. + * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without + * defragging all the range). */ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, @@ -1580,10 +1585,19 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (ret < 0) break; cur = cluster_end + 1; + if (ret > 0) { + ret = 0; + break; + } } if (ra_allocated) kfree(ra); + /* + * Update range.start for autodefrag, this will indicate where to start + * in next run. + */ + range->start = cur; if (sectors_defragged) { /* * We have defragged some sectors, for compression case they -- cgit v1.2.3-58-ga151 From 8326c79d10be2ddbfd3d3804206949a71cb15675 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 13 Nov 2021 10:43:52 -0300 Subject: tools headers UAPI: Sync x86 arch prctl headers with the kernel sources To pick the changes in this cset: 980fe2fddcff2193 ("x86/fpu: Extend fpu_xstate_prctl() with guest permissions") This picks these new prctls: $ tools/perf/trace/beauty/x86_arch_prctl.sh > /tmp/before $ cp arch/x86/include/uapi/asm/prctl.h tools/arch/x86/include/uapi/asm/prctl.h $ tools/perf/trace/beauty/x86_arch_prctl.sh > /tmp/after $ diff -u /tmp/before /tmp/after --- /tmp/before 2022-01-19 14:40:05.049394977 -0300 +++ /tmp/after 2022-01-19 14:40:35.628154565 -0300 @@ -9,6 +9,8 @@ [0x1021 - 0x1001]= "GET_XCOMP_SUPP", [0x1022 - 0x1001]= "GET_XCOMP_PERM", [0x1023 - 0x1001]= "REQ_XCOMP_PERM", + [0x1024 - 0x1001]= "GET_XCOMP_GUEST_PERM", + [0x1025 - 0x1001]= "REQ_XCOMP_GUEST_PERM", }; #define x86_arch_prctl_codes_2_offset 0x2001 $ With this 'perf trace' can translate those numbers into strings and use the strings in filter expressions: # perf trace -e prctl 0.000 ( 0.011 ms): DOM Worker/3722622 prctl(option: SET_NAME, arg2: 0x7f9c014b7df5) = 0 0.032 ( 0.002 ms): DOM Worker/3722622 prctl(option: SET_NAME, arg2: 0x7f9bb6b51580) = 0 5.452 ( 0.003 ms): StreamT~ns #30/3722623 prctl(option: SET_NAME, arg2: 0x7f9bdbdfeb70) = 0 5.468 ( 0.002 ms): StreamT~ns #30/3722623 prctl(option: SET_NAME, arg2: 0x7f9bdbdfea70) = 0 24.494 ( 0.009 ms): IndexedDB #556/3722624 prctl(option: SET_NAME, arg2: 0x7f562a32ae28) = 0 24.540 ( 0.002 ms): IndexedDB #556/3722624 prctl(option: SET_NAME, arg2: 0x7f563c6d4b30) = 0 670.281 ( 0.008 ms): systemd-userwo/3722339 prctl(option: SET_NAME, arg2: 0x564be30805c8) = 0 670.293 ( 0.002 ms): systemd-userwo/3722339 prctl(option: SET_NAME, arg2: 0x564be30800f0) = 0 ^C# This addresses these perf build warnings: Warning: Kernel ABI header at 'tools/arch/x86/include/uapi/asm/prctl.h' differs from latest version at 'arch/x86/include/uapi/asm/prctl.h' diff -u tools/arch/x86/include/uapi/asm/prctl.h arch/x86/include/uapi/asm/prctl.h Cc: Paolo Bonzini Cc: Thomas Gleixner Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/prctl.h | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tools/arch/x86/include/uapi/asm/prctl.h b/tools/arch/x86/include/uapi/asm/prctl.h index 754a07856817..500b96e71f18 100644 --- a/tools/arch/x86/include/uapi/asm/prctl.h +++ b/tools/arch/x86/include/uapi/asm/prctl.h @@ -2,20 +2,22 @@ #ifndef _ASM_X86_PRCTL_H #define _ASM_X86_PRCTL_H -#define ARCH_SET_GS 0x1001 -#define ARCH_SET_FS 0x1002 -#define ARCH_GET_FS 0x1003 -#define ARCH_GET_GS 0x1004 +#define ARCH_SET_GS 0x1001 +#define ARCH_SET_FS 0x1002 +#define ARCH_GET_FS 0x1003 +#define ARCH_GET_GS 0x1004 -#define ARCH_GET_CPUID 0x1011 -#define ARCH_SET_CPUID 0x1012 +#define ARCH_GET_CPUID 0x1011 +#define ARCH_SET_CPUID 0x1012 -#define ARCH_GET_XCOMP_SUPP 0x1021 -#define ARCH_GET_XCOMP_PERM 0x1022 -#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_SUPP 0x1021 +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 +#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 -#define ARCH_MAP_VDSO_X32 0x2001 -#define ARCH_MAP_VDSO_32 0x2002 -#define ARCH_MAP_VDSO_64 0x2003 +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 #endif /* _ASM_X86_PRCTL_H */ -- cgit v1.2.3-58-ga151 From 902d6364aad5fc8731ac5b19ed9a1d9c9ee0bd91 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:49 +0800 Subject: riscv: mm: init: remove unnecessary "#ifdef CONFIG_CRASH_DUMP" The is_kdump_kernel() returns false for !CRASH_DUMP case, so we don't need the #ifdef CONFIG_CRASH_DUMP for is_kdump_kernel() checking. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 0624c68331d8..122352823835 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -791,12 +791,10 @@ static void __init reserve_crashkernel(void) * since it doesn't make much sense and we have limited memory * resources. */ -#ifdef CONFIG_CRASH_DUMP if (is_kdump_kernel()) { pr_info("crashkernel: ignoring reservation request\n"); return; } -#endif ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); -- cgit v1.2.3-58-ga151 From 07aabe8fb6d1ac3163cc74c856521f2ee746270b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:50 +0800 Subject: riscv: mm: init: try best to use IS_ENABLED(CONFIG_64BIT) instead of #ifdef Try our best to replace the conditional compilation using "#ifdef CONFIG_64BIT" by a check for "IS_ENABLED(CONFIG_64BIT)", to simplify the code and to increase compile coverage. Now we can also remove the __maybe_unused used in max_mapped_addr declaration. We also remove the BUG_ON check of mapping the last 4K bytes of the addressable memory since this is always true for every kernel actually. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 43 ++++++++++++++++--------------------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 122352823835..fd506637c372 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -102,10 +102,9 @@ static void __init print_vm_layout(void) (unsigned long)VMALLOC_END); print_mlm("lowmem", (unsigned long)PAGE_OFFSET, (unsigned long)high_memory); -#ifdef CONFIG_64BIT - print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, - (unsigned long)ADDRESS_SPACE_END); -#endif + if (IS_ENABLED(CONFIG_64BIT)) + print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, + (unsigned long)ADDRESS_SPACE_END); } #else static void print_vm_layout(void) { } @@ -163,7 +162,7 @@ static void __init setup_bootmem(void) { phys_addr_t vmlinux_end = __pa_symbol(&_end); phys_addr_t vmlinux_start = __pa_symbol(&_start); - phys_addr_t __maybe_unused max_mapped_addr; + phys_addr_t max_mapped_addr; phys_addr_t phys_ram_end; #ifdef CONFIG_XIP_KERNEL @@ -172,17 +171,16 @@ static void __init setup_bootmem(void) memblock_enforce_memory_limit(memory_limit); - /* - * Reserve from the start of the kernel to the end of the kernel - */ -#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX) /* * Make sure we align the reservation on PMD_SIZE since we will * map the kernel in the linear mapping as read-only: we do not want * any allocation to happen between _end and the next pmd aligned page. */ - vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK; -#endif + if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK; + /* + * Reserve from the start of the kernel to the end of the kernel + */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); @@ -190,7 +188,6 @@ static void __init setup_bootmem(void) #ifndef CONFIG_XIP_KERNEL phys_ram_base = memblock_start_of_DRAM(); #endif -#ifndef CONFIG_64BIT /* * memblock allocator is not aware of the fact that last 4K bytes of * the addressable memory can not be mapped because of IS_ERR_VALUE @@ -200,10 +197,11 @@ static void __init setup_bootmem(void) * address space is occupied by the kernel mapping then this check must * be done as soon as the kernel mapping base address is determined. */ - max_mapped_addr = __pa(~(ulong)0); - if (max_mapped_addr == (phys_ram_end - 1)) - memblock_set_current_limit(max_mapped_addr - 4096); -#endif + if (!IS_ENABLED(CONFIG_64BIT)) { + max_mapped_addr = __pa(~(ulong)0); + if (max_mapped_addr == (phys_ram_end - 1)) + memblock_set_current_limit(max_mapped_addr - 4096); + } min_low_pfn = PFN_UP(phys_ram_base); max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); @@ -617,14 +615,6 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); BUG_ON((kernel_map.phys_addr % PMD_SIZE) != 0); -#ifdef CONFIG_64BIT - /* - * The last 4K bytes of the addressable memory can not be mapped because - * of IS_ERR_VALUE macro. - */ - BUG_ON((kernel_map.virt_addr + kernel_map.size) > ADDRESS_SPACE_END - SZ_4K); -#endif - pt_ops.alloc_pte = alloc_pte_early; pt_ops.get_pte_virt = get_pte_virt_early; #ifndef __PAGETABLE_PMD_FOLDED @@ -736,10 +726,9 @@ static void __init setup_vm_final(void) } } -#ifdef CONFIG_64BIT /* Map the kernel */ - create_kernel_page_table(swapper_pg_dir, false); -#endif + if (IS_ENABLED(CONFIG_64BIT)) + create_kernel_page_table(swapper_pg_dir, false); /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); -- cgit v1.2.3-58-ga151 From 3274a6ef3b1bd9faef940c40ef201723d5d3d056 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:51 +0800 Subject: riscv: mm: init: remove _pt_ops and use pt_ops directly Except "pt_ops", other global vars when CONFIG_XIP_KERNEL=y is defined as below: |foo_type foo; |#ifdef CONFIG_XIP_KERNEL |#define foo (*(foo_type *)XIP_FIXUP(&foo)) |#endif Follow the same way for pt_ops to unify the style and to simplify code. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index fd506637c372..f96acd26801f 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -227,12 +227,10 @@ static void __init setup_bootmem(void) } #ifdef CONFIG_MMU -static struct pt_alloc_ops _pt_ops __initdata; +static struct pt_alloc_ops pt_ops __initdata; #ifdef CONFIG_XIP_KERNEL -#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) -#else -#define pt_ops _pt_ops +#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) #endif unsigned long riscv_pfn_base __ro_after_init; -- cgit v1.2.3-58-ga151 From fe036db7d8a93cfbfd254f76e0ecf81aecbbf6b4 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:52 +0800 Subject: riscv: mm: init: try IS_ENABLED(CONFIG_XIP_KERNEL) instead of #ifdef Try our best to replace the conditional compilation using "#ifdef CONFIG_XIP_KERNEL" with "IS_ENABLED(CONFIG_XIP_KERNEL)", to simplify the code and to increase compile coverage. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index f96acd26801f..b20d6d2e184c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -161,13 +161,13 @@ early_param("mem", early_mem); static void __init setup_bootmem(void) { phys_addr_t vmlinux_end = __pa_symbol(&_end); - phys_addr_t vmlinux_start = __pa_symbol(&_start); phys_addr_t max_mapped_addr; - phys_addr_t phys_ram_end; + phys_addr_t phys_ram_end, vmlinux_start; -#ifdef CONFIG_XIP_KERNEL - vmlinux_start = __pa_symbol(&_sdata); -#endif + if (IS_ENABLED(CONFIG_XIP_KERNEL)) + vmlinux_start = __pa_symbol(&_sdata); + else + vmlinux_start = __pa_symbol(&_start); memblock_enforce_memory_limit(memory_limit); @@ -183,11 +183,9 @@ static void __init setup_bootmem(void) */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); - phys_ram_end = memblock_end_of_DRAM(); -#ifndef CONFIG_XIP_KERNEL - phys_ram_base = memblock_start_of_DRAM(); -#endif + if (!IS_ENABLED(CONFIG_XIP_KERNEL)) + phys_ram_base = memblock_start_of_DRAM(); /* * memblock allocator is not aware of the fact that last 4K bytes of * the addressable memory can not be mapped because of IS_ERR_VALUE -- cgit v1.2.3-58-ga151 From 805a3ebed59f81155ded218648db58bdc886a881 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 6 Dec 2021 23:03:53 +0800 Subject: riscv: mm: init: try best to remove #ifdef CONFIG_XIP_KERNEL usage Currently, the #ifdef CONFIG_XIP_KERNEL usage can be divided into the following three types: The first one is for functions/declarations only used in XIP case. The second one is for XIP_FIXUP case. Something as below: |foo_type foo; |#ifdef CONFIG_XIP_KERNEL |#define foo (*(foo_type *)XIP_FIXUP(&foo)) |#endif Usually, it's better to let the foo macro sit with the foo var together. But if various foos are defined adjacently, we can save some #ifdef CONFIG_XIP_KERNEL usage by grouping them together. The third one is for different implementations for XIP, usually, this is a #ifdef...#else...#endif case. This patch moves the pt_ops macro to adjacent #ifdef CONFIG_XIP_KERNEL and group first type usage cases into one. Signed-off-by: Jisheng Zhang Reviewed-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b20d6d2e184c..c1fffec37749 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -40,10 +40,6 @@ EXPORT_SYMBOL(kernel_map); phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); -#ifdef CONFIG_XIP_KERNEL -extern char _xiprom[], _exiprom[], __data_loc; -#endif - unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -227,10 +223,6 @@ static void __init setup_bootmem(void) #ifdef CONFIG_MMU static struct pt_alloc_ops pt_ops __initdata; -#ifdef CONFIG_XIP_KERNEL -#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) -#endif - unsigned long riscv_pfn_base __ro_after_init; EXPORT_SYMBOL(riscv_pfn_base); @@ -242,6 +234,7 @@ pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL +#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops)) #define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir)) #define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte)) #define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) @@ -446,6 +439,8 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) } #ifdef CONFIG_XIP_KERNEL +extern char _xiprom[], _exiprom[], __data_loc; + /* called from head.S with MMU off */ asmlinkage void __init __copy_data(void) { -- cgit v1.2.3-58-ga151 From fa68118144c63e292628945c5b8feb16b84fea7d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 19 Jan 2022 13:30:23 -0500 Subject: kvm: selftests: sync uapi/linux/kvm.h with Linux header KVM_CAP_XSAVE2 is out of sync due to a conflict. Copy the whole file while at it. Reported-by: Yang Zhong Signed-off-by: Paolo Bonzini --- tools/include/uapi/linux/kvm.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index f066637ee206..9563d294f181 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1131,7 +1131,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 #define KVM_CAP_ARM_MTE 205 #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 -#define KVM_CAP_XSAVE2 207 +#define KVM_CAP_VM_GPA_BITS 207 +#define KVM_CAP_XSAVE2 208 #ifdef KVM_CAP_IRQ_ROUTING @@ -1163,11 +1164,20 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; +struct kvm_irq_routing_xen_evtchn { + __u32 port; + __u32 vcpu; + __u32 priority; +}; + +#define KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL ((__u32)(-1)) + /* gsi routing entry types */ #define KVM_IRQ_ROUTING_IRQCHIP 1 #define KVM_IRQ_ROUTING_MSI 2 #define KVM_IRQ_ROUTING_S390_ADAPTER 3 #define KVM_IRQ_ROUTING_HV_SINT 4 +#define KVM_IRQ_ROUTING_XEN_EVTCHN 5 struct kvm_irq_routing_entry { __u32 gsi; @@ -1179,6 +1189,7 @@ struct kvm_irq_routing_entry { struct kvm_irq_routing_msi msi; struct kvm_irq_routing_s390_adapter adapter; struct kvm_irq_routing_hv_sint hv_sint; + struct kvm_irq_routing_xen_evtchn xen_evtchn; __u32 pad[8]; } u; }; @@ -1209,6 +1220,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) #define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) +#define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) struct kvm_xen_hvm_config { __u32 flags; @@ -1552,8 +1564,6 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_XSAVE */ #define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) #define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) /* Available with KVM_CAP_XCRS */ #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) @@ -1613,6 +1623,9 @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; -- cgit v1.2.3-58-ga151 From 96c852c8bf52af2d34654e700d9b5d2e8a99bae5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 19 Jan 2022 13:34:30 -0500 Subject: kvm: selftests: Do not indent with spaces Some indentation with spaces crept in, likely due to terminal-based cut and paste. Clean it up. Signed-off-by: Paolo Bonzini --- .../selftests/kvm/include/x86_64/processor.h | 10 ++-- tools/testing/selftests/kvm/lib/kvm_util.c | 8 +-- tools/testing/selftests/kvm/lib/x86_64/processor.c | 60 +++++++++++----------- tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c | 4 +- .../kvm/x86_64/vmx_close_while_nested_test.c | 4 +- .../testing/selftests/kvm/x86_64/xen_shinfo_test.c | 34 ++++++------ 6 files changed, 61 insertions(+), 59 deletions(-) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 122447827954..423d8a61bd2e 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -368,14 +368,14 @@ bool is_amd_cpu(void); static inline unsigned int x86_family(unsigned int eax) { - unsigned int x86; + unsigned int x86; - x86 = (eax >> 8) & 0xf; + x86 = (eax >> 8) & 0xf; - if (x86 == 0xf) - x86 += (eax >> 20) & 0xff; + if (x86 == 0xf) + x86 += (eax >> 20) & 0xff; - return x86; + return x86; } static inline unsigned int x86_model(unsigned int eax) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index c22a17aac6b0..8c53f96ab7fe 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -499,9 +499,11 @@ void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log) void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log, uint64_t first_page, uint32_t num_pages) { - struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot, - .first_page = first_page, - .num_pages = num_pages }; + struct kvm_clear_dirty_log args = { + .dirty_bitmap = log, .slot = slot, + .first_page = first_page, + .num_pages = num_pages + }; int ret; ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 59dcfe1967cc..5c8c270a9158 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1144,25 +1144,25 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); list->nmsrs = nmsrs; r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i", + r); state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0])); r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i", + r); r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i", + r); r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i", + r); r = vcpu_save_xsave_state(vm, vcpu, state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i", + r); if (kvm_check_cap(KVM_CAP_XCRS)) { r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs); @@ -1171,17 +1171,17 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) } r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i", + r); if (nested_size) { state->nested.size = sizeof(state->nested_); r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested); TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i", - r); + r); TEST_ASSERT(state->nested.size <= nested_size, - "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", - state->nested.size, nested_size); + "Nested state size too big, %i (KVM_CHECK_CAP gave %i)", + state->nested.size, nested_size); } else state->nested.size = 0; @@ -1189,12 +1189,12 @@ struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid) for (i = 0; i < nmsrs; i++) state->msrs.entries[i].index = list->indices[i]; r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs); - TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", - r, r == nmsrs ? -1 : list->indices[r]); + TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)", + r, r == nmsrs ? -1 : list->indices[r]); r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i", + r); free(list); return state; @@ -1207,7 +1207,7 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i", - r); + r); r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs); TEST_ASSERT(r == state->msrs.nmsrs, @@ -1222,28 +1222,28 @@ void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *s r = ioctl(vcpu->fd, KVM_SET_XSAVE, state->xsave); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i", - r); + r); r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i", + r); r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i", + r); r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i", + r); r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs); - TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", - r); + TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i", + r); if (state->nested.size) { r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested); TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i", - r); + r); } } diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c index 5a6a662f2e59..a426078b16a3 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c @@ -77,8 +77,8 @@ static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage) switch (get_ucall(vm, vcpuid, &uc)) { case UCALL_SYNC: TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && - uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", - stage + 1, (ulong)uc.args[1]); + uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); return; case UCALL_DONE: return; diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c index 2835a17f1b7a..edac8839e717 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c @@ -30,8 +30,8 @@ static struct kvm_vm *vm; static void l2_guest_code(void) { /* Exit to L0 */ - asm volatile("inb %%dx, %%al" - : : [port] "d" (PORT_L0_EXIT) : "rax"); + asm volatile("inb %%dx, %%al" + : : [port] "d" (PORT_L0_EXIT) : "rax"); } static void l1_guest_code(struct vmx_pages *vmx_pages) diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 478e0ae8b93e..865e17146815 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -46,20 +46,20 @@ static struct kvm_vm *vm; #define MIN_STEAL_TIME 50000 struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; + u32 version; + u32 pad0; + u64 tsc_timestamp; + u64 system_time; + u32 tsc_to_system_mul; + s8 tsc_shift; + u8 flags; + u8 pad[2]; } __attribute__((__packed__)); /* 32 bytes */ struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; + u32 version; + u32 sec; + u32 nsec; } __attribute__((__packed__)); struct vcpu_runstate_info { @@ -74,11 +74,11 @@ struct arch_vcpu_info { }; struct vcpu_info { - uint8_t evtchn_upcall_pending; - uint8_t evtchn_upcall_mask; - unsigned long evtchn_pending_sel; - struct arch_vcpu_info arch; - struct pvclock_vcpu_time_info time; + uint8_t evtchn_upcall_pending; + uint8_t evtchn_upcall_mask; + unsigned long evtchn_pending_sel; + struct arch_vcpu_info arch; + struct pvclock_vcpu_time_info time; }; /* 64 bytes (x86) */ struct shared_info { @@ -493,7 +493,7 @@ int main(int argc, char *argv[]) vm_ts.tv_sec = wc->sec; vm_ts.tv_nsec = wc->nsec; - TEST_ASSERT(wc->version && !(wc->version & 1), + TEST_ASSERT(wc->version && !(wc->version & 1), "Bad wallclock version %x", wc->version); TEST_ASSERT(cmp_timespec(&min_ts, &vm_ts) <= 0, "VM time too old"); TEST_ASSERT(cmp_timespec(&max_ts, &vm_ts) >= 0, "VM time too new"); -- cgit v1.2.3-58-ga151 From fc839c6d33c8828514f595822f457e51328507e5 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 11 Jan 2022 00:52:08 +0800 Subject: riscv: bpf: Fix eBPF's exception tables eBPF's exception tables needs to be modified to relative synchronously. Suggested-by: Tong Tiangen Signed-off-by: Jisheng Zhang Fixes: 1f77ed9422cb ("riscv: switch to relative extable and other improvements") Signed-off-by: Palmer Dabbelt --- arch/riscv/net/bpf_jit_comp64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 293dd6e171ed..0bcda99d1d68 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -497,7 +497,7 @@ static int add_exception_handler(const struct bpf_insn *insn, offset = pc - (long)&ex->insn; if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) return -ERANGE; - ex->insn = pc; + ex->insn = offset; /* * Since the extable follows the program, the fixup offset is always -- cgit v1.2.3-58-ga151 From b4966a7dc0725b2baa12b0aeb1489d52568a2aad Mon Sep 17 00:00:00 2001 From: Sam Shih Date: Wed, 19 Jan 2022 20:36:58 +0800 Subject: clk: mediatek: relicense mt7986 clock driver to GPL-2.0 The previous mt7986 clock drivers were incorrectly marked as GPL-1.0. This patch changes the driver to the standard GPL-2.0 license. Signed-off-by: Sam Shih Link: https://lore.kernel.org/r/20220119123658.10095-2-sam.shih@mediatek.com Reported-by: Lukas Bulwahn Signed-off-by: Stephen Boyd --- drivers/clk/mediatek/clk-mt7986-apmixed.c | 2 +- drivers/clk/mediatek/clk-mt7986-infracfg.c | 2 +- drivers/clk/mediatek/clk-mt7986-topckgen.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/clk/mediatek/clk-mt7986-apmixed.c b/drivers/clk/mediatek/clk-mt7986-apmixed.c index 76c8ebdeae96..98ec3887585f 100644 --- a/drivers/clk/mediatek/clk-mt7986-apmixed.c +++ b/drivers/clk/mediatek/clk-mt7986-apmixed.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-1.0 +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2021 MediaTek Inc. * Author: Sam Shih diff --git a/drivers/clk/mediatek/clk-mt7986-infracfg.c b/drivers/clk/mediatek/clk-mt7986-infracfg.c index 3be168c34fc0..f209c559fbc3 100644 --- a/drivers/clk/mediatek/clk-mt7986-infracfg.c +++ b/drivers/clk/mediatek/clk-mt7986-infracfg.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-1.0 +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2021 MediaTek Inc. * Author: Sam Shih diff --git a/drivers/clk/mediatek/clk-mt7986-topckgen.c b/drivers/clk/mediatek/clk-mt7986-topckgen.c index 8550e2be7773..8f6f79b6e31e 100644 --- a/drivers/clk/mediatek/clk-mt7986-topckgen.c +++ b/drivers/clk/mediatek/clk-mt7986-topckgen.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-1.0 +// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2021 MediaTek Inc. * Author: Sam Shih -- cgit v1.2.3-58-ga151 From 73031f761cb7c2397d73957d14d041c31fe58c34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Jan 2022 13:11:58 -0700 Subject: io-wq: delete dead lock shuffling code We used to have more code around the work loop, but now the goto and lock juggling just makes it less readable than it should. Get rid of it. Signed-off-by: Jens Axboe --- fs/io-wq.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 1efb134c98b7..013e12b9fabf 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -547,7 +547,7 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; -get_next: + /* * If we got some work, mark us as busy. If we didn't, but * the list isn't empty, it means we stalled on hashed work. @@ -606,11 +606,6 @@ get_next: spin_unlock_irq(&wq->hash->wait.lock); if (wq_has_sleeper(&wq->hash->wait)) wake_up(&wq->hash->wait); - raw_spin_lock(&wqe->lock); - /* skip unnecessary unlock-lock wqe->lock */ - if (!work) - goto get_next; - raw_spin_unlock(&wqe->lock); } } while (work); -- cgit v1.2.3-58-ga151 From 6191cf3ad59fda5901160633fef8e41b064a5246 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 18 Jan 2022 11:32:35 -0800 Subject: xfs: flush inodegc workqueue tasks before cancel The xfs_inodegc_stop() helper performs a high level flush of pending work on the percpu queues and then runs a cancel_work_sync() on each of the percpu work tasks to ensure all work has completed before returning. While cancel_work_sync() waits for wq tasks to complete, it does not guarantee work tasks have started. This means that the _stop() helper can queue and instantly cancel a wq task without having completed the associated work. This can be observed by tracepoint inspection of a simple "rm -f ; fsfreeze -f " test: xfs_destroy_inode: ... ino 0x83 ... xfs_inode_set_need_inactive: ... ino 0x83 ... xfs_inodegc_stop: ... ... xfs_inodegc_start: ... xfs_inodegc_worker: ... xfs_inode_inactivating: ... ino 0x83 ... The first few lines show that the inode is removed and need inactive state set, but the inactivation work has not completed before the inodegc mechanism stops. The inactivation doesn't actually occur until the fs is unfrozen and the gc mechanism starts back up. Note that this test requires fsfreeze to reproduce because xfs_freeze indirectly invokes xfs_fs_statfs(), which calls xfs_inodegc_flush(). When this occurs, the workqueue try_to_grab_pending() logic first tries to steal the pending bit, which does not succeed because the bit has been set by queue_work_on(). Subsequently, it checks for association of a pool workqueue from the work item under the pool lock. This association is set at the point a work item is queued and cleared when dequeued for processing. If the association exists, the work item is removed from the queue and cancel_work_sync() returns true. If the pwq association is cleared, the remove attempt assumes the task is busy and retries (eventually returning false to the caller after waiting for the work task to complete). To avoid this race, we can flush each work item explicitly before cancel. However, since the _queue_all() already schedules each underlying work item, the workqueue level helpers are sufficient to achieve the same ordering effect. E.g., the inodegc enabled flag prevents scheduling any further work in the _stop() case. Use the drain_workqueue() helper in this particular case to make the intent a bit more self explanatory. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_icache.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d019c98eb839..7a2a5e2be3cf 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1852,28 +1852,20 @@ xfs_inodegc_worker( } /* - * Force all currently queued inode inactivation work to run immediately, and - * wait for the work to finish. Two pass - queue all the work first pass, wait - * for it in a second pass. + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. */ void xfs_inodegc_flush( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_is_inodegc_enabled(mp)) return; trace_xfs_inodegc_flush(mp, __return_address); xfs_inodegc_queue_all(mp); - - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - flush_work(&gc->work); - } + flush_workqueue(mp->m_inodegc_wq); } /* @@ -1884,18 +1876,12 @@ void xfs_inodegc_stop( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_clear_inodegc_enabled(mp)) return; xfs_inodegc_queue_all(mp); + drain_workqueue(mp->m_inodegc_wq); - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - cancel_work_sync(&gc->work); - } trace_xfs_inodegc_stop(mp, __return_address); } -- cgit v1.2.3-58-ga151 From db1503d355a79d1d4255a9996f20e72848b74a56 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 17 Jan 2022 10:57:16 +0100 Subject: riscv: Get rid of MAXPHYSMEM configs CONFIG_MAXPHYSMEM_* are actually never used, even the nommu defconfigs selecting the MAXPHYSMEM_2GB had no effects on PAGE_OFFSET since it was preempted by !MMU case right before. In addition, the move of the kernel mapping at the end of the address space broke the use of MAXPHYSMEM_2G with MMU since it defines PAGE_OFFSET at the same address as the kernel mapping. Reported-by: Geert Uytterhoeven Fixes: 2bfc6cd81bd1 ("riscv: Move kernel mapping outside of linear mapping") Signed-off-by: Alexandre Ghiti Tested-by: Geert Uytterhoeven Tested-by: Conor Dooley Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 23 ++--------------------- arch/riscv/configs/nommu_k210_defconfig | 2 -- arch/riscv/configs/nommu_k210_sdcard_defconfig | 2 -- arch/riscv/configs/nommu_virt_defconfig | 1 - 4 files changed, 2 insertions(+), 26 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 821252b65f89..7a68a4106e5a 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -158,10 +158,9 @@ config PA_BITS config PAGE_OFFSET hex - default 0xC0000000 if 32BIT && MAXPHYSMEM_1GB + default 0xC0000000 if 32BIT default 0x80000000 if 64BIT && !MMU - default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB - default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB + default 0xffffffe000000000 if 64BIT config KASAN_SHADOW_OFFSET hex @@ -270,24 +269,6 @@ config MODULE_SECTIONS bool select HAVE_MOD_ARCH_SPECIFIC -choice - prompt "Maximum Physical Memory" - default MAXPHYSMEM_1GB if 32BIT - default MAXPHYSMEM_2GB if 64BIT && CMODEL_MEDLOW - default MAXPHYSMEM_128GB if 64BIT && CMODEL_MEDANY - - config MAXPHYSMEM_1GB - depends on 32BIT - bool "1GiB" - config MAXPHYSMEM_2GB - depends on 64BIT && CMODEL_MEDLOW - bool "2GiB" - config MAXPHYSMEM_128GB - depends on 64BIT && CMODEL_MEDANY - bool "128GiB" -endchoice - - config SMP bool "Symmetric Multi-Processing" help diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig index b16a2a12c82a..3b9f83221f9c 100644 --- a/arch/riscv/configs/nommu_k210_defconfig +++ b/arch/riscv/configs/nommu_k210_defconfig @@ -29,8 +29,6 @@ CONFIG_EMBEDDED=y CONFIG_SLOB=y # CONFIG_MMU is not set CONFIG_SOC_CANAAN=y -CONFIG_SOC_CANAAN_K210_DTB_SOURCE="k210_generic" -CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_CMDLINE="earlycon console=ttySIF0" diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig b/arch/riscv/configs/nommu_k210_sdcard_defconfig index 61f887f65419..d68b743d580f 100644 --- a/arch/riscv/configs/nommu_k210_sdcard_defconfig +++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig @@ -21,8 +21,6 @@ CONFIG_EMBEDDED=y CONFIG_SLOB=y # CONFIG_MMU is not set CONFIG_SOC_CANAAN=y -CONFIG_SOC_CANAAN_K210_DTB_SOURCE="k210_generic" -CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_NR_CPUS=2 CONFIG_CMDLINE="earlycon console=ttySIF0 rootdelay=2 root=/dev/mmcblk0p1 ro" diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig index e046a0babde4..f224be697785 100644 --- a/arch/riscv/configs/nommu_virt_defconfig +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -27,7 +27,6 @@ CONFIG_SLOB=y # CONFIG_SLAB_MERGE_DEFAULT is not set # CONFIG_MMU is not set CONFIG_SOC_VIRT=y -CONFIG_MAXPHYSMEM_2GB=y CONFIG_SMP=y CONFIG_CMDLINE="root=/dev/vda rw earlycon=uart8250,mmio,0x10000000,115200n8 console=ttyS0" CONFIG_CMDLINE_FORCE=y -- cgit v1.2.3-58-ga151 From f7ae02333d13f598da6ff6b94cf643255707f752 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:45 +0100 Subject: riscv: Move KASAN mapping next to the kernel mapping Now that KASAN_SHADOW_OFFSET is defined at compile time as a config, this value must remain constant whatever the size of the virtual address space, which is only possible by pushing this region at the end of the address space next to the kernel mapping. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- Documentation/riscv/vm-layout.rst | 12 ++++++------ arch/riscv/Kconfig | 4 ++-- arch/riscv/include/asm/kasan.h | 4 ++-- arch/riscv/include/asm/page.h | 2 -- arch/riscv/include/asm/pgtable.h | 22 +++++++++++++++------- arch/riscv/mm/init.c | 25 +++++++++++++------------ 6 files changed, 38 insertions(+), 31 deletions(-) diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst index b7f98930d38d..1bd687b97104 100644 --- a/Documentation/riscv/vm-layout.rst +++ b/Documentation/riscv/vm-layout.rst @@ -47,12 +47,12 @@ RISC-V Linux Kernel SV39 | Kernel-space virtual memory, shared between all processes: ____________________________________________________________|___________________________________________________________ | | | | - ffffffc000000000 | -256 GB | ffffffc7ffffffff | 32 GB | kasan - ffffffcefee00000 | -196 GB | ffffffcefeffffff | 2 MB | fixmap - ffffffceff000000 | -196 GB | ffffffceffffffff | 16 MB | PCI io - ffffffcf00000000 | -196 GB | ffffffcfffffffff | 4 GB | vmemmap - ffffffd000000000 | -192 GB | ffffffdfffffffff | 64 GB | vmalloc/ioremap space - ffffffe000000000 | -128 GB | ffffffff7fffffff | 124 GB | direct mapping of all physical memory + ffffffc6fee00000 | -228 GB | ffffffc6feffffff | 2 MB | fixmap + ffffffc6ff000000 | -228 GB | ffffffc6ffffffff | 16 MB | PCI io + ffffffc700000000 | -228 GB | ffffffc7ffffffff | 4 GB | vmemmap + ffffffc800000000 | -224 GB | ffffffd7ffffffff | 64 GB | vmalloc/ioremap space + ffffffd800000000 | -160 GB | fffffff6ffffffff | 124 GB | direct mapping of all physical memory + fffffff700000000 | -36 GB | fffffffeffffffff | 32 GB | kasan __________________|____________|__________________|_________|____________________________________________________________ | | diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7a68a4106e5a..7f8aea333291 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -160,12 +160,12 @@ config PAGE_OFFSET hex default 0xC0000000 if 32BIT default 0x80000000 if 64BIT && !MMU - default 0xffffffe000000000 if 64BIT + default 0xffffffd800000000 if 64BIT config KASAN_SHADOW_OFFSET hex depends on KASAN_GENERIC - default 0xdfffffc800000000 if 64BIT + default 0xdfffffff00000000 if 64BIT default 0xffffffff if 32BIT config ARCH_FLATMEM_ENABLE diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index b00f503ec124..257a2495145a 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -28,8 +28,8 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 #define KASAN_SHADOW_SIZE (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) -#define KASAN_SHADOW_START KERN_VIRT_START -#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE) +#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) +#define KASAN_SHADOW_END MODULES_LOWEST_VADDR #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) void kasan_init(void); diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index b3e5ff0125fe..a18c989e600c 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -33,8 +33,6 @@ */ #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) -#define KERN_VIRT_SIZE (-PAGE_OFFSET) - #ifndef __ASSEMBLY__ #define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index bf204e7c1f74..fa400b2abb36 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -24,6 +24,17 @@ #define KERNEL_LINK_ADDR PAGE_OFFSET #endif +/* Number of entries in the page global directory */ +#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) +/* Number of entries in the page table */ +#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) + +/* + * Half of the kernel address space (half of the entries of the page global + * directory) is for the direct mapping. + */ +#define KERN_VIRT_SIZE ((PTRS_PER_PGD / 2 * PGDIR_SIZE) / 2) + #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) @@ -39,8 +50,10 @@ /* Modules always live before the kernel */ #ifdef CONFIG_64BIT -#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G) -#define MODULES_END (PFN_ALIGN((unsigned long)&_start)) +/* This is used to define the end of the KASAN shadow region */ +#define MODULES_LOWEST_VADDR (KERNEL_LINK_ADDR - SZ_2G) +#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G) +#define MODULES_END (PFN_ALIGN((unsigned long)&_start)) #endif /* @@ -108,11 +121,6 @@ #endif /* CONFIG_XIP_KERNEL */ #ifdef CONFIG_MMU -/* Number of entries in the page global directory */ -#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) -/* Number of entries in the page table */ -#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) - /* Number of PGD entries that a user-mode program can use */ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 24b2b8044602..f515964e9c00 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -103,6 +103,9 @@ static void __init print_vm_layout(void) print_mlm("lowmem", (unsigned long)PAGE_OFFSET, (unsigned long)high_memory); #ifdef CONFIG_64BIT +#ifdef CONFIG_KASAN + print_mlm("kasan", KASAN_SHADOW_START, KASAN_SHADOW_END); +#endif print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR, (unsigned long)ADDRESS_SPACE_END); #endif @@ -130,18 +133,8 @@ void __init mem_init(void) print_vm_layout(); } -/* - * The default maximal physical memory size is -PAGE_OFFSET for 32-bit kernel, - * whereas for 64-bit kernel, the end of the virtual address space is occupied - * by the modules/BPF/kernel mappings which reduces the available size of the - * linear mapping. - * Limit the memory size via mem. - */ -#ifdef CONFIG_64BIT -static phys_addr_t memory_limit = -PAGE_OFFSET - SZ_4G; -#else -static phys_addr_t memory_limit = -PAGE_OFFSET; -#endif +/* Limit the memory size via mem. */ +static phys_addr_t memory_limit; static int __init early_mem(char *p) { @@ -612,6 +605,14 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) riscv_pfn_base = PFN_DOWN(kernel_map.phys_addr); + /* + * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit + * kernel, whereas for 64-bit kernel, the end of the virtual address + * space is occupied by the modules/BPF/kernel mappings which reduces + * the available size of the linear mapping. + */ + memory_limit = KERN_VIRT_SIZE - (IS_ENABLED(CONFIG_64BIT) ? SZ_4G : 0); + /* Sanity check alignment and size */ BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0); BUG_ON((kernel_map.phys_addr % PMD_SIZE) != 0); -- cgit v1.2.3-58-ga151 From 2efad17e5794f4223bbeff1b2c568e3afd9a8c22 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:46 +0100 Subject: riscv: Split early kasan mapping to prepare sv48 introduction Now that kasan shadow region is next to the kernel, for sv48, this region won't be aligned on PGDIR_SIZE and then when populating this region, we'll need to get down to lower levels of the page table. So instead of reimplementing the page table walk for the early population, take advantage of the existing functions used for the final population. Note that kasan swapper initialization must also be split since memblock is not initialized at this point and as the last PGD is shared with the kernel, we'd need to allocate a PUD so postpone the kasan final population after the kernel population is done. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kasan.h | 1 + arch/riscv/mm/init.c | 4 ++ arch/riscv/mm/kasan_init.c | 113 ++++++++++++++++++++++------------------- 3 files changed, 67 insertions(+), 51 deletions(-) diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 257a2495145a..2788e2c46609 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -34,6 +34,7 @@ void kasan_init(void); asmlinkage void kasan_early_init(void); +void kasan_swapper_init(void); #endif #endif diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index f515964e9c00..f5d0052c95ad 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -741,6 +741,10 @@ static void __init setup_vm_final(void) create_kernel_page_table(swapper_pg_dir, false); #endif +#ifdef CONFIG_KASAN + kasan_swapper_init(); +#endif + /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); clear_fixmap(FIX_PMD); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 54294f83513d..1434a0225140 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -12,44 +12,6 @@ #include extern pgd_t early_pg_dir[PTRS_PER_PGD]; -asmlinkage void __init kasan_early_init(void) -{ - uintptr_t i; - pgd_t *pgd = early_pg_dir + pgd_index(KASAN_SHADOW_START); - - BUILD_BUG_ON(KASAN_SHADOW_OFFSET != - KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); - - for (i = 0; i < PTRS_PER_PTE; ++i) - set_pte(kasan_early_shadow_pte + i, - mk_pte(virt_to_page(kasan_early_shadow_page), - PAGE_KERNEL)); - - for (i = 0; i < PTRS_PER_PMD; ++i) - set_pmd(kasan_early_shadow_pmd + i, - pfn_pmd(PFN_DOWN - (__pa((uintptr_t) kasan_early_shadow_pte)), - __pgprot(_PAGE_TABLE))); - - for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END; - i += PGDIR_SIZE, ++pgd) - set_pgd(pgd, - pfn_pgd(PFN_DOWN - (__pa(((uintptr_t) kasan_early_shadow_pmd))), - __pgprot(_PAGE_TABLE))); - - /* init for swapper_pg_dir */ - pgd = pgd_offset_k(KASAN_SHADOW_START); - - for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END; - i += PGDIR_SIZE, ++pgd) - set_pgd(pgd, - pfn_pgd(PFN_DOWN - (__pa(((uintptr_t) kasan_early_shadow_pmd))), - __pgprot(_PAGE_TABLE))); - - local_flush_tlb_all(); -} static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { @@ -108,26 +70,35 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); } -static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pgd(pgd_t *pgdp, + unsigned long vaddr, unsigned long end, + bool early) { phys_addr_t phys_addr; - pgd_t *pgdp = pgd_offset_k(vaddr); unsigned long next; do { next = pgd_addr_end(vaddr, end); - /* - * pgdp can't be none since kasan_early_init initialized all KASAN - * shadow region with kasan_early_shadow_pmd: if this is stillthe case, - * that means we can try to allocate a hugepage as a replacement. - */ - if (pgd_page_vaddr(*pgdp) == (unsigned long)lm_alias(kasan_early_shadow_pmd) && - IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { - phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); - if (phys_addr) { - set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) { + if (early) { + phys_addr = __pa((uintptr_t)kasan_early_shadow_pgd_next); + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_TABLE)); continue; + } else if (pgd_page_vaddr(*pgdp) == + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)) { + /* + * pgdp can't be none since kasan_early_init + * initialized all KASAN shadow region with + * kasan_early_shadow_pud: if this is still the + * case, that means we can try to allocate a + * hugepage as a replacement. + */ + phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE); + if (phys_addr) { + set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL)); + continue; + } } } @@ -135,12 +106,52 @@ static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end) } while (pgdp++, vaddr = next, vaddr != end); } +asmlinkage void __init kasan_early_init(void) +{ + uintptr_t i; + + BUILD_BUG_ON(KASAN_SHADOW_OFFSET != + KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT))); + + for (i = 0; i < PTRS_PER_PTE; ++i) + set_pte(kasan_early_shadow_pte + i, + mk_pte(virt_to_page(kasan_early_shadow_page), + PAGE_KERNEL)); + + for (i = 0; i < PTRS_PER_PMD; ++i) + set_pmd(kasan_early_shadow_pmd + i, + pfn_pmd(PFN_DOWN + (__pa((uintptr_t)kasan_early_shadow_pte)), + PAGE_TABLE)); + + if (pgtable_l4_enabled) { + for (i = 0; i < PTRS_PER_PUD; ++i) + set_pud(kasan_early_shadow_pud + i, + pfn_pud(PFN_DOWN + (__pa(((uintptr_t)kasan_early_shadow_pmd))), + PAGE_TABLE)); + } + + kasan_populate_pgd(early_pg_dir + pgd_index(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END, true); + + local_flush_tlb_all(); +} + +void __init kasan_swapper_init(void) +{ + kasan_populate_pgd(pgd_offset_k(KASAN_SHADOW_START), + KASAN_SHADOW_START, KASAN_SHADOW_END, true); + + local_flush_tlb_all(); +} + static void __init kasan_populate(void *start, void *end) { unsigned long vaddr = (unsigned long)start & PAGE_MASK; unsigned long vend = PAGE_ALIGN((unsigned long)end); - kasan_populate_pgd(vaddr, vend); + kasan_populate_pgd(pgd_offset_k(vaddr), vaddr, vend, false); local_flush_tlb_all(); memset(start, KASAN_SHADOW_INIT, end - start); -- cgit v1.2.3-58-ga151 From 840125a97abc7e676d839adc2743e8f703a156b3 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:47 +0100 Subject: riscv: Introduce functions to switch pt_ops This simply gathers the different pt_ops initialization in functions where a comment was added to explain why the page table operations must be changed along the boot process. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 74 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index f5d0052c95ad..424d7777f4df 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -581,6 +581,52 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) dtb_early_pa = dtb_pa; } +/* + * MMU is not enabled, the page tables are allocated directly using + * early_pmd/pud/p4d and the address returned is the physical one. + */ +void pt_ops_set_early(void) +{ + pt_ops.alloc_pte = alloc_pte_early; + pt_ops.get_pte_virt = get_pte_virt_early; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_early; + pt_ops.get_pmd_virt = get_pmd_virt_early; +#endif +} + +/* + * MMU is enabled but page table setup is not complete yet. + * fixmap page table alloc functions must be used as a means to temporarily + * map the allocated physical pages since the linear mapping does not exist yet. + * + * Note that this is called with MMU disabled, hence kernel_mapping_pa_to_va, + * but it will be used as described above. + */ +void pt_ops_set_fixmap(void) +{ + pt_ops.alloc_pte = kernel_mapping_pa_to_va((uintptr_t)alloc_pte_fixmap); + pt_ops.get_pte_virt = kernel_mapping_pa_to_va((uintptr_t)get_pte_virt_fixmap); +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); + pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); +#endif +} + +/* + * MMU is enabled and page table setup is complete, so from now, we can use + * generic page allocation functions to setup page table. + */ +void pt_ops_set_late(void) +{ + pt_ops.alloc_pte = alloc_pte_late; + pt_ops.get_pte_virt = get_pte_virt_late; +#ifndef __PAGETABLE_PMD_FOLDED + pt_ops.alloc_pmd = alloc_pmd_late; + pt_ops.get_pmd_virt = get_pmd_virt_late; +#endif +} + asmlinkage void __init setup_vm(uintptr_t dtb_pa) { pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; @@ -625,12 +671,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) BUG_ON((kernel_map.virt_addr + kernel_map.size) > ADDRESS_SPACE_END - SZ_4K); #endif - pt_ops.alloc_pte = alloc_pte_early; - pt_ops.get_pte_virt = get_pte_virt_early; -#ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = alloc_pmd_early; - pt_ops.get_pmd_virt = get_pmd_virt_early; -#endif + pt_ops_set_early(); + /* Setup early PGD for fixmap */ create_pgd_mapping(early_pg_dir, FIXADDR_START, (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); @@ -694,6 +736,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN); } #endif + + pt_ops_set_fixmap(); } static void __init setup_vm_final(void) @@ -702,16 +746,6 @@ static void __init setup_vm_final(void) phys_addr_t pa, start, end; u64 i; - /** - * MMU is enabled at this point. But page table setup is not complete yet. - * fixmap page table alloc functions should be used at this point - */ - pt_ops.alloc_pte = alloc_pte_fixmap; - pt_ops.get_pte_virt = get_pte_virt_fixmap; -#ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = alloc_pmd_fixmap; - pt_ops.get_pmd_virt = get_pmd_virt_fixmap; -#endif /* Setup swapper PGD for fixmap */ create_pgd_mapping(swapper_pg_dir, FIXADDR_START, __pa_symbol(fixmap_pgd_next), @@ -753,13 +787,7 @@ static void __init setup_vm_final(void) csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); local_flush_tlb_all(); - /* generic page allocation functions must be used to setup page table */ - pt_ops.alloc_pte = alloc_pte_late; - pt_ops.get_pte_virt = get_pte_virt_late; -#ifndef __PAGETABLE_PMD_FOLDED - pt_ops.alloc_pmd = alloc_pmd_late; - pt_ops.get_pmd_virt = get_pmd_virt_late; -#endif + pt_ops_set_late(); } #else asmlinkage void __init setup_vm(uintptr_t dtb_pa) -- cgit v1.2.3-58-ga151 From 3270bfdb9e4a01bb15d018612a6354c1837b5f97 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:48 +0100 Subject: riscv: Allow to dynamically define VA_BITS With 4-level page table folding at runtime, we don't know at compile time the size of the virtual address space so we must set VA_BITS dynamically so that sparsemem reserves the right amount of memory for struct pages. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 10 ---------- arch/riscv/include/asm/kasan.h | 2 +- arch/riscv/include/asm/pgtable.h | 10 ++++++++-- arch/riscv/include/asm/sparsemem.h | 6 +++++- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7f8aea333291..7945bacb1d0e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -146,16 +146,6 @@ config MMU Select if you want MMU-based virtualised addressing space support by paged memory management. If unsure, say 'Y'. -config VA_BITS - int - default 32 if 32BIT - default 39 if 64BIT - -config PA_BITS - int - default 34 if 32BIT - default 56 if 64BIT - config PAGE_OFFSET hex default 0xC0000000 if 32BIT diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 2788e2c46609..743e6ff57996 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -27,7 +27,7 @@ */ #define KASAN_SHADOW_SCALE_SHIFT 3 -#define KASAN_SHADOW_SIZE (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) +#define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) #define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) #define KASAN_SHADOW_END MODULES_LOWEST_VADDR #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index fa400b2abb36..23c7c66f2d79 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -61,8 +61,14 @@ * struct pages to map half the virtual address space. Then * position vmemmap directly below the VMALLOC region. */ +#ifdef CONFIG_64BIT +#define VA_BITS 39 +#else +#define VA_BITS 32 +#endif + #define VMEMMAP_SHIFT \ - (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) + (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT) #define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT) #define VMEMMAP_END (VMALLOC_START - 1) #define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE) @@ -661,7 +667,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * and give the kernel the other (upper) half. */ #ifdef CONFIG_64BIT -#define KERN_VIRT_START (-(BIT(CONFIG_VA_BITS)) + TASK_SIZE) +#define KERN_VIRT_START (-(BIT(VA_BITS)) + TASK_SIZE) #else #define KERN_VIRT_START FIXADDR_START #endif diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h index 45a7018a8118..63acaecc3374 100644 --- a/arch/riscv/include/asm/sparsemem.h +++ b/arch/riscv/include/asm/sparsemem.h @@ -4,7 +4,11 @@ #define _ASM_RISCV_SPARSEMEM_H #ifdef CONFIG_SPARSEMEM -#define MAX_PHYSMEM_BITS CONFIG_PA_BITS +#ifdef CONFIG_64BIT +#define MAX_PHYSMEM_BITS 56 +#else +#define MAX_PHYSMEM_BITS 34 +#endif /* CONFIG_64BIT */ #define SECTION_SIZE_BITS 27 #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3-58-ga151 From 60639f74c2f4fcc3ffa2ac0b120eaa874ccc713f Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:50 +0100 Subject: asm-generic: Prepare for riscv use of pud_alloc_one and pud_free In the following commits, riscv will almost use the generic versions of pud_alloc_one and pud_free but an additional check is required since those functions are only relevant when using at least a 4-level page table, which will be determined at runtime on riscv. So move the content of those functions into other functions that riscv can use without duplicating code. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- include/asm-generic/pgalloc.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 02932efad3ab..977bea16cf1b 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -147,6 +147,15 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #if CONFIG_PGTABLE_LEVELS > 3 +static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + return (pud_t *)get_zeroed_page(gfp); +} + #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate a page for PUD-level page table @@ -159,20 +168,23 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) */ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - gfp_t gfp = GFP_PGTABLE_USER; - - if (mm == &init_mm) - gfp = GFP_PGTABLE_KERNEL; - return (pud_t *)get_zeroed_page(gfp); + return __pud_alloc_one(mm, addr); } #endif -static inline void pud_free(struct mm_struct *mm, pud_t *pud) +static inline void __pud_free(struct mm_struct *mm, pud_t *pud) { BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); free_page((unsigned long)pud); } +#ifndef __HAVE_ARCH_PUD_FREE +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + __pud_free(mm, pud); +} +#endif + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #ifndef __HAVE_ARCH_PGD_FREE -- cgit v1.2.3-58-ga151 From e8a62cc26ddf53a3c6ba2a8d33036cf7b84f3923 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:51 +0100 Subject: riscv: Implement sv48 support By adding a new 4th level of page table, give the possibility to 64bit kernel to address 2^48 bytes of virtual address: in practice, that offers 128TB of virtual address space to userspace and allows up to 64TB of physical memory. If the underlying hardware does not support sv48, we will automatically fallback to a standard 3-level page table by folding the new PUD level into PGDIR level. In order to detect HW capabilities at runtime, we use SATP feature that ignores writes with an unsupported mode. Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 4 +- arch/riscv/include/asm/csr.h | 3 +- arch/riscv/include/asm/fixmap.h | 1 + arch/riscv/include/asm/kasan.h | 6 +- arch/riscv/include/asm/page.h | 14 +++ arch/riscv/include/asm/pgalloc.h | 40 ++++++ arch/riscv/include/asm/pgtable-64.h | 108 +++++++++++++++- arch/riscv/include/asm/pgtable.h | 24 +++- arch/riscv/kernel/head.S | 3 +- arch/riscv/mm/context.c | 4 +- arch/riscv/mm/init.c | 212 ++++++++++++++++++++++++++++---- arch/riscv/mm/kasan_init.c | 137 ++++++++++++++++++++- drivers/firmware/efi/libstub/efi-stub.c | 2 + 13 files changed, 514 insertions(+), 44 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7945bacb1d0e..c4289d755dd4 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -150,7 +150,7 @@ config PAGE_OFFSET hex default 0xC0000000 if 32BIT default 0x80000000 if 64BIT && !MMU - default 0xffffffd800000000 if 64BIT + default 0xffffaf8000000000 if 64BIT config KASAN_SHADOW_OFFSET hex @@ -201,7 +201,7 @@ config FIX_EARLYCON_MEM config PGTABLE_LEVELS int - default 3 if 64BIT + default 4 if 64BIT default 2 config LOCKDEP_SUPPORT diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 5046f431645c..ae711692eec9 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -40,14 +40,13 @@ #ifndef CONFIG_64BIT #define SATP_PPN _AC(0x003FFFFF, UL) #define SATP_MODE_32 _AC(0x80000000, UL) -#define SATP_MODE SATP_MODE_32 #define SATP_ASID_BITS 9 #define SATP_ASID_SHIFT 22 #define SATP_ASID_MASK _AC(0x1FF, UL) #else #define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL) #define SATP_MODE_39 _AC(0x8000000000000000, UL) -#define SATP_MODE SATP_MODE_39 +#define SATP_MODE_48 _AC(0x9000000000000000, UL) #define SATP_ASID_BITS 16 #define SATP_ASID_SHIFT 44 #define SATP_ASID_MASK _AC(0xFFFF, UL) diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h index 54cbf07fb4e9..58a718573ad6 100644 --- a/arch/riscv/include/asm/fixmap.h +++ b/arch/riscv/include/asm/fixmap.h @@ -24,6 +24,7 @@ enum fixed_addresses { FIX_HOLE, FIX_PTE, FIX_PMD, + FIX_PUD, FIX_TEXT_POKE1, FIX_TEXT_POKE0, FIX_EARLYCON_MEM_BASE, diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h index 743e6ff57996..0b85e363e778 100644 --- a/arch/riscv/include/asm/kasan.h +++ b/arch/riscv/include/asm/kasan.h @@ -28,7 +28,11 @@ #define KASAN_SHADOW_SCALE_SHIFT 3 #define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT)) -#define KASAN_SHADOW_START (KASAN_SHADOW_END - KASAN_SHADOW_SIZE) +/* + * Depending on the size of the virtual address space, the region may not be + * aligned on PGDIR_SIZE, so force its alignment to ease its population. + */ +#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK) #define KASAN_SHADOW_END MODULES_LOWEST_VADDR #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index a18c989e600c..160e3a1e8f8b 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -31,7 +31,20 @@ * When not using MMU this corresponds to the first free page in * physical memory (aligned on a page boundary). */ +#ifdef CONFIG_64BIT +#ifdef CONFIG_MMU +#define PAGE_OFFSET kernel_map.page_offset +#else +#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#endif +/* + * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so + * define the PAGE_OFFSET value for SV39. + */ +#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL) +#else #define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) +#endif /* CONFIG_64BIT */ #ifndef __ASSEMBLY__ @@ -84,6 +97,7 @@ extern unsigned long riscv_pfn_base; #endif /* CONFIG_MMU */ struct kernel_mapping { + unsigned long page_offset; unsigned long virt_addr; uintptr_t phys_addr; uintptr_t size; diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 0af6933a7100..11823004b87a 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -11,6 +11,8 @@ #include #ifdef CONFIG_MMU +#define __HAVE_ARCH_PUD_ALLOC_ONE +#define __HAVE_ARCH_PUD_FREE #include static inline void pmd_populate_kernel(struct mm_struct *mm, @@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); } + +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) +{ + if (pgtable_l4_enabled) { + unsigned long pfn = virt_to_pfn(pud); + + set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); + } +} + +static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, + pud_t *pud) +{ + if (pgtable_l4_enabled) { + unsigned long pfn = virt_to_pfn(pud); + + set_p4d_safe(p4d, + __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE)); + } +} + +#define pud_alloc_one pud_alloc_one +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + if (pgtable_l4_enabled) + return __pud_alloc_one(mm, addr); + + return NULL; +} + +#define pud_free pud_free +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + if (pgtable_l4_enabled) + __pud_free(mm, pud); +} + +#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud) #endif /* __PAGETABLE_PMD_FOLDED */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 228261aa9628..bbbdd66e5e2f 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -8,16 +8,36 @@ #include -#define PGDIR_SHIFT 30 +extern bool pgtable_l4_enabled; + +#define PGDIR_SHIFT_L3 30 +#define PGDIR_SHIFT_L4 39 +#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3) + +#define PGDIR_SHIFT (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3) /* Size of region mapped by a page global directory */ #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) +/* pud is folded into pgd in case of 3-level page table */ +#define PUD_SHIFT 30 +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) + #define PMD_SHIFT 21 /* Size of region mapped by a page middle directory */ #define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE - 1)) +/* Page Upper Directory entry */ +typedef struct { + unsigned long pud; +} pud_t; + +#define pud_val(x) ((x).pud) +#define __pud(x) ((pud_t) { (x) }) +#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t)) + /* Page Middle Directory entry */ typedef struct { unsigned long pmd; @@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); } +static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot) +{ + return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); +} + +static inline unsigned long _pud_pfn(pud_t pud) +{ + return pud_val(pud) >> _PAGE_PFN_SHIFT; +} + static inline pmd_t *pud_pgtable(pud_t pud) { return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT); @@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud) return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT); } +#define mm_pud_folded mm_pud_folded +static inline bool mm_pud_folded(struct mm_struct *mm) +{ + if (pgtable_l4_enabled) + return false; + + return true; +} + +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) + static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot) { return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot)); @@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd) #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e)) +#define pud_ERROR(e) \ + pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + if (pgtable_l4_enabled) + *p4dp = p4d; + else + set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) }); +} + +static inline int p4d_none(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (p4d_val(p4d) == 0); + + return 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (p4d_val(p4d) & _PAGE_PRESENT); + + return 1; +} + +static inline int p4d_bad(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return !p4d_present(p4d); + + return 0; +} + +static inline void p4d_clear(p4d_t *p4d) +{ + if (pgtable_l4_enabled) + set_p4d(p4d, __p4d(0)); +} + +static inline pud_t *p4d_pgtable(p4d_t p4d) +{ + if (pgtable_l4_enabled) + return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT); + + return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) }); +} + +static inline struct page *p4d_page(p4d_t p4d) +{ + return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT); +} + +#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) + +#define pud_offset pud_offset +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + if (pgtable_l4_enabled) + return p4d_pgtable(*p4d) + pud_index(address); + + return (pud_t *)p4d; +} + #endif /* _ASM_RISCV_PGTABLE_64_H */ diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 23c7c66f2d79..2ab6650d3926 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -62,7 +62,7 @@ * position vmemmap directly below the VMALLOC region. */ #ifdef CONFIG_64BIT -#define VA_BITS 39 +#define VA_BITS (pgtable_l4_enabled ? 48 : 39) #else #define VA_BITS 32 #endif @@ -102,8 +102,7 @@ #ifndef __ASSEMBLY__ -/* Page Upper Directory not used in RISC-V */ -#include +#include #include #include #include @@ -126,6 +125,17 @@ #define XIP_FIXUP(addr) (addr) #endif /* CONFIG_XIP_KERNEL */ +struct pt_alloc_ops { + pte_t *(*get_pte_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pte)(uintptr_t va); +#ifndef __PAGETABLE_PMD_FOLDED + pmd_t *(*get_pmd_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pmd)(uintptr_t va); + pud_t *(*get_pud_virt)(phys_addr_t pa); + phys_addr_t (*alloc_pud)(uintptr_t va); +#endif +}; + #ifdef CONFIG_MMU /* Number of PGD entries that a user-mode program can use */ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) @@ -677,9 +687,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, * Note that PGDIR_SIZE must evenly divide TASK_SIZE. */ #ifdef CONFIG_64BIT -#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) +#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) +#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2) #else -#define TASK_SIZE FIXADDR_START +#define TASK_SIZE FIXADDR_START +#define TASK_SIZE_MIN TASK_SIZE #endif #else /* CONFIG_MMU */ @@ -705,6 +717,8 @@ extern uintptr_t _dtb_early_pa; #define dtb_early_va _dtb_early_va #define dtb_early_pa _dtb_early_pa #endif /* CONFIG_XIP_KERNEL */ +extern u64 satp_mode; +extern bool pgtable_l4_enabled; void paging_init(void); void misc_mem_init(void); diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index f52f01ecbeea..10e718559960 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -105,7 +105,8 @@ relocate: /* Compute satp for kernel page tables, but don't load it yet */ srl a2, a0, PAGE_SHIFT - li a1, SATP_MODE + la a1, satp_mode + REG_L a1, 0(a1) or a2, a2, a1 /* diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index ea54cc0c9106..7acbfbd14557 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu) switch_mm_fast: csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | ((cntx & asid_mask) << SATP_ASID_SHIFT) | - SATP_MODE); + satp_mode); if (need_flush_tlb) local_flush_tlb_all(); @@ -201,7 +201,7 @@ switch_mm_fast: static void set_mm_noasid(struct mm_struct *mm) { /* Switch the page table and blindly nuke entire local TLB */ - csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE); + csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode); local_flush_tlb_all(); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 424d7777f4df..7ba915849817 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -37,6 +37,17 @@ EXPORT_SYMBOL(kernel_map); #define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map)) #endif +#ifdef CONFIG_64BIT +u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39; +#else +u64 satp_mode = SATP_MODE_32; +#endif +EXPORT_SYMBOL(satp_mode); + +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? + true : false; +EXPORT_SYMBOL(pgtable_l4_enabled); + phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); @@ -53,15 +64,6 @@ extern char _start[]; void *_dtb_early_va __initdata; uintptr_t _dtb_early_pa __initdata; -struct pt_alloc_ops { - pte_t *(*get_pte_virt)(phys_addr_t pa); - phys_addr_t (*alloc_pte)(uintptr_t va); -#ifndef __PAGETABLE_PMD_FOLDED - pmd_t *(*get_pmd_virt)(phys_addr_t pa); - phys_addr_t (*alloc_pmd)(uintptr_t va); -#endif -}; - static phys_addr_t dma32_phys_limit __initdata; static void __init zone_sizes_init(void) @@ -222,7 +224,7 @@ static void __init setup_bootmem(void) } #ifdef CONFIG_MMU -static struct pt_alloc_ops _pt_ops __initdata; +struct pt_alloc_ops _pt_ops __initdata; #ifdef CONFIG_XIP_KERNEL #define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops)) @@ -238,6 +240,7 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss; static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss; pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); +static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #ifdef CONFIG_XIP_KERNEL @@ -326,6 +329,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE); #define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd)) #endif /* CONFIG_XIP_KERNEL */ +static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE); + +#ifdef CONFIG_XIP_KERNEL +#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud)) +#define fixmap_pud ((pud_t *)XIP_FIXUP(fixmap_pud)) +#define early_pud ((pud_t *)XIP_FIXUP(early_pud)) +#endif /* CONFIG_XIP_KERNEL */ + static pmd_t *__init get_pmd_virt_early(phys_addr_t pa) { /* Before MMU is enabled */ @@ -345,7 +358,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa) static phys_addr_t __init alloc_pmd_early(uintptr_t va) { - BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT); return (uintptr_t)early_pmd; } @@ -391,21 +404,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp, create_pte_mapping(ptep, va, pa, sz, prot); } -#define pgd_next_t pmd_t -#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va) -#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa) +static pud_t *__init get_pud_virt_early(phys_addr_t pa) +{ + return (pud_t *)((uintptr_t)pa); +} + +static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa) +{ + clear_fixmap(FIX_PUD); + return (pud_t *)set_fixmap_offset(FIX_PUD, pa); +} + +static pud_t *__init get_pud_virt_late(phys_addr_t pa) +{ + return (pud_t *)__va(pa); +} + +static phys_addr_t __init alloc_pud_early(uintptr_t va) +{ + /* Only one PUD is available for early mapping */ + BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT); + + return (uintptr_t)early_pud; +} + +static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) +{ + return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); +} + +static phys_addr_t alloc_pud_late(uintptr_t va) +{ + unsigned long vaddr; + + vaddr = __get_free_page(GFP_KERNEL); + BUG_ON(!vaddr); + return __pa(vaddr); +} + +static void __init create_pud_mapping(pud_t *pudp, + uintptr_t va, phys_addr_t pa, + phys_addr_t sz, pgprot_t prot) +{ + pmd_t *nextp; + phys_addr_t next_phys; + uintptr_t pud_index = pud_index(va); + + if (sz == PUD_SIZE) { + if (pud_val(pudp[pud_index]) == 0) + pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot); + return; + } + + if (pud_val(pudp[pud_index]) == 0) { + next_phys = pt_ops.alloc_pmd(va); + pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE); + nextp = pt_ops.get_pmd_virt(next_phys); + memset(nextp, 0, PAGE_SIZE); + } else { + next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index])); + nextp = pt_ops.get_pmd_virt(next_phys); + } + + create_pmd_mapping(nextp, va, pa, sz, prot); +} + +#define pgd_next_t pud_t +#define alloc_pgd_next(__va) (pgtable_l4_enabled ? \ + pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va)) +#define get_pgd_next_virt(__pa) (pgtable_l4_enabled ? \ + pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa)) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ - create_pmd_mapping(__nextp, __va, __pa, __sz, __prot) -#define fixmap_pgd_next fixmap_pmd + (pgtable_l4_enabled ? \ + create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \ + create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot)) +#define fixmap_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd) +#define trampoline_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd) +#define early_dtb_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd) #else #define pgd_next_t pte_t #define alloc_pgd_next(__va) pt_ops.alloc_pte(__va) #define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa) #define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \ create_pte_mapping(__nextp, __va, __pa, __sz, __prot) -#define fixmap_pgd_next fixmap_pte +#define fixmap_pgd_next ((uintptr_t)fixmap_pte) +#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd) +#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot) #define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) -#endif +#endif /* __PAGETABLE_PMD_FOLDED */ void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, @@ -492,6 +581,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va) } #endif /* CONFIG_STRICT_KERNEL_RWX */ +#ifdef CONFIG_64BIT +static void __init disable_pgtable_l4(void) +{ + pgtable_l4_enabled = false; + kernel_map.page_offset = PAGE_OFFSET_L3; + satp_mode = SATP_MODE_39; +} + +/* + * There is a simple way to determine if 4-level is supported by the + * underlying hardware: establish 1:1 mapping in 4-level page table mode + * then read SATP to see if the configuration was taken into account + * meaning sv48 is supported. + */ +static __init void set_satp_mode(void) +{ + u64 identity_satp, hw_satp; + uintptr_t set_satp_mode_pmd; + + set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK; + create_pgd_mapping(early_pg_dir, + set_satp_mode_pmd, (uintptr_t)early_pud, + PGDIR_SIZE, PAGE_TABLE); + create_pud_mapping(early_pud, + set_satp_mode_pmd, (uintptr_t)early_pmd, + PUD_SIZE, PAGE_TABLE); + /* Handle the case where set_satp_mode straddles 2 PMDs */ + create_pmd_mapping(early_pmd, + set_satp_mode_pmd, set_satp_mode_pmd, + PMD_SIZE, PAGE_KERNEL_EXEC); + create_pmd_mapping(early_pmd, + set_satp_mode_pmd + PMD_SIZE, + set_satp_mode_pmd + PMD_SIZE, + PMD_SIZE, PAGE_KERNEL_EXEC); + + identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode; + + local_flush_tlb_all(); + csr_write(CSR_SATP, identity_satp); + hw_satp = csr_swap(CSR_SATP, 0ULL); + local_flush_tlb_all(); + + if (hw_satp != identity_satp) + disable_pgtable_l4(); + + memset(early_pg_dir, 0, PAGE_SIZE); + memset(early_pud, 0, PAGE_SIZE); + memset(early_pmd, 0, PAGE_SIZE); +} +#endif + /* * setup_vm() is called from head.S with MMU-off. * @@ -556,10 +696,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa) uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1); create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA, - IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa, + IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa, PGDIR_SIZE, IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL); + if (pgtable_l4_enabled) { + create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA, + (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE); + } + if (IS_ENABLED(CONFIG_64BIT)) { create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA, pa, PMD_SIZE, PAGE_KERNEL); @@ -592,6 +737,8 @@ void pt_ops_set_early(void) #ifndef __PAGETABLE_PMD_FOLDED pt_ops.alloc_pmd = alloc_pmd_early; pt_ops.get_pmd_virt = get_pmd_virt_early; + pt_ops.alloc_pud = alloc_pud_early; + pt_ops.get_pud_virt = get_pud_virt_early; #endif } @@ -610,6 +757,8 @@ void pt_ops_set_fixmap(void) #ifndef __PAGETABLE_PMD_FOLDED pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap); pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap); + pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap); + pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap); #endif } @@ -624,6 +773,8 @@ void pt_ops_set_late(void) #ifndef __PAGETABLE_PMD_FOLDED pt_ops.alloc_pmd = alloc_pmd_late; pt_ops.get_pmd_virt = get_pmd_virt_late; + pt_ops.alloc_pud = alloc_pud_late; + pt_ops.get_pud_virt = get_pud_virt_late; #endif } @@ -632,6 +783,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd; kernel_map.virt_addr = KERNEL_LINK_ADDR; + kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL); #ifdef CONFIG_XIP_KERNEL kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR; @@ -646,6 +798,11 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) kernel_map.phys_addr = (uintptr_t)(&_start); kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr; #endif + +#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL) + set_satp_mode(); +#endif + kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr; kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr; @@ -675,15 +832,21 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) /* Setup early PGD for fixmap */ create_pgd_mapping(early_pg_dir, FIXADDR_START, - (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); + fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE); #ifndef __PAGETABLE_PMD_FOLDED - /* Setup fixmap PMD */ + /* Setup fixmap PUD and PMD */ + if (pgtable_l4_enabled) + create_pud_mapping(fixmap_pud, FIXADDR_START, + (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE); create_pmd_mapping(fixmap_pmd, FIXADDR_START, (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); /* Setup trampoline PGD and PMD */ create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr, - (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); + trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE); + if (pgtable_l4_enabled) + create_pud_mapping(trampoline_pud, kernel_map.virt_addr, + (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE); #ifdef CONFIG_XIP_KERNEL create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr, kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC); @@ -711,7 +874,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) * Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap * range can not span multiple pmds. */ - BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) + BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); #ifndef __PAGETABLE_PMD_FOLDED @@ -782,9 +945,10 @@ static void __init setup_vm_final(void) /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); clear_fixmap(FIX_PMD); + clear_fixmap(FIX_PUD); /* Move to swapper page table */ - csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE); + csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode); local_flush_tlb_all(); pt_ops_set_late(); diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 1434a0225140..993f50571a3b 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -11,7 +11,29 @@ #include #include +/* + * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57 + * which is right before the kernel. + * + * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate + * the page global directory with kasan_early_shadow_pmd. + * + * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping + * must be divided as follows: + * - the first PGD entry, although incomplete, is populated with + * kasan_early_shadow_pud/p4d + * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d + * - the last PGD entry is shared with the kernel mapping so populated at the + * lower levels pud/p4d + * + * In addition, when shallow populating a kasan region (for example vmalloc), + * this region may also not be aligned on PGDIR size, so we must go down to the + * pud level too. + */ + extern pgd_t early_pg_dir[PTRS_PER_PGD]; +extern struct pt_alloc_ops _pt_ops __initdata; +#define pt_ops _pt_ops static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { @@ -35,15 +57,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE)); } -static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end) +static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end) { phys_addr_t phys_addr; pmd_t *pmdp, *base_pmd; unsigned long next; - base_pmd = (pmd_t *)pgd_page_vaddr(*pgd); - if (base_pmd == lm_alias(kasan_early_shadow_pmd)) + if (pud_none(*pud)) { base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + } else { + base_pmd = (pmd_t *)pud_pgtable(*pud); + if (base_pmd == lm_alias(kasan_early_shadow_pmd)) + base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + } pmdp = base_pmd + pmd_index(vaddr); @@ -67,9 +93,72 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned * it entirely, memblock could allocate a page at a physical address * where KASAN is not populated yet and then we'd get a page fault. */ - set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); + set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); +} + +static void __init kasan_populate_pud(pgd_t *pgd, + unsigned long vaddr, unsigned long end, + bool early) +{ + phys_addr_t phys_addr; + pud_t *pudp, *base_pud; + unsigned long next; + + if (early) { + /* + * We can't use pgd_page_vaddr here as it would return a linear + * mapping address but it is not mapped yet, but when populating + * early_pg_dir, we need the physical address and when populating + * swapper_pg_dir, we need the kernel virtual address so use + * pt_ops facility. + */ + base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd))); + } else { + base_pud = (pud_t *)pgd_page_vaddr(*pgd); + if (base_pud == lm_alias(kasan_early_shadow_pud)) + base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + } + + pudp = base_pud + pud_index(vaddr); + + do { + next = pud_addr_end(vaddr, end); + + if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) { + if (early) { + phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd)); + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE)); + continue; + } else { + phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE); + if (phys_addr) { + set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL)); + continue; + } + } + } + + kasan_populate_pmd(pudp, vaddr, next); + } while (pudp++, vaddr = next, vaddr != end); + + /* + * Wait for the whole PGD to be populated before setting the PGD in + * the page table, otherwise, if we did set the PGD before populating + * it entirely, memblock could allocate a page at a physical address + * where KASAN is not populated yet and then we'd get a page fault. + */ + if (!early) + set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE)); } +#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ? \ + (uintptr_t)kasan_early_shadow_pud : \ + (uintptr_t)kasan_early_shadow_pmd) +#define kasan_populate_pgd_next(pgdp, vaddr, next, early) \ + (pgtable_l4_enabled ? \ + kasan_populate_pud(pgdp, vaddr, next, early) : \ + kasan_populate_pmd((pud_t *)pgdp, vaddr, next)) + static void __init kasan_populate_pgd(pgd_t *pgdp, unsigned long vaddr, unsigned long end, bool early) @@ -102,7 +191,7 @@ static void __init kasan_populate_pgd(pgd_t *pgdp, } } - kasan_populate_pmd(pgdp, vaddr, next); + kasan_populate_pgd_next(pgdp, vaddr, next, early); } while (pgdp++, vaddr = next, vaddr != end); } @@ -157,18 +246,54 @@ static void __init kasan_populate(void *start, void *end) memset(start, KASAN_SHADOW_INIT, end - start); } +static void __init kasan_shallow_populate_pud(pgd_t *pgdp, + unsigned long vaddr, unsigned long end, + bool kasan_populate) +{ + unsigned long next; + pud_t *pudp, *base_pud; + pmd_t *base_pmd; + bool is_kasan_pmd; + + base_pud = (pud_t *)pgd_page_vaddr(*pgdp); + pudp = base_pud + pud_index(vaddr); + + if (kasan_populate) + memcpy(base_pud, (void *)kasan_early_shadow_pgd_next, + sizeof(pud_t) * PTRS_PER_PUD); + + do { + next = pud_addr_end(vaddr, end); + is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd)); + + if (is_kasan_pmd) { + base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE)); + } + } while (pudp++, vaddr = next, vaddr != end); +} + static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end) { unsigned long next; void *p; pgd_t *pgd_k = pgd_offset_k(vaddr); + bool is_kasan_pgd_next; do { next = pgd_addr_end(vaddr, end); - if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) { + is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) == + (unsigned long)lm_alias(kasan_early_shadow_pgd_next)); + + if (is_kasan_pgd_next) { p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } + + if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) + continue; + + kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next); } while (pgd_k++, vaddr = next, vaddr != end); } diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 26e69788f27a..b3db5d91ed38 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -40,6 +40,8 @@ #ifdef CONFIG_ARM64 # define EFI_RT_VIRTUAL_LIMIT DEFAULT_MAP_WINDOW_64 +#elif defined(CONFIG_RISCV) +# define EFI_RT_VIRTUAL_LIMIT TASK_SIZE_MIN #else # define EFI_RT_VIRTUAL_LIMIT TASK_SIZE #endif -- cgit v1.2.3-58-ga151 From 73c7c8f68e7266bd558227bd9c598cb90b1673cc Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:52 +0100 Subject: riscv: Use pgtable_l4_enabled to output mmu_type in cpuinfo Now that the mmu type is determined at runtime using SATP characteristic, use the global variable pgtable_l4_enabled to output mmu type of the processor through /proc/cpuinfo instead of relying on device tree infos. Signed-off-by: Alexandre Ghiti Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index f13b2c9ea912..ad0a7e9f828b 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -7,6 +7,7 @@ #include #include #include +#include /* * Returns the hart ID of the given device tree node, or -ENODEV if the node @@ -71,18 +72,19 @@ static void print_isa(struct seq_file *f, const char *isa) seq_puts(f, "\n"); } -static void print_mmu(struct seq_file *f, const char *mmu_type) +static void print_mmu(struct seq_file *f) { + char sv_type[16]; + #if defined(CONFIG_32BIT) - if (strcmp(mmu_type, "riscv,sv32") != 0) - return; + strncpy(sv_type, "sv32", 5); #elif defined(CONFIG_64BIT) - if (strcmp(mmu_type, "riscv,sv39") != 0 && - strcmp(mmu_type, "riscv,sv48") != 0) - return; + if (pgtable_l4_enabled) + strncpy(sv_type, "sv48", 5); + else + strncpy(sv_type, "sv39", 5); #endif - - seq_printf(f, "mmu\t\t: %s\n", mmu_type+6); + seq_printf(f, "mmu\t\t: %s\n", sv_type); } static void *c_start(struct seq_file *m, loff_t *pos) @@ -107,14 +109,13 @@ static int c_show(struct seq_file *m, void *v) { unsigned long cpu_id = (unsigned long)v - 1; struct device_node *node = of_get_cpu_node(cpu_id, NULL); - const char *compat, *isa, *mmu; + const char *compat, *isa; seq_printf(m, "processor\t: %lu\n", cpu_id); seq_printf(m, "hart\t\t: %lu\n", cpuid_to_hartid_map(cpu_id)); if (!of_property_read_string(node, "riscv,isa", &isa)) print_isa(m, isa); - if (!of_property_read_string(node, "mmu-type", &mmu)) - print_mmu(m, mmu); + print_mmu(m); if (!of_property_read_string(node, "compatible", &compat) && strcmp(compat, "riscv")) seq_printf(m, "uarch\t\t: %s\n", compat); -- cgit v1.2.3-58-ga151 From c774de22c430733487f70d755067d9ea55dbe6de Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 6 Dec 2021 11:46:53 +0100 Subject: riscv: Explicit comment about user virtual address space size Define precisely the size of the user accessible virtual space size for sv32/39/48 mmu types and explain why the whole virtual address space is split into 2 equal chunks between kernel and user space. Signed-off-by: Alexandre Ghiti Reviewed-by: Anup Patel Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2ab6650d3926..d0a96b507561 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -685,6 +685,15 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, /* * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32. * Note that PGDIR_SIZE must evenly divide TASK_SIZE. + * Task size is: + * - 0x9fc00000 (~2.5GB) for RV32. + * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * + * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V + * Instruction Set Manual Volume II: Privileged Architecture" states that + * "load and store effective addresses, which are 64bits, must have bits + * 63–48 all equal to bit 47, or else a page-fault exception will occur." */ #ifdef CONFIG_64BIT #define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2) -- cgit v1.2.3-58-ga151 From 20aa49541a2ea2cb767ada04cbcaf12fe3ca1275 Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Wed, 19 Jan 2022 11:38:36 +0800 Subject: riscv: fix boolconv.cocci warnings arch/riscv/mm/init.c:48:11-16: WARNING: conversion to bool not needed here Remove unneeded conversion to bool Semantic patch information: Relational and logical operators evaluate to bool, explicit conversion is overly verbose and unneeded. Generated by: scripts/coccinelle/misc/boolconv.cocci Reported-by: kernel test robot Signed-off-by: kernel test robot Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 8def6f8db0b0..cf4d018b7d66 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -44,8 +44,7 @@ u64 satp_mode = SATP_MODE_32; #endif EXPORT_SYMBOL(satp_mode); -bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL) ? - true : false; +bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL); EXPORT_SYMBOL(pgtable_l4_enabled); phys_addr_t phys_ram_base __ro_after_init; -- cgit v1.2.3-58-ga151 From 52d005337b2c94ab37273d9ad8382d4fb051defd Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 19 Jan 2022 22:00:29 -0600 Subject: smb3: send NTLMSSP version information For improved debugging it can be helpful to send version information as other clients do during NTLMSSP negotiation. See protocol document MS-NLMP section 2.2.1.1 Set the major and minor versions based on the kernel version, and the BuildNumber based on the internal cifs.ko module version number, and following the recommendation in the protocol documentation (MS-NLMP section 2.2.10) we set the NTLMRevisionCurrent field to 15. Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 1 + fs/cifs/ntlmssp.h | 30 +++++++++++++++++++++++- fs/cifs/sess.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2pdu.c | 2 +- 4 files changed, 101 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 9e5d9e192ef0..7c6f8180df69 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,5 +152,6 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ +#define SMB3_PRODUCT_BUILD 34 #define CIFS_VERSION "2.34" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 6d242af536cb..298458404252 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -40,7 +40,7 @@ #define NTLMSSP_REQUEST_NON_NT_KEY 0x400000 #define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 /* #define reserved4 0x1000000 */ -#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */ +#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we only set for SMB2+ */ /* #define reserved3 0x4000000 */ /* #define reserved2 0x8000000 */ /* #define reserved1 0x10000000 */ @@ -87,6 +87,30 @@ typedef struct _NEGOTIATE_MESSAGE { /* followed by WorkstationString */ } __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE; +#define NTLMSSP_REVISION_W2K3 0x0F + +/* See MS-NLMP section 2.2.2.10 */ +struct ntlmssp_version { + __u8 ProductMajorVersion; + __u8 ProductMinorVersion; + __le16 ProductBuild; /* we send the cifs.ko module version here */ + __u8 Reserved[3]; + __u8 NTLMRevisionCurrent; /* currently 0x0F */ +} __packed; + +/* see MS-NLMP section 2.2.1.1 */ +struct negotiate_message { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmNegotiate = 1 */ + __le32 NegotiateFlags; + SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */ + SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */ + struct ntlmssp_version Version; + /* SECURITY_BUFFER */ + char DomainString[0]; + /* followed by WorkstationString */ +} __packed; + typedef struct _CHALLENGE_MESSAGE { __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; __le32 MessageType; /* NtLmChallenge = 2 */ @@ -123,6 +147,10 @@ int build_ntlmssp_negotiate_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, struct TCP_Server_Info *server, const struct nls_table *nls_cp); +int build_ntlmssp_smb3_negotiate_blob(unsigned char **pbuffer, u16 *buflen, + struct cifs_ses *ses, + struct TCP_Server_Info *server, + const struct nls_table *nls_cp); int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, struct TCP_Server_Info *server, diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 15373a377a36..dc3b16d1be09 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -17,6 +17,8 @@ #include "nterr.h" #include #include +#include +#include "cifsfs.h" #include "cifs_spnego.h" #include "smb2proto.h" #include "fs_context.h" @@ -809,6 +811,74 @@ setup_ntlm_neg_ret: return rc; } +/* + * Build ntlmssp blob with additional fields, such as version, + * supported by modern servers. For safety limit to SMB3 or later + * See notes in MS-NLMP Section 2.2.2.1 e.g. + */ +int build_ntlmssp_smb3_negotiate_blob(unsigned char **pbuffer, + u16 *buflen, + struct cifs_ses *ses, + struct TCP_Server_Info *server, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct negotiate_message *sec_blob; + __u32 flags; + unsigned char *tmp; + int len; + + len = size_of_ntlmssp_blob(ses, sizeof(struct negotiate_message)); + *pbuffer = kmalloc(len, GFP_KERNEL); + if (!*pbuffer) { + rc = -ENOMEM; + cifs_dbg(VFS, "Error %d during NTLMSSP allocation\n", rc); + *buflen = 0; + goto setup_ntlm_smb3_neg_ret; + } + sec_blob = (struct negotiate_message *)*pbuffer; + + memset(*pbuffer, 0, sizeof(struct negotiate_message)); + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); + sec_blob->MessageType = NtLmNegotiate; + + /* BB is NTLMV2 session security format easier to use here? */ + flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | + NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC | + NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NTLMSSP_NEGOTIATE_SEAL | + NTLMSSP_NEGOTIATE_SIGN | NTLMSSP_NEGOTIATE_VERSION; + if (!server->session_estab || ses->ntlmssp->sesskey_per_smbsess) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH; + + sec_blob->Version.ProductMajorVersion = LINUX_VERSION_MAJOR; + sec_blob->Version.ProductMinorVersion = LINUX_VERSION_PATCHLEVEL; + sec_blob->Version.ProductBuild = cpu_to_le16(SMB3_PRODUCT_BUILD); + sec_blob->Version.NTLMRevisionCurrent = NTLMSSP_REVISION_W2K3; + + tmp = *pbuffer + sizeof(struct negotiate_message); + ses->ntlmssp->client_flags = flags; + sec_blob->NegotiateFlags = cpu_to_le32(flags); + + /* these fields should be null in negotiate phase MS-NLMP 3.1.5.1.1 */ + cifs_security_buffer_from_str(&sec_blob->DomainName, + NULL, + CIFS_MAX_DOMAINNAME_LEN, + *pbuffer, &tmp, + nls_cp); + + cifs_security_buffer_from_str(&sec_blob->WorkstationName, + NULL, + CIFS_MAX_WORKSTATION_LEN, + *pbuffer, &tmp, + nls_cp); + + *buflen = tmp - *pbuffer; +setup_ntlm_smb3_neg_ret: + return rc; +} + + int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 1e670e56b07a..7e7909b1ae11 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1506,7 +1506,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) if (rc) goto out_err; - rc = build_ntlmssp_negotiate_blob(&ntlmssp_blob, + rc = build_ntlmssp_smb3_negotiate_blob(&ntlmssp_blob, &blob_length, ses, server, sess_data->nls_cp); if (rc) -- cgit v1.2.3-58-ga151 From 51620150ca2df62f8ea472ab8962be590c957288 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 19 Jan 2022 22:11:33 -0600 Subject: cifs: update internal module number To 2.35 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7c6f8180df69..15a5c5db038b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -152,6 +152,6 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define SMB3_PRODUCT_BUILD 34 -#define CIFS_VERSION "2.34" +#define SMB3_PRODUCT_BUILD 35 +#define CIFS_VERSION "2.35" #endif /* _CIFSFS_H */ -- cgit v1.2.3-58-ga151 From 7c1cf55577782725ea2bc24687767c8fe8e57486 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 19 Jan 2022 09:04:31 +0800 Subject: gpio: idt3243x: Fix an ignored error return from platform_get_irq() The return from the call to platform_get_irq() is int, it can be a negative error code, however this is being assigned to an unsigned int variable 'parent_irq', so making 'parent_irq' an int. Eliminate the following coccicheck warning: ./drivers/gpio/gpio-idt3243x.c:167:6-16: WARNING: Unsigned expression compared with zero: parent_irq < 0 Reported-by: Abaci Robot Fixes: 30fee1d7462a ("gpio: idt3243x: Fix IRQ check in idt_gpio_probe") Signed-off-by: Yang Li Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-idt3243x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-idt3243x.c b/drivers/gpio/gpio-idt3243x.c index 08493b05be2d..52b8b72ded77 100644 --- a/drivers/gpio/gpio-idt3243x.c +++ b/drivers/gpio/gpio-idt3243x.c @@ -132,7 +132,7 @@ static int idt_gpio_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct gpio_irq_chip *girq; struct idt_gpio_ctrl *ctrl; - unsigned int parent_irq; + int parent_irq; int ngpios; int ret; -- cgit v1.2.3-58-ga151 From 9f51ce0b9e73f83bab2442b36d5e247a81bd3401 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 19 Jan 2022 09:04:32 +0800 Subject: gpio: mpc8xxx: Fix an ignored error return from platform_get_irq() The return from the call to platform_get_irq() is int, it can be a negative error code, however this is being assigned to an unsigned int variable 'irqn', so making 'irqn' an int. Eliminate the following coccicheck warning: ./drivers/gpio/gpio-mpc8xxx.c:391:5-21: WARNING: Unsigned expression compared with zero: mpc8xxx_gc -> irqn < 0 Reported-by: Abaci Robot Fixes: 0b39536cc699 ("gpio: mpc8xxx: Fix IRQ check in mpc8xxx_probe") Signed-off-by: Yang Li Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mpc8xxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 01634c8d27b3..a964e25ea620 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -47,7 +47,7 @@ struct mpc8xxx_gpio_chip { unsigned offset, int value); struct irq_domain *irq; - unsigned int irqn; + int irqn; }; /* -- cgit v1.2.3-58-ga151 From 440323b6cf5b9896013a78c4f578823e8243a7fd Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Fri, 14 Jan 2022 18:58:57 +0800 Subject: asm-generic: Add missing brackets for io_stop_wc macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After using io_stop_wc(), drivers reports following compile error when compiled on X86. drivers/net/ethernet/hisilicon/hns3/hns3_enet.c: In function ‘hns3_tx_push_bd’: drivers/net/ethernet/hisilicon/hns3/hns3_enet.c:2058:12: error: expected ‘;’ before ‘(’ token io_stop_wc(); ^ It is because I missed to add the brackets after io_stop_wc macro. So let's add the missing brackets. Fixes: d5624bb29f49 ("asm-generic: introduce io_stop_wc() and add implementation for ARM64") Reported-by: Guangbin Huang Signed-off-by: Xiongfeng Wang Link: https://lore.kernel.org/r/20220114105857.126300-1-wangxiongfeng2@huawei.com Signed-off-by: Catalin Marinas --- include/asm-generic/barrier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 4c2c1b830344..3385ca5ca5c0 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -259,7 +259,7 @@ do { \ * write-combining memory accesses before this macro with those after it. */ #ifndef io_stop_wc -#define io_stop_wc do { } while (0) +#define io_stop_wc() do { } while (0) #endif #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3-58-ga151 From 3364c6ce23c6e347c63fc895e0d6d1e8a9407849 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jan 2022 12:22:59 -0800 Subject: arm64: atomics: lse: Dereference matching size When building with -Warray-bounds, the following warning is generated: In file included from ./arch/arm64/include/asm/lse.h:16, from ./arch/arm64/include/asm/cmpxchg.h:14, from ./arch/arm64/include/asm/atomic.h:16, from ./include/linux/atomic.h:7, from ./include/asm-generic/bitops/atomic.h:5, from ./arch/arm64/include/asm/bitops.h:25, from ./include/linux/bitops.h:33, from ./include/linux/kernel.h:22, from kernel/printk/printk.c:22: ./arch/arm64/include/asm/atomic_lse.h:247:9: warning: array subscript 'long unsigned int[0]' is partly outside array bounds of 'atomic_t[1]' [-Warray-bounds] 247 | asm volatile( \ | ^~~ ./arch/arm64/include/asm/atomic_lse.h:266:1: note: in expansion of macro '__CMPXCHG_CASE' 266 | __CMPXCHG_CASE(w, , acq_, 32, a, "memory") | ^~~~~~~~~~~~~~ kernel/printk/printk.c:3606:17: note: while referencing 'printk_cpulock_owner' 3606 | static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); | ^~~~~~~~~~~~~~~~~~~~ This is due to the compiler seeing an unsigned long * cast against something (atomic_t) that is int sized. Replace the cast with the matching size cast. This results in no change in binary output. Note that __ll_sc__cmpxchg_case_##name##sz already uses the same constraint: [v] "+Q" (*(u##sz *)ptr Which is why only the LSE form needs updating and not the LL/SC form, so this change is unlikely to be problematic. Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: linux-arm-kernel@lists.infradead.org Acked-by: Ard Biesheuvel Acked-by: Mark Rutland Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220112202259.3950286-1-keescook@chromium.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/atomic_lse.h | 2 +- arch/arm64/include/asm/cmpxchg.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index d955ade5df7c..5d460f6b7675 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -249,7 +249,7 @@ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \ " mov %" #w "[tmp], %" #w "[old]\n" \ " cas" #mb #sfx "\t%" #w "[tmp], %" #w "[new], %[v]\n" \ " mov %" #w "[ret], %" #w "[tmp]" \ - : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr), \ + : [ret] "+r" (x0), [v] "+Q" (*(u##sz *)ptr), \ [tmp] "=&r" (tmp) \ : [old] "r" (x1), [new] "r" (x2) \ : cl); \ diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index f9bef42c1411..497acf134d99 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -243,7 +243,7 @@ static inline void __cmpwait_case_##sz(volatile void *ptr, \ " cbnz %" #w "[tmp], 1f\n" \ " wfe\n" \ "1:" \ - : [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr) \ + : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \ : [val] "r" (val)); \ } -- cgit v1.2.3-58-ga151 From bb425a7598479fa0f171ec806033c440f218b0ce Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Wed, 15 Dec 2021 14:45:58 +0800 Subject: arm64: mm: apply __ro_after_init to memory_limit This variable is only set during initialization, so mark with __ro_after_init. Signed-off-by: Peng Fan Reviewed-by: David Hildenbrand Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20211215064559.2843555-1-peng.fan@oss.nxp.com Signed-off-by: Catalin Marinas --- arch/arm64/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index a8834434af99..db63cc885771 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -172,7 +172,7 @@ int pfn_is_map_memory(unsigned long pfn) } EXPORT_SYMBOL(pfn_is_map_memory); -static phys_addr_t memory_limit = PHYS_ADDR_MAX; +static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX; /* * Limit the memory size that was specified via FDT. -- cgit v1.2.3-58-ga151 From 3ee859e384d453d6ac68bfd5971f630d9fa46ad3 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 9 Jan 2022 18:36:43 +0900 Subject: block: Fix wrong offset in bio_truncate() bio_truncate() clears the buffer outside of last block of bdev, however current bio_truncate() is using the wrong offset of page. So it can return the uninitialized data. This happened when both of truncated/corrupted FS and userspace (via bdev) are trying to read the last of bdev. Reported-by: syzbot+ac94ae5f68b84197f41c@syzkaller.appspotmail.com Signed-off-by: OGAWA Hirofumi Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/875yqt1c9g.fsf@mail.parknet.co.jp Signed-off-by: Jens Axboe --- block/bio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 0d400ba2dbd1..4312a8085396 100644 --- a/block/bio.c +++ b/block/bio.c @@ -569,7 +569,8 @@ static void bio_truncate(struct bio *bio, unsigned new_size) offset = new_size - done; else offset = 0; - zero_user(bv.bv_page, offset, bv.bv_len - offset); + zero_user(bv.bv_page, bv.bv_offset + offset, + bv.bv_len - offset); truncated = true; } done += bv.bv_len; -- cgit v1.2.3-58-ga151 From 2a1355f0bf41a2132d522ed7a2a7eb1cc4fe3d8f Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Thu, 20 Jan 2022 10:56:18 +0000 Subject: ALSA: hda/cs8409: Add new Warlock SKUs to patch_cs8409 Signed-off-by: Stefan Binding Signed-off-by: Vitaly Rodionov Link: https://lore.kernel.org/r/20220120105618.249144-1-vitalyr@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_cs8409-tables.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_cs8409-tables.c b/sound/pci/hda/patch_cs8409-tables.c index df0b4522babf..2d1fa706327b 100644 --- a/sound/pci/hda/patch_cs8409-tables.c +++ b/sound/pci/hda/patch_cs8409-tables.c @@ -490,6 +490,8 @@ const struct snd_pci_quirk cs8409_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0ADC, "Warlock", CS8409_WARLOCK), SND_PCI_QUIRK(0x1028, 0x0AF4, "Warlock", CS8409_WARLOCK), SND_PCI_QUIRK(0x1028, 0x0AF5, "Warlock", CS8409_WARLOCK), + SND_PCI_QUIRK(0x1028, 0x0BB5, "Warlock N3 15 TGL-U Nuvoton EC", CS8409_WARLOCK), + SND_PCI_QUIRK(0x1028, 0x0BB6, "Warlock V3 15 TGL-U Nuvoton EC", CS8409_WARLOCK), SND_PCI_QUIRK(0x1028, 0x0A77, "Cyborg", CS8409_CYBORG), SND_PCI_QUIRK(0x1028, 0x0A78, "Cyborg", CS8409_CYBORG), SND_PCI_QUIRK(0x1028, 0x0A79, "Cyborg", CS8409_CYBORG), -- cgit v1.2.3-58-ga151 From 6e10e21915c1ab6eaa145f7b5ebaf4500af1b011 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 8 Sep 2021 16:09:08 -0300 Subject: tools headers UAPI: Sync files changed by new set_mempolicy_home_node syscall To pick the changes in these csets: 21b084fdf2a49ca1 ("mm/mempolicy: wire up syscall set_mempolicy_home_node") That add support for this new syscall in tools such as 'perf trace'. For instance, this is now possible: [root@five ~]# perf trace -e set_mempolicy_home_node ^C[root@five ~]# [root@five ~]# perf trace -v -e set_mempolicy_home_node Using CPUID AuthenticAMD-25-21-0 event qualifier tracepoint filter: (common_pid != 253729 && common_pid != 3585) && (id == 450) mmap size 528384B ^C[root@five ~] [root@five ~]# perf trace -v -e set* --max-events 5 Using CPUID AuthenticAMD-25-21-0 event qualifier tracepoint filter: (common_pid != 253734 && common_pid != 3585) && (id == 38 || id == 54 || id == 105 || id == 106 || id == 109 || id == 112 || id == 113 || id == 114 || id == 116 || id == 117 || id == 119 || id == 122 || id == 123 || id == 141 || id == 160 || id == 164 || id == 170 || id == 171 || id == 188 || id == 205 || id == 218 || id == 238 || id == 273 || id == 308 || id == 450) mmap size 528384B 0.000 ( 0.008 ms): bash/253735 setpgid(pid: 253735 (bash), pgid: 253735 (bash)) = 0 6849.011 ( 0.008 ms): bash/16046 setpgid(pid: 253736 (bash), pgid: 253736 (bash)) = 0 6849.080 ( 0.005 ms): bash/253736 setpgid(pid: 253736 (bash), pgid: 253736 (bash)) = 0 7437.718 ( 0.009 ms): gnome-shell/253737 set_robust_list(head: 0x7f34b527e920, len: 24) = 0 13445.986 ( 0.010 ms): bash/16046 setpgid(pid: 253738 (bash), pgid: 253738 (bash)) = 0 [root@five ~]# That is the filter expression attached to the raw_syscalls:sys_{enter,exit} tracepoints. $ find tools/perf/arch/ -name "syscall*tbl" | xargs grep -w set_mempolicy_home_node tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl:450 common set_mempolicy_home_node sys_set_mempolicy_home_node tools/perf/arch/powerpc/entry/syscalls/syscall.tbl:450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node tools/perf/arch/s390/entry/syscalls/syscall.tbl:450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node tools/perf/arch/x86/entry/syscalls/syscall_64.tbl:450 common set_mempolicy_home_node sys_set_mempolicy_home_node $ $ grep -w set_mempolicy_home_node /tmp/build/perf/arch/x86/include/generated/asm/syscalls_64.c [450] = "set_mempolicy_home_node", $ This addresses these perf build warnings: Warning: Kernel ABI header at 'tools/include/uapi/asm-generic/unistd.h' differs from latest version at 'include/uapi/asm-generic/unistd.h' diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h Warning: Kernel ABI header at 'tools/perf/arch/x86/entry/syscalls/syscall_64.tbl' differs from latest version at 'arch/x86/entry/syscalls/syscall_64.tbl' diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl Warning: Kernel ABI header at 'tools/perf/arch/powerpc/entry/syscalls/syscall.tbl' differs from latest version at 'arch/powerpc/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/s390/entry/syscalls/syscall.tbl' differs from latest version at 'arch/s390/kernel/syscalls/syscall.tbl' diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl Warning: Kernel ABI header at 'tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl' differs from latest version at 'arch/mips/kernel/syscalls/syscall_n64.tbl' diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl Cc: Aneesh Kumar K.V Cc: Linus Torvalds Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 5 ++++- tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 1 + tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/s390/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 1 + 5 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 4557a8b6086f..1c48b0ae3ba3 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) + #undef __NR_syscalls -#define __NR_syscalls 450 +#define __NR_syscalls 451 /* * 32 bit systems traditionally used different diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index e2c481fcede6..3f1886ad9d80 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -364,3 +364,4 @@ # 447 reserved for memfd_secret 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index 15109af9d075..2600b4237292 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -529,3 +529,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index ed9c5c2eafad..799147658dee 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index fe8f8dd157b4..c84d12608cd2 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,6 +371,7 @@ 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node # # Due to a historical design error, certain syscalls are numbered differently -- cgit v1.2.3-58-ga151 From 3938d5a2f9369d1ebd56320629fed395ce327e9c Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Thu, 16 Dec 2021 13:35:38 +0100 Subject: riscv: default to CONFIG_RISCV_SBI_V01=n The SBI 0.1 specification is obsolete. The current version is 0.3. Hence we should not rely by default on SBI 0.1 being implemented. Signed-off-by: Heinrich Schuchardt Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 38205f25c0d7..86553d8c6b07 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -369,7 +369,6 @@ source "kernel/Kconfig.hz" config RISCV_SBI_V01 bool "SBI v0.1 support" - default y depends on RISCV_SBI help This config allows kernel to use SBI v0.1 APIs. This will be -- cgit v1.2.3-58-ga151 From a0f4ba7f51ea736a6b4ccf58563507d7af9128fb Mon Sep 17 00:00:00 2001 From: Jinrong Liang Date: Wed, 19 Jan 2022 21:39:10 +0800 Subject: selftests: kvm/x86: Fix the warning in pmu_event_filter_test.c The following warning appears when executing make -C tools/testing/selftests/kvm x86_64/pmu_event_filter_test.c: In function 'vcpu_supports_intel_br_retired': x86_64/pmu_event_filter_test.c:241:28: warning: variable 'cpuid' set but not used [-Wunused-but-set-variable] 241 | struct kvm_cpuid2 *cpuid; | ^~~~~ x86_64/pmu_event_filter_test.c: In function 'vcpu_supports_amd_zen_br_retired': x86_64/pmu_event_filter_test.c:258:28: warning: variable 'cpuid' set but not used [-Wunused-but-set-variable] 258 | struct kvm_cpuid2 *cpuid; | ^~~~~ Just delete the unused variables to stay away from warnings. Fixes: dc7e75b3b3ee ("selftests: kvm/x86: Add test for KVM_SET_PMU_EVENT_FILTER") Signed-off-by: Jinrong Liang Message-Id: <20220119133910.56285-1-cloudliang@tencent.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c index aa104946e6e0..c715adcbd487 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c @@ -347,9 +347,7 @@ static bool check_intel_pmu_leaf(struct kvm_cpuid_entry2 *entry) static bool use_intel_pmu(void) { struct kvm_cpuid_entry2 *entry; - struct kvm_cpuid2 *cpuid; - cpuid = kvm_get_supported_cpuid(); entry = kvm_get_supported_cpuid_index(0xa, 0); return is_intel_cpu() && entry && check_intel_pmu_leaf(entry); } @@ -381,9 +379,7 @@ static bool is_zen3(uint32_t eax) static bool use_amd_pmu(void) { struct kvm_cpuid_entry2 *entry; - struct kvm_cpuid2 *cpuid; - cpuid = kvm_get_supported_cpuid(); entry = kvm_get_supported_cpuid_index(1, 0); return is_amd_cpu() && entry && (is_zen1(entry->eax) || -- cgit v1.2.3-58-ga151 From 83a34ad848937462aa64fa3d48f8c0b4034f2503 Mon Sep 17 00:00:00 2001 From: Jinrong Liang Date: Wed, 19 Jan 2022 22:03:25 +0800 Subject: selftests: kvm/x86: Fix the warning in lib/x86_64/processor.c The following warning appears when executing make -C tools/testing/selftests/kvm include/x86_64/processor.h:290:2: warning: 'ecx' may be used uninitialized in this function [-Wmaybe-uninitialized] asm volatile("cpuid" ^~~ lib/x86_64/processor.c:1523:21: note: 'ecx' was declared here uint32_t eax, ebx, ecx, edx, max_ext_leaf; Just initialize ecx to remove this warning. Fixes: c8cc43c1eae2 ("selftests: KVM: avoid failures due to reserved HyperTransport region") Signed-off-by: Jinrong Liang Message-Id: <20220119140325.59369-1-cloudliang@tencent.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/lib/x86_64/processor.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 5c8c270a9158..5f9d7e91dc69 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -1535,6 +1535,7 @@ unsigned long vm_compute_max_gfn(struct kvm_vm *vm) /* Before family 17h, the HyperTransport area is just below 1T. */ ht_gfn = (1 << 28) - num_ht_pages; eax = 1; + ecx = 0; cpuid(&eax, &ebx, &ecx, &edx); if (x86_family(eax) < 0x17) goto done; -- cgit v1.2.3-58-ga151 From e2e83a73d7ce66f62c7830a85619542ef59c90e4 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 19 Jan 2022 23:50:03 -0500 Subject: docs: kvm: fix WARNINGs from api.rst Use the api number 134 for KVM_GET_XSAVE2, instead of 42, which has been used by KVM_GET_XSAVE. Also, fix the WARNINGs of the underlines being too short. Reported-by: Stephen Rothwell Signed-off-by: Wei Wang Tested-by: Stephen Rothwell Message-Id: <20220120045003.315177-1-wei.w.wang@intel.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d3791a14eb9a..bb8cfddbb22d 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5545,8 +5545,8 @@ the trailing ``'\0'``, is indicated by ``name_size`` in the header. The Stats Data block contains an array of 64-bit values in the same order as the descriptors in Descriptors block. -4.42 KVM_GET_XSAVE2 ------------------- +4.134 KVM_GET_XSAVE2 +-------------------- :Capability: KVM_CAP_XSAVE2 :Architectures: x86 @@ -7363,7 +7363,7 @@ trap and emulate MSRs that are outside of the scope of KVM as well as limit the attack surface on KVM's MSR emulation code. 8.28 KVM_CAP_ENFORCE_PV_FEATURE_CPUID ------------------------------ +------------------------------------- Architectures: x86 -- cgit v1.2.3-58-ga151 From 9a2451f1866344d38b4a1dc20396e3a03954fcd7 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:13 -0800 Subject: RISC-V: Avoid using per cpu array for ordered booting Currently both order booting and spinwait approach uses a per cpu array to update stack & task pointer. This approach will not work for the following cases. 1. If NR_CPUs are configured to be less than highest hart id. 2. A platform has sparse hartid. This issue can be fixed for ordered booting as the booting cpu brings up one cpu at a time using SBI HSM extension which has opaque parameter that is unused until now. Introduce a common secondary boot data structure that can store the stack and task pointer. Secondary harts will use this data while booting up to setup the sp & tp. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpu_ops_sbi.h | 25 +++++++++++++++++++++++++ arch/riscv/kernel/asm-offsets.c | 3 +++ arch/riscv/kernel/cpu_ops_sbi.c | 26 ++++++++++++++++++++------ arch/riscv/kernel/head.S | 19 ++++++++++--------- 4 files changed, 58 insertions(+), 15 deletions(-) create mode 100644 arch/riscv/include/asm/cpu_ops_sbi.h diff --git a/arch/riscv/include/asm/cpu_ops_sbi.h b/arch/riscv/include/asm/cpu_ops_sbi.h new file mode 100644 index 000000000000..56e4b76d09ff --- /dev/null +++ b/arch/riscv/include/asm/cpu_ops_sbi.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021 by Rivos Inc. + */ +#ifndef __ASM_CPU_OPS_SBI_H +#define __ASM_CPU_OPS_SBI_H + +#ifndef __ASSEMBLY__ +#include +#include +#include + +/** + * struct sbi_hart_boot_data - Hart specific boot used during booting and + * cpu hotplug. + * @task_ptr: A pointer to the hart specific tp + * @stack_ptr: A pointer to the hart specific sp + */ +struct sbi_hart_boot_data { + void *task_ptr; + void *stack_ptr; +}; +#endif + +#endif /* ifndef __ASM_CPU_OPS_SBI_H */ diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 253126e4beef..df0519a64eaf 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -12,6 +12,7 @@ #include #include #include +#include void asm_offsets(void); @@ -468,4 +469,6 @@ void asm_offsets(void) DEFINE(PT_SIZE_ON_STACK, ALIGN(sizeof(struct pt_regs), STACK_ALIGN)); OFFSET(KERNEL_MAP_VIRT_ADDR, kernel_mapping, virt_addr); + OFFSET(SBI_HART_BOOT_TASK_PTR_OFFSET, sbi_hart_boot_data, task_ptr); + OFFSET(SBI_HART_BOOT_STACK_PTR_OFFSET, sbi_hart_boot_data, stack_ptr); } diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c index 685fae72b7f5..dae29cbfe550 100644 --- a/arch/riscv/kernel/cpu_ops_sbi.c +++ b/arch/riscv/kernel/cpu_ops_sbi.c @@ -7,13 +7,22 @@ #include #include +#include #include +#include #include #include extern char secondary_start_sbi[]; const struct cpu_operations cpu_ops_sbi; +/* + * Ordered booting via HSM brings one cpu at a time. However, cpu hotplug can + * be invoked from multiple threads in parallel. Define a per cpu data + * to handle that. + */ +DEFINE_PER_CPU(struct sbi_hart_boot_data, boot_data); + static int sbi_hsm_hart_start(unsigned long hartid, unsigned long saddr, unsigned long priv) { @@ -55,14 +64,19 @@ static int sbi_hsm_hart_get_status(unsigned long hartid) static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle) { - int rc; unsigned long boot_addr = __pa_symbol(secondary_start_sbi); int hartid = cpuid_to_hartid_map(cpuid); - - cpu_update_secondary_bootdata(cpuid, tidle); - rc = sbi_hsm_hart_start(hartid, boot_addr, 0); - - return rc; + unsigned long hsm_data; + struct sbi_hart_boot_data *bdata = &per_cpu(boot_data, cpuid); + + /* Make sure tidle is updated */ + smp_mb(); + bdata->task_ptr = tidle; + bdata->stack_ptr = task_stack_page(tidle) + THREAD_SIZE; + /* Make sure boot data is updated */ + smp_mb(); + hsm_data = __pa(bdata); + return sbi_hsm_hart_start(hartid, boot_addr, hsm_data); } static int sbi_cpu_prepare(unsigned int cpuid) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index db4be6a7e392..05ea372e1909 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include "efi-header.S" @@ -168,15 +169,15 @@ secondary_start_sbi: la a3, .Lsecondary_park csrw CSR_TVEC, a3 - slli a3, a0, LGREG - la a4, __cpu_up_stack_pointer - XIP_FIXUP_OFFSET a4 - la a5, __cpu_up_task_pointer - XIP_FIXUP_OFFSET a5 - add a4, a3, a4 - add a5, a3, a5 - REG_L sp, (a4) - REG_L tp, (a5) + /* a0 contains the hartid & a1 contains boot data */ + li a2, SBI_HART_BOOT_TASK_PTR_OFFSET + XIP_FIXUP_OFFSET a2 + add a2, a2, a1 + REG_L tp, (a2) + li a3, SBI_HART_BOOT_STACK_PTR_OFFSET + XIP_FIXUP_OFFSET a3 + add a3, a3, a1 + REG_L sp, (a3) .Lsecondary_start_common: -- cgit v1.2.3-58-ga151 From 410bb20a698d4c95c63e7f2b6f6f7d8da43795f5 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:14 -0800 Subject: RISC-V: Do not print the SBI version during HSM extension boot print The HSM extension information log also prints the SBI version v0.2. This is misleading as the underlying firmware SBI version may be different from v0.2. Remove the unncessary printing of SBI version. Signed-off-by: Atish Patra Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index 1985884fe829..3f5a38b03044 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -38,7 +38,7 @@ void __init cpu_set_ops(int cpuid) #if IS_ENABLED(CONFIG_RISCV_SBI) if (sbi_probe_extension(SBI_EXT_HSM) > 0) { if (!cpuid) - pr_info("SBI v0.2 HSM extension detected\n"); + pr_info("SBI HSM extension detected\n"); cpu_ops[cpuid] = &cpu_ops_sbi; } else #endif -- cgit v1.2.3-58-ga151 From c78f94f35cf6486c4057317e8de3ddc4c62e12c7 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:15 -0800 Subject: RISC-V: Use __cpu_up_stack/task_pointer only for spinwait method The __cpu_up_stack/task_pointer array is only used for spinwait method now. The per cpu array based lookup is also fragile for platforms with discontiguous/sparse hartids. The spinwait method is only used for M-mode Linux or older firmwares without SBI HSM extension. For general Linux systems, ordered booting method is preferred anyways to support cpu hotplug and kexec. Make sure that __cpu_up_stack/task_pointer is only used for spinwait method. Take this opportunity to rename it to __cpu_spinwait_stack/task_pointer to emphasize the purpose as well. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpu_ops.h | 2 -- arch/riscv/kernel/cpu_ops.c | 16 ---------------- arch/riscv/kernel/cpu_ops_spinwait.c | 27 ++++++++++++++++++++++++++- arch/riscv/kernel/head.S | 4 ++-- arch/riscv/kernel/head.h | 4 ++-- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/arch/riscv/include/asm/cpu_ops.h b/arch/riscv/include/asm/cpu_ops.h index a8ec3c5c1bd2..134590f1b843 100644 --- a/arch/riscv/include/asm/cpu_ops.h +++ b/arch/riscv/include/asm/cpu_ops.h @@ -40,7 +40,5 @@ struct cpu_operations { extern const struct cpu_operations *cpu_ops[NR_CPUS]; void __init cpu_set_ops(int cpu); -void cpu_update_secondary_bootdata(unsigned int cpuid, - struct task_struct *tidle); #endif /* ifndef __ASM_CPU_OPS_H */ diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index 3f5a38b03044..c1e30f403c3b 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -8,31 +8,15 @@ #include #include #include -#include #include #include #include const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; -void *__cpu_up_stack_pointer[NR_CPUS] __section(".data"); -void *__cpu_up_task_pointer[NR_CPUS] __section(".data"); - extern const struct cpu_operations cpu_ops_sbi; extern const struct cpu_operations cpu_ops_spinwait; -void cpu_update_secondary_bootdata(unsigned int cpuid, - struct task_struct *tidle) -{ - int hartid = cpuid_to_hartid_map(cpuid); - - /* Make sure tidle is updated */ - smp_mb(); - WRITE_ONCE(__cpu_up_stack_pointer[hartid], - task_stack_page(tidle) + THREAD_SIZE); - WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle); -} - void __init cpu_set_ops(int cpuid) { #if IS_ENABLED(CONFIG_RISCV_SBI) diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c index b2c957bb68c1..346847f6c41c 100644 --- a/arch/riscv/kernel/cpu_ops_spinwait.c +++ b/arch/riscv/kernel/cpu_ops_spinwait.c @@ -6,11 +6,36 @@ #include #include #include +#include #include #include #include const struct cpu_operations cpu_ops_spinwait; +void *__cpu_spinwait_stack_pointer[NR_CPUS] __section(".data"); +void *__cpu_spinwait_task_pointer[NR_CPUS] __section(".data"); + +static void cpu_update_secondary_bootdata(unsigned int cpuid, + struct task_struct *tidle) +{ + int hartid = cpuid_to_hartid_map(cpuid); + + /* + * The hartid must be less than NR_CPUS to avoid out-of-bound access + * errors for __cpu_spinwait_stack/task_pointer. That is not always possible + * for platforms with discontiguous hartid numbering scheme. That's why + * spinwait booting is not the recommended approach for any platforms + * booting Linux in S-mode and can be disabled in the future. + */ + if (hartid == INVALID_HARTID || hartid >= NR_CPUS) + return; + + /* Make sure tidle is updated */ + smp_mb(); + WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid], + task_stack_page(tidle) + THREAD_SIZE); + WRITE_ONCE(__cpu_spinwait_task_pointer[hartid], tidle); +} static int spinwait_cpu_prepare(unsigned int cpuid) { @@ -28,7 +53,7 @@ static int spinwait_cpu_start(unsigned int cpuid, struct task_struct *tidle) * selects the first cpu to boot the kernel and causes the remainder * of the cpus to spin in a loop waiting for their stack pointer to be * setup by that main cpu. Writing to bootdata - * (i.e __cpu_up_stack_pointer) signals to the spinning cpus that they + * (i.e __cpu_spinwait_stack_pointer) signals to the spinning cpus that they * can continue the boot process. */ cpu_update_secondary_bootdata(cpuid, tidle); diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 05ea372e1909..efa9cd499ac6 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -347,9 +347,9 @@ clear_bss_done: csrw CSR_TVEC, a3 slli a3, a0, LGREG - la a1, __cpu_up_stack_pointer + la a1, __cpu_spinwait_stack_pointer XIP_FIXUP_OFFSET a1 - la a2, __cpu_up_task_pointer + la a2, __cpu_spinwait_task_pointer XIP_FIXUP_OFFSET a2 add a1, a3, a1 add a2, a3, a2 diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index aabbc3ac3e48..5393cca77790 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -16,7 +16,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa); asmlinkage void __init __copy_data(void); #endif -extern void *__cpu_up_stack_pointer[]; -extern void *__cpu_up_task_pointer[]; +extern void *__cpu_spinwait_stack_pointer[]; +extern void *__cpu_spinwait_task_pointer[]; #endif /* __ASM_HEAD_H */ -- cgit v1.2.3-58-ga151 From 0b39eb38f85908e039ce8c9f09868438e029757b Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:16 -0800 Subject: RISC-V: Move the entire hart selection via lottery to SMP The booting hart selection via lottery is only useful for SMP systems. Moreover, the lottery selection is only necessary for systems using spinwait booting method. It is better to keep the entire lottery selection together so that it can be disabled in future. Move the lottery selection code to under CONFIG_SMP. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/head.S | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index efa9cd499ac6..b0766f62bd73 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -264,8 +264,8 @@ pmp_done: blt a0, t0, .Lgood_cores tail .Lsecondary_park .Lgood_cores: -#endif + /* The lottery system is only required for spinwait booting method */ #ifndef CONFIG_XIP_KERNEL /* Pick one hart to run the main boot sequence */ la a3, hart_lottery @@ -284,6 +284,10 @@ pmp_done: /* first time here if hart_lottery in RAM is not set */ beq t0, t1, .Lsecondary_start +#endif /* CONFIG_XIP */ +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_XIP_KERNEL la sp, _end + THREAD_SIZE XIP_FIXUP_OFFSET sp mv s0, a0 @@ -340,8 +344,8 @@ clear_bss_done: call soc_early_init tail start_kernel -.Lsecondary_start: #ifdef CONFIG_SMP +.Lsecondary_start: /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park csrw CSR_TVEC, a3 -- cgit v1.2.3-58-ga151 From 2ffc48fc7071da4b2d881b0f21d37ed05feb697b Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:17 -0800 Subject: RISC-V: Move spinwait booting method to its own config The spinwait booting method should only be used for platforms with older firmware without SBI HSM extension or M-mode firmware because spinwait method can't support cpu hotplug, kexec or sparse hartid. It is better to move the entire spinwait implementation to its own config which can be disabled if required. It is enabled by default to maintain backward compatibility and M-mode Linux. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 14 ++++++++++++++ arch/riscv/kernel/Makefile | 3 ++- arch/riscv/kernel/cpu_ops.c | 8 ++++++++ arch/riscv/kernel/head.S | 8 ++++---- arch/riscv/kernel/head.h | 2 ++ 5 files changed, 30 insertions(+), 5 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 86553d8c6b07..fd60ab94260b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -374,6 +374,20 @@ config RISCV_SBI_V01 This config allows kernel to use SBI v0.1 APIs. This will be deprecated in future once legacy M-mode software are no longer in use. +config RISCV_BOOT_SPINWAIT + bool "Spinwait booting method" + depends on SMP + default y + help + This enables support for booting Linux via spinwait method. In the + spinwait method, all cores randomly jump to Linux. One of the cores + gets chosen via lottery and all other keep spinning on a percpu + variable. This method cannot support CPU hotplug and sparse hartid + scheme. It should be only enabled for M-mode Linux or platforms relying + on older firmware without SBI HSM extension. All other platforms should + rely on ordered booting via SBI HSM extension which gets chosen + dynamically at runtime if the firmware supports it. + config KEXEC bool "Kexec system call" select KEXEC_CORE diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 3397ddac1a30..612556faa527 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -43,7 +43,8 @@ obj-$(CONFIG_FPU) += fpu.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += cpu_ops.o -obj-$(CONFIG_SMP) += cpu_ops_spinwait.o + +obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c index c1e30f403c3b..170d07e57721 100644 --- a/arch/riscv/kernel/cpu_ops.c +++ b/arch/riscv/kernel/cpu_ops.c @@ -15,7 +15,15 @@ const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init; extern const struct cpu_operations cpu_ops_sbi; +#ifdef CONFIG_RISCV_BOOT_SPINWAIT extern const struct cpu_operations cpu_ops_spinwait; +#else +const struct cpu_operations cpu_ops_spinwait = { + .name = "", + .cpu_prepare = NULL, + .cpu_start = NULL, +}; +#endif void __init cpu_set_ops(int cpuid) { diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index b0766f62bd73..2363b43312fc 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -259,7 +259,7 @@ pmp_done: li t0, SR_FS csrc CSR_STATUS, t0 -#ifdef CONFIG_SMP +#ifdef CONFIG_RISCV_BOOT_SPINWAIT li t0, CONFIG_NR_CPUS blt a0, t0, .Lgood_cores tail .Lsecondary_park @@ -285,7 +285,7 @@ pmp_done: beq t0, t1, .Lsecondary_start #endif /* CONFIG_XIP */ -#endif /* CONFIG_SMP */ +#endif /* CONFIG_RISCV_BOOT_SPINWAIT */ #ifdef CONFIG_XIP_KERNEL la sp, _end + THREAD_SIZE @@ -344,7 +344,7 @@ clear_bss_done: call soc_early_init tail start_kernel -#ifdef CONFIG_SMP +#if CONFIG_RISCV_BOOT_SPINWAIT .Lsecondary_start: /* Set trap vector to spin forever to help debug */ la a3, .Lsecondary_park @@ -371,7 +371,7 @@ clear_bss_done: fence tail .Lsecondary_start_common -#endif +#endif /* CONFIG_RISCV_BOOT_SPINWAIT */ END(_start_kernel) diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h index 5393cca77790..726731ada534 100644 --- a/arch/riscv/kernel/head.h +++ b/arch/riscv/kernel/head.h @@ -16,7 +16,9 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa); asmlinkage void __init __copy_data(void); #endif +#ifdef CONFIG_RISCV_BOOT_SPINWAIT extern void *__cpu_spinwait_stack_pointer[]; extern void *__cpu_spinwait_task_pointer[]; +#endif #endif /* __ASM_HEAD_H */ -- cgit v1.2.3-58-ga151 From 26fb751ca37846c912daa347be298bfd945cc560 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Thu, 20 Jan 2022 01:09:18 -0800 Subject: RISC-V: Do not use cpumask data structure for hartid bitmap Currently, SBI APIs accept a hartmask that is generated from struct cpumask. Cpumask data structure can hold upto NR_CPUs value. Thus, it is not the correct data structure for hartids as it can be higher than NR_CPUs for platforms with sparse or discontguous hartids. Remove all association between hartid mask and struct cpumask. Reviewed-by: Anup Patel (For Linux RISC-V changes) Acked-by: Anup Patel (For KVM RISC-V changes) Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 19 ++-- arch/riscv/include/asm/smp.h | 2 - arch/riscv/kernel/sbi.c | 189 +++++++++++++++++++++----------------- arch/riscv/kernel/setup.c | 10 -- arch/riscv/kernel/smpboot.c | 2 +- arch/riscv/kvm/mmu.c | 4 +- arch/riscv/kvm/vcpu_sbi_replace.c | 11 +-- arch/riscv/kvm/vcpu_sbi_v01.c | 11 +-- arch/riscv/kvm/vmid.c | 4 +- arch/riscv/mm/cacheflush.c | 5 +- arch/riscv/mm/tlbflush.c | 9 +- 11 files changed, 130 insertions(+), 136 deletions(-) diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 26ba6f2d7a40..d1c37479d828 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -8,6 +8,7 @@ #define _ASM_RISCV_SBI_H #include +#include #ifdef CONFIG_RISCV_SBI enum sbi_ext_id { @@ -128,27 +129,27 @@ long sbi_get_mimpid(void); void sbi_set_timer(uint64_t stime_value); void sbi_shutdown(void); void sbi_clear_ipi(void); -int sbi_send_ipi(const unsigned long *hart_mask); -int sbi_remote_fence_i(const unsigned long *hart_mask); -int sbi_remote_sfence_vma(const unsigned long *hart_mask, +int sbi_send_ipi(const struct cpumask *cpu_mask); +int sbi_remote_fence_i(const struct cpumask *cpu_mask); +int sbi_remote_sfence_vma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size); -int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, +int sbi_remote_sfence_vma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid); -int sbi_remote_hfence_gvma(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size); -int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma_vmid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long vmid); -int sbi_remote_hfence_vvma(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size); -int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid); @@ -183,7 +184,7 @@ static inline unsigned long sbi_mk_version(unsigned long major, int sbi_err_map_linux_errno(int err); #else /* CONFIG_RISCV_SBI */ -static inline int sbi_remote_fence_i(const unsigned long *hart_mask) { return -1; } +static inline int sbi_remote_fence_i(const struct cpumask *cpu_mask) { return -1; } static inline void sbi_init(void) {} #endif /* CONFIG_RISCV_SBI */ #endif /* _ASM_RISCV_SBI_H */ diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h index 6ad749f42807..23170c933d73 100644 --- a/arch/riscv/include/asm/smp.h +++ b/arch/riscv/include/asm/smp.h @@ -92,8 +92,6 @@ static inline void riscv_clear_ipi(void) #endif /* CONFIG_SMP */ -void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out); - #if defined(CONFIG_HOTPLUG_CPU) && (CONFIG_SMP) bool cpu_has_hotplug(unsigned int cpu); #else diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 9a84f0cb5175..f72527fcb347 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -16,8 +16,8 @@ unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT; EXPORT_SYMBOL(sbi_spec_version); static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init; -static int (*__sbi_send_ipi)(const unsigned long *hart_mask) __ro_after_init; -static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask, +static int (*__sbi_send_ipi)(const struct cpumask *cpu_mask) __ro_after_init; +static int (*__sbi_rfence)(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) __ro_after_init; @@ -67,6 +67,30 @@ int sbi_err_map_linux_errno(int err) EXPORT_SYMBOL(sbi_err_map_linux_errno); #ifdef CONFIG_RISCV_SBI_V01 +static unsigned long __sbi_v01_cpumask_to_hartmask(const struct cpumask *cpu_mask) +{ + unsigned long cpuid, hartid; + unsigned long hmask = 0; + + /* + * There is no maximum hartid concept in RISC-V and NR_CPUS must not be + * associated with hartid. As SBI v0.1 is only kept for backward compatibility + * and will be removed in the future, there is no point in supporting hartid + * greater than BITS_PER_LONG (32 for RV32 and 64 for RV64). Ideally, SBI v0.2 + * should be used for platforms with hartid greater than BITS_PER_LONG. + */ + for_each_cpu(cpuid, cpu_mask) { + hartid = cpuid_to_hartid_map(cpuid); + if (hartid >= BITS_PER_LONG) { + pr_warn("Unable to send any request to hartid > BITS_PER_LONG for SBI v0.1\n"); + break; + } + hmask |= 1 << hartid; + } + + return hmask; +} + /** * sbi_console_putchar() - Writes given character to the console device. * @ch: The data to be written to the console. @@ -132,33 +156,44 @@ static void __sbi_set_timer_v01(uint64_t stime_value) #endif } -static int __sbi_send_ipi_v01(const unsigned long *hart_mask) +static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask) { - sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)hart_mask, + unsigned long hart_mask; + + if (!cpu_mask) + cpu_mask = cpu_online_mask; + hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask); + + sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)(&hart_mask), 0, 0, 0, 0, 0); return 0; } -static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask, +static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { int result = 0; + unsigned long hart_mask; + + if (!cpu_mask) + cpu_mask = cpu_online_mask; + hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask); /* v0.2 function IDs are equivalent to v0.1 extension IDs */ switch (fid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: sbi_ecall(SBI_EXT_0_1_REMOTE_FENCE_I, 0, - (unsigned long)hart_mask, 0, 0, 0, 0, 0); + (unsigned long)&hart_mask, 0, 0, 0, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA, 0, - (unsigned long)hart_mask, start, size, + (unsigned long)&hart_mask, start, size, 0, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, 0, - (unsigned long)hart_mask, start, size, + (unsigned long)&hart_mask, start, size, arg4, 0, 0); break; default: @@ -180,7 +215,7 @@ static void __sbi_set_timer_v01(uint64_t stime_value) sbi_major_version(), sbi_minor_version()); } -static int __sbi_send_ipi_v01(const unsigned long *hart_mask) +static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask) { pr_warn("IPI extension is not available in SBI v%lu.%lu\n", sbi_major_version(), sbi_minor_version()); @@ -188,7 +223,7 @@ static int __sbi_send_ipi_v01(const unsigned long *hart_mask) return 0; } -static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask, +static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { @@ -212,37 +247,33 @@ static void __sbi_set_timer_v02(uint64_t stime_value) #endif } -static int __sbi_send_ipi_v02(const unsigned long *hart_mask) +static int __sbi_send_ipi_v02(const struct cpumask *cpu_mask) { - unsigned long hartid, hmask_val, hbase; - struct cpumask tmask; + unsigned long hartid, cpuid, hmask = 0, hbase = 0; struct sbiret ret = {0}; int result; - if (!hart_mask || !(*hart_mask)) { - riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask); - hart_mask = cpumask_bits(&tmask); - } + if (!cpu_mask) + cpu_mask = cpu_online_mask; - hmask_val = 0; - hbase = 0; - for_each_set_bit(hartid, hart_mask, NR_CPUS) { - if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) { + for_each_cpu(cpuid, cpu_mask) { + hartid = cpuid_to_hartid_map(cpuid); + if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) { ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, - hmask_val, hbase, 0, 0, 0, 0); + hmask, hbase, 0, 0, 0, 0); if (ret.error) goto ecall_failed; - hmask_val = 0; + hmask = 0; hbase = 0; } - if (!hmask_val) + if (!hmask) hbase = hartid; - hmask_val |= 1UL << (hartid - hbase); + hmask |= 1UL << (hartid - hbase); } - if (hmask_val) { + if (hmask) { ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI, - hmask_val, hbase, 0, 0, 0, 0); + hmask, hbase, 0, 0, 0, 0); if (ret.error) goto ecall_failed; } @@ -252,11 +283,11 @@ static int __sbi_send_ipi_v02(const unsigned long *hart_mask) ecall_failed: result = sbi_err_map_linux_errno(ret.error); pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n", - __func__, hbase, hmask_val, result); + __func__, hbase, hmask, result); return result; } -static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, +static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask, unsigned long hbase, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) @@ -267,31 +298,31 @@ static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, switch (fid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: - ret = sbi_ecall(ext, fid, hmask_val, hbase, 0, 0, 0, 0); + ret = sbi_ecall(ext, fid, hmask, hbase, 0, 0, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, arg4, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, arg4, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, 0, 0); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: - ret = sbi_ecall(ext, fid, hmask_val, hbase, start, + ret = sbi_ecall(ext, fid, hmask, hbase, start, size, arg4, 0); break; default: @@ -303,43 +334,39 @@ static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val, if (ret.error) { result = sbi_err_map_linux_errno(ret.error); pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n", - __func__, hbase, hmask_val, result); + __func__, hbase, hmask, result); } return result; } -static int __sbi_rfence_v02(int fid, const unsigned long *hart_mask, +static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long arg4, unsigned long arg5) { - unsigned long hmask_val, hartid, hbase; - struct cpumask tmask; + unsigned long hartid, cpuid, hmask = 0, hbase = 0; int result; - if (!hart_mask || !(*hart_mask)) { - riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask); - hart_mask = cpumask_bits(&tmask); - } + if (!cpu_mask) + cpu_mask = cpu_online_mask; - hmask_val = 0; - hbase = 0; - for_each_set_bit(hartid, hart_mask, NR_CPUS) { - if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) { - result = __sbi_rfence_v02_call(fid, hmask_val, hbase, + for_each_cpu(cpuid, cpu_mask) { + hartid = cpuid_to_hartid_map(cpuid); + if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) { + result = __sbi_rfence_v02_call(fid, hmask, hbase, start, size, arg4, arg5); if (result) return result; - hmask_val = 0; + hmask = 0; hbase = 0; } - if (!hmask_val) + if (!hmask) hbase = hartid; - hmask_val |= 1UL << (hartid - hbase); + hmask |= 1UL << (hartid - hbase); } - if (hmask_val) { - result = __sbi_rfence_v02_call(fid, hmask_val, hbase, + if (hmask) { + result = __sbi_rfence_v02_call(fid, hmask, hbase, start, size, arg4, arg5); if (result) return result; @@ -361,44 +388,44 @@ void sbi_set_timer(uint64_t stime_value) /** * sbi_send_ipi() - Send an IPI to any hart. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_send_ipi(const unsigned long *hart_mask) +int sbi_send_ipi(const struct cpumask *cpu_mask) { - return __sbi_send_ipi(hart_mask); + return __sbi_send_ipi(cpu_mask); } EXPORT_SYMBOL(sbi_send_ipi); /** * sbi_remote_fence_i() - Execute FENCE.I instruction on given remote harts. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_remote_fence_i(const unsigned long *hart_mask) +int sbi_remote_fence_i(const struct cpumask *cpu_mask) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_FENCE_I, - hart_mask, 0, 0, 0, 0); + cpu_mask, 0, 0, 0, 0); } EXPORT_SYMBOL(sbi_remote_fence_i); /** * sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote * harts for the specified virtual address range. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the virtual address * @size: Total size of the virtual address range. * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_remote_sfence_vma(const unsigned long *hart_mask, +int sbi_remote_sfence_vma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA, - hart_mask, start, size, 0, 0); + cpu_mask, start, size, 0, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma); @@ -406,38 +433,38 @@ EXPORT_SYMBOL(sbi_remote_sfence_vma); * sbi_remote_sfence_vma_asid() - Execute SFENCE.VMA instructions on given * remote harts for a virtual address range belonging to a specific ASID. * - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the virtual address * @size: Total size of the virtual address range. * @asid: The value of address space identifier (ASID). * * Return: 0 on success, appropriate linux error code otherwise. */ -int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask, +int sbi_remote_sfence_vma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID, - hart_mask, start, size, asid, 0); + cpu_mask, start, size, asid, 0); } EXPORT_SYMBOL(sbi_remote_sfence_vma_asid); /** * sbi_remote_hfence_gvma() - Execute HFENCE.GVMA instructions on given remote * harts for the specified guest physical address range. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the guest physical address * @size: Total size of the guest physical address range. * * Return: None */ -int sbi_remote_hfence_gvma(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA, - hart_mask, start, size, 0, 0); + cpu_mask, start, size, 0, 0); } EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma); @@ -445,38 +472,38 @@ EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma); * sbi_remote_hfence_gvma_vmid() - Execute HFENCE.GVMA instructions on given * remote harts for a guest physical address range belonging to a specific VMID. * - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the guest physical address * @size: Total size of the guest physical address range. * @vmid: The value of guest ID (VMID). * * Return: 0 if success, Error otherwise. */ -int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask, +int sbi_remote_hfence_gvma_vmid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long vmid) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID, - hart_mask, start, size, vmid, 0); + cpu_mask, start, size, vmid, 0); } EXPORT_SYMBOL(sbi_remote_hfence_gvma_vmid); /** * sbi_remote_hfence_vvma() - Execute HFENCE.VVMA instructions on given remote * harts for the current guest virtual address range. - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the current guest virtual address * @size: Total size of the current guest virtual address range. * * Return: None */ -int sbi_remote_hfence_vvma(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma(const struct cpumask *cpu_mask, unsigned long start, unsigned long size) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA, - hart_mask, start, size, 0, 0); + cpu_mask, start, size, 0, 0); } EXPORT_SYMBOL(sbi_remote_hfence_vvma); @@ -485,20 +512,20 @@ EXPORT_SYMBOL(sbi_remote_hfence_vvma); * remote harts for current guest virtual address range belonging to a specific * ASID. * - * @hart_mask: A cpu mask containing all the target harts. + * @cpu_mask: A cpu mask containing all the target harts. * @start: Start of the current guest virtual address * @size: Total size of the current guest virtual address range. * @asid: The value of address space identifier (ASID). * * Return: None */ -int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask, +int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask, unsigned long start, unsigned long size, unsigned long asid) { return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID, - hart_mask, start, size, asid, 0); + cpu_mask, start, size, asid, 0); } EXPORT_SYMBOL(sbi_remote_hfence_vvma_asid); @@ -591,11 +618,7 @@ long sbi_get_mimpid(void) static void sbi_send_cpumask_ipi(const struct cpumask *target) { - struct cpumask hartid_mask; - - riscv_cpuid_to_hartid_mask(target, &hartid_mask); - - sbi_send_ipi(cpumask_bits(&hartid_mask)); + sbi_send_ipi(target); } static const struct riscv_ipi_ops sbi_ipi_ops = { diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 63241abe84eb..b42bfdc67482 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -59,16 +59,6 @@ atomic_t hart_lottery __section(".sdata") unsigned long boot_cpu_hartid; static DEFINE_PER_CPU(struct cpu, cpu_devices); -void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out) -{ - int cpu; - - cpumask_clear(out); - for_each_cpu(cpu, in) - cpumask_set_cpu(cpuid_to_hartid_map(cpu), out); -} -EXPORT_SYMBOL_GPL(riscv_cpuid_to_hartid_mask); - /* * Place kernel memory regions on the resource tree so that * kexec-tools can retrieve them from /proc/iomem. While there diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c index bd82375db51a..622f226454d5 100644 --- a/arch/riscv/kernel/smpboot.c +++ b/arch/riscv/kernel/smpboot.c @@ -96,7 +96,7 @@ void __init setup_smp(void) if (cpuid >= NR_CPUS) { pr_warn("Invalid cpuid [%d] for hartid [%d]\n", cpuid, hart); - break; + continue; } cpuid_to_hartid_map(cpuid) = hart; diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 9af67dbdc66a..f80a34fbf102 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -114,7 +114,6 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) { - struct cpumask hmask; unsigned long size = PAGE_SIZE; struct kvm_vmid *vmid = &kvm->arch.vmid; @@ -127,8 +126,7 @@ static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) * where the Guest/VM is running. */ preempt_disable(); - riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask); - sbi_remote_hfence_gvma_vmid(cpumask_bits(&hmask), addr, size, + sbi_remote_hfence_gvma_vmid(cpu_online_mask, addr, size, READ_ONCE(vmid->vmid)); preempt_enable(); } diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 00036b7f83b9..1bc0608a5bfd 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -82,7 +82,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run { int ret = 0; unsigned long i; - struct cpumask cm, hm; + struct cpumask cm; struct kvm_vcpu *tmp; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; @@ -90,7 +90,6 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run unsigned long funcid = cp->a6; cpumask_clear(&cm); - cpumask_clear(&hm); kvm_for_each_vcpu(i, tmp, vcpu->kvm) { if (hbase != -1UL) { if (tmp->vcpu_id < hbase) @@ -103,17 +102,15 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run cpumask_set_cpu(tmp->cpu, &cm); } - riscv_cpuid_to_hartid_mask(&cm, &hm); - switch (funcid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: - ret = sbi_remote_fence_i(cpumask_bits(&hm)); + ret = sbi_remote_fence_i(&cm); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - ret = sbi_remote_hfence_vvma(cpumask_bits(&hm), cp->a2, cp->a3); + ret = sbi_remote_hfence_vvma(&cm, cp->a2, cp->a3); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - ret = sbi_remote_hfence_vvma_asid(cpumask_bits(&hm), cp->a2, + ret = sbi_remote_hfence_vvma_asid(&cm, cp->a2, cp->a3, cp->a4); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index 4c7e13ec9ccc..07e2de14433a 100644 --- a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -38,7 +38,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, int i, ret = 0; u64 next_cycle; struct kvm_vcpu *rvcpu; - struct cpumask cm, hm; + struct cpumask cm; struct kvm *kvm = vcpu->kvm; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -101,15 +101,12 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, continue; cpumask_set_cpu(rvcpu->cpu, &cm); } - riscv_cpuid_to_hartid_mask(&cm, &hm); if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I) - ret = sbi_remote_fence_i(cpumask_bits(&hm)); + ret = sbi_remote_fence_i(&cm); else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) - ret = sbi_remote_hfence_vvma(cpumask_bits(&hm), - cp->a1, cp->a2); + ret = sbi_remote_hfence_vvma(&cm, cp->a1, cp->a2); else - ret = sbi_remote_hfence_vvma_asid(cpumask_bits(&hm), - cp->a1, cp->a2, cp->a3); + ret = sbi_remote_hfence_vvma_asid(&cm, cp->a1, cp->a2, cp->a3); break; default: ret = -EINVAL; diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 807228f8f409..2fa4f7b1813d 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -67,7 +67,6 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) { unsigned long i; struct kvm_vcpu *v; - struct cpumask hmask; struct kvm_vmid *vmid = &vcpu->kvm->arch.vmid; if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) @@ -102,8 +101,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) * running, we force VM exits on all host CPUs using IPI and * flush all Guest TLBs. */ - riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask); - sbi_remote_hfence_gvma(cpumask_bits(&hmask), 0, 0); + sbi_remote_hfence_gvma(cpu_online_mask, 0, 0); } vmid->vmid = vmid_next; diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 89f81067e09e..6cb7d96ad9c7 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -67,10 +67,7 @@ void flush_icache_mm(struct mm_struct *mm, bool local) */ smp_mb(); } else if (IS_ENABLED(CONFIG_RISCV_SBI)) { - cpumask_t hartid_mask; - - riscv_cpuid_to_hartid_mask(&others, &hartid_mask); - sbi_remote_fence_i(cpumask_bits(&hartid_mask)); + sbi_remote_fence_i(&others); } else { on_each_cpu_mask(&others, ipi_remote_fence_i, NULL, 1); } diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 64f8201237c2..37ed760d007c 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -32,7 +32,6 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, unsigned long size, unsigned long stride) { struct cpumask *cmask = mm_cpumask(mm); - struct cpumask hmask; unsigned int cpuid; bool broadcast; @@ -46,9 +45,7 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, unsigned long asid = atomic_long_read(&mm->context.id); if (broadcast) { - riscv_cpuid_to_hartid_mask(cmask, &hmask); - sbi_remote_sfence_vma_asid(cpumask_bits(&hmask), - start, size, asid); + sbi_remote_sfence_vma_asid(cmask, start, size, asid); } else if (size <= stride) { local_flush_tlb_page_asid(start, asid); } else { @@ -56,9 +53,7 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start, } } else { if (broadcast) { - riscv_cpuid_to_hartid_mask(cmask, &hmask); - sbi_remote_sfence_vma(cpumask_bits(&hmask), - start, size); + sbi_remote_sfence_vma(cmask, start, size); } else if (size <= stride) { local_flush_tlb_page(start); } else { -- cgit v1.2.3-58-ga151 From 3c2905ea79245fd37c2ab9d9384ab85d4732e3ef Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 29 Dec 2021 20:24:58 +0100 Subject: riscv: canaan: remove useless select of non-existing config SYSCON The config SYSCON never existed in the kernel repository; so, the select of that config in ./drivers/soc/canaan/Kconfig has no effect. Presumably, this was just some mistake, assuming some symmetry in handling and naming of configs that simply does not exist. Remove this useless select of a non-existing config. Signed-off-by: Lukas Bulwahn Signed-off-by: Palmer Dabbelt --- drivers/soc/canaan/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/soc/canaan/Kconfig b/drivers/soc/canaan/Kconfig index 853096b7e84c..2527cf5757ec 100644 --- a/drivers/soc/canaan/Kconfig +++ b/drivers/soc/canaan/Kconfig @@ -5,7 +5,6 @@ config SOC_K210_SYSCTL depends on RISCV && SOC_CANAAN && OF default SOC_CANAAN select PM - select SYSCON select MFD_SYSCON help Canaan Kendryte K210 SoC system controller driver. -- cgit v1.2.3-58-ga151 From db3f02df1853acf4d678bcddb3f1eab23219b410 Mon Sep 17 00:00:00 2001 From: Ron Economos Date: Thu, 30 Dec 2021 22:11:06 -0800 Subject: riscv: dts: sifive unmatched: Add gpio poweroff Some of the GPIO pins on the Unmatched are wire up to control the power of the board, indicate that in the device tree. Signed-off-by: Ron Economos Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts index 6bfa1f24d3de..c4ed9efdff03 100644 --- a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts +++ b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts @@ -39,6 +39,11 @@ clock-frequency = ; clock-output-names = "rtcclk"; }; + + gpio-poweroff { + compatible = "gpio-poweroff"; + gpios = <&gpio 2 GPIO_ACTIVE_LOW>; + }; }; &uart0 { -- cgit v1.2.3-58-ga151 From 58dfff3e984dfb96dae98008e6ea0ab92248d003 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 18 Jan 2022 19:53:25 -0600 Subject: dt-bindings: Drop unnecessary pinctrl properties For a single pinctrl mode, it is not necessary to define pinctrl properties as the tools always allow pinctrl properties. Signed-off-by: Rob Herring Acked-by: Charles Keepax Acked-by: Mark Brown Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220119015325.2438277-1-robh@kernel.org --- .../bindings/display/rockchip/rockchip,rk3066-hdmi.yaml | 8 -------- Documentation/devicetree/bindings/input/gpio-keys.yaml | 6 ------ .../devicetree/bindings/pinctrl/cirrus,lochnagar.yaml | 9 --------- Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml | 10 ---------- Documentation/devicetree/bindings/sound/samsung-i2s.yaml | 6 ------ 5 files changed, 39 deletions(-) diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml b/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml index 008c144257cb..1a68a940d165 100644 --- a/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml +++ b/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.yaml @@ -26,14 +26,6 @@ properties: clock-names: const: hclk - pinctrl-0: - maxItems: 2 - - pinctrl-names: - const: default - description: - Switch the iomux for the HPD/I2C pins to HDMI function. - power-domains: maxItems: 1 diff --git a/Documentation/devicetree/bindings/input/gpio-keys.yaml b/Documentation/devicetree/bindings/input/gpio-keys.yaml index dbe7ecc19ccb..7fe1966ea28a 100644 --- a/Documentation/devicetree/bindings/input/gpio-keys.yaml +++ b/Documentation/devicetree/bindings/input/gpio-keys.yaml @@ -88,12 +88,6 @@ patternProperties: which can be disabled to suppress events from the button. type: boolean - pinctrl-0: - maxItems: 1 - - pinctrl-names: - maxItems: 1 - required: - linux,code diff --git a/Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.yaml b/Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.yaml index a07dd197176a..6ff829a7d6e5 100644 --- a/Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.yaml +++ b/Documentation/devicetree/bindings/pinctrl/cirrus,lochnagar.yaml @@ -51,15 +51,6 @@ properties: appropriate of the LOCHNAGARx_PIN_NUM_GPIOS define, see [3]. maxItems: 1 - pinctrl-0: - description: - A phandle to the default pinctrl state. - - pinctrl-names: - description: - A pinctrl state named "default" must be defined. - const: default - pin-settings: type: object patternProperties: diff --git a/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml b/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml index 4cb174bf31ff..8a90d8273767 100644 --- a/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml +++ b/Documentation/devicetree/bindings/pinctrl/cirrus,madera.yaml @@ -30,16 +30,6 @@ description: | Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt properties: - pinctrl-0: - description: - A phandle to the node containing the subnodes containing default - configurations. - - pinctrl-names: - description: - A pinctrl state named "default" must be defined. - const: default - pin-settings: description: One subnode is required to contain the default settings. It diff --git a/Documentation/devicetree/bindings/sound/samsung-i2s.yaml b/Documentation/devicetree/bindings/sound/samsung-i2s.yaml index 2e3628ef48df..84c4d6cba521 100644 --- a/Documentation/devicetree/bindings/sound/samsung-i2s.yaml +++ b/Documentation/devicetree/bindings/sound/samsung-i2s.yaml @@ -110,12 +110,6 @@ properties: Internal DMA register base address of the audio subsystem (used in secondary sound source). - pinctrl-0: - description: Should specify pin control groups used for this controller. - - pinctrl-names: - const: default - power-domains: maxItems: 1 -- cgit v1.2.3-58-ga151 From 46cdc45acb089c811d9a54fd50af33b96e5fae9d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Jan 2022 10:28:13 -0700 Subject: block: fix async_depth sysfs interface for mq-deadline A previous commit added this feature, but it inadvertently used the wrong variable to show/store the setting from/to, victimized by copy/paste. Fix it up so that the async_depth sysfs interface reads and writes from the right setting. Fixes: 07757588e507 ("block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests") Link: https://bugzilla.kernel.org/show_bug.cgi?id=215485 Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/mq-deadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 85d919bf60c7..3ed5eaf3446a 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -865,7 +865,7 @@ SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); -SHOW_INT(deadline_async_depth_show, dd->front_merges); +SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES @@ -895,7 +895,7 @@ STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MA STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); -STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); +STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT -- cgit v1.2.3-58-ga151 From 8da46c0f98a1a17bc8f6d324d82b4d26e1448869 Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Wed, 12 Jan 2022 08:27:29 +0000 Subject: RISC-V: Remove redundant err variable Return value from user_regset_copyin() directly instead of taking this in another redundant variable. Reported-by: Zeal Robot Signed-off-by: Minghao Chi Signed-off-by: CGEL ZTE Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ptrace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c index 9c0511119bad..a89243730153 100644 --- a/arch/riscv/kernel/ptrace.c +++ b/arch/riscv/kernel/ptrace.c @@ -42,12 +42,10 @@ static int riscv_gpr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; struct pt_regs *regs; regs = task_pt_regs(target); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, regs, 0, -1); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, regs, 0, -1); } #ifdef CONFIG_FPU -- cgit v1.2.3-58-ga151 From 986536b952fd7070f5137358df7b055f3081dd2b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 18 Jan 2022 19:56:26 -0600 Subject: dt-bindings: Fix array schemas encoded as matrices The YAML DT encoding has leaked into some array properties. Properties which are defined as an array should have a schema that's just an array. That means there should only be a single level of 'minItems', 'maxItems', and/or 'items'. Signed-off-by: Rob Herring Acked-by: Marc Kleine-Budde Link: https://lore.kernel.org/r/20220119015627.2443334-1-robh@kernel.org --- .../bindings/media/nxp,imx7-mipi-csi2.yaml | 12 ++--- .../bindings/media/nxp,imx8mq-mipi-csi2.yaml | 12 ++--- .../devicetree/bindings/net/can/bosch,m_can.yaml | 52 +++++++++---------- .../bindings/net/ethernet-controller.yaml | 59 ++++++++++------------ Documentation/devicetree/bindings/nvmem/nvmem.yaml | 17 +++---- 5 files changed, 70 insertions(+), 82 deletions(-) diff --git a/Documentation/devicetree/bindings/media/nxp,imx7-mipi-csi2.yaml b/Documentation/devicetree/bindings/media/nxp,imx7-mipi-csi2.yaml index 1ef849dc74d7..e2e6e9aa0fe6 100644 --- a/Documentation/devicetree/bindings/media/nxp,imx7-mipi-csi2.yaml +++ b/Documentation/devicetree/bindings/media/nxp,imx7-mipi-csi2.yaml @@ -81,14 +81,12 @@ properties: data-lanes: description: Note that 'fsl,imx7-mipi-csi2' only supports up to 2 data lines. + minItems: 1 items: - minItems: 1 - maxItems: 4 - items: - - const: 1 - - const: 2 - - const: 3 - - const: 4 + - const: 1 + - const: 2 + - const: 3 + - const: 4 required: - data-lanes diff --git a/Documentation/devicetree/bindings/media/nxp,imx8mq-mipi-csi2.yaml b/Documentation/devicetree/bindings/media/nxp,imx8mq-mipi-csi2.yaml index 9c04fa85ee5c..1b3e1c4b99ed 100644 --- a/Documentation/devicetree/bindings/media/nxp,imx8mq-mipi-csi2.yaml +++ b/Documentation/devicetree/bindings/media/nxp,imx8mq-mipi-csi2.yaml @@ -87,14 +87,12 @@ properties: properties: data-lanes: + minItems: 1 items: - minItems: 1 - maxItems: 4 - items: - - const: 1 - - const: 2 - - const: 3 - - const: 4 + - const: 1 + - const: 2 + - const: 3 + - const: 4 required: - data-lanes diff --git a/Documentation/devicetree/bindings/net/can/bosch,m_can.yaml b/Documentation/devicetree/bindings/net/can/bosch,m_can.yaml index fb547e26c676..401ab7cdb379 100644 --- a/Documentation/devicetree/bindings/net/can/bosch,m_can.yaml +++ b/Documentation/devicetree/bindings/net/can/bosch,m_can.yaml @@ -76,33 +76,31 @@ properties: M_CAN user manual for details. $ref: /schemas/types.yaml#/definitions/int32-array items: - items: - - description: The 'offset' is an address offset of the Message RAM where - the following elements start from. This is usually set to 0x0 if - you're using a private Message RAM. - default: 0 - - description: 11-bit Filter 0-128 elements / 0-128 words - minimum: 0 - maximum: 128 - - description: 29-bit Filter 0-64 elements / 0-128 words - minimum: 0 - maximum: 64 - - description: Rx FIFO 0 0-64 elements / 0-1152 words - minimum: 0 - maximum: 64 - - description: Rx FIFO 1 0-64 elements / 0-1152 words - minimum: 0 - maximum: 64 - - description: Rx Buffers 0-64 elements / 0-1152 words - minimum: 0 - maximum: 64 - - description: Tx Event FIFO 0-32 elements / 0-64 words - minimum: 0 - maximum: 32 - - description: Tx Buffers 0-32 elements / 0-576 words - minimum: 0 - maximum: 32 - maxItems: 1 + - description: The 'offset' is an address offset of the Message RAM where + the following elements start from. This is usually set to 0x0 if + you're using a private Message RAM. + default: 0 + - description: 11-bit Filter 0-128 elements / 0-128 words + minimum: 0 + maximum: 128 + - description: 29-bit Filter 0-64 elements / 0-128 words + minimum: 0 + maximum: 64 + - description: Rx FIFO 0 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Rx FIFO 1 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Rx Buffers 0-64 elements / 0-1152 words + minimum: 0 + maximum: 64 + - description: Tx Event FIFO 0-32 elements / 0-64 words + minimum: 0 + maximum: 32 + - description: Tx Buffers 0-32 elements / 0-576 words + minimum: 0 + maximum: 32 power-domains: description: diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 47b5f728701d..34c5463abcec 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -17,9 +17,8 @@ properties: description: Specifies the MAC address that was assigned to the network device. $ref: /schemas/types.yaml#/definitions/uint8-array - items: - - minItems: 6 - maxItems: 6 + minItems: 6 + maxItems: 6 mac-address: description: @@ -28,9 +27,8 @@ properties: to the device by the boot program is different from the local-mac-address property. $ref: /schemas/types.yaml#/definitions/uint8-array - items: - - minItems: 6 - maxItems: 6 + minItems: 6 + maxItems: 6 max-frame-size: $ref: /schemas/types.yaml#/definitions/uint32 @@ -164,33 +162,30 @@ properties: type: array then: deprecated: true - minItems: 1 - maxItems: 1 items: - items: - - minimum: 0 - maximum: 31 - description: - Emulated PHY ID, choose any but unique to the all - specified fixed-links - - - enum: [0, 1] - description: - Duplex configuration. 0 for half duplex or 1 for - full duplex - - - enum: [10, 100, 1000, 2500, 10000] - description: - Link speed in Mbits/sec. - - - enum: [0, 1] - description: - Pause configuration. 0 for no pause, 1 for pause - - - enum: [0, 1] - description: - Asymmetric pause configuration. 0 for no asymmetric - pause, 1 for asymmetric pause + - minimum: 0 + maximum: 31 + description: + Emulated PHY ID, choose any but unique to the all + specified fixed-links + + - enum: [0, 1] + description: + Duplex configuration. 0 for half duplex or 1 for + full duplex + + - enum: [10, 100, 1000, 2500, 10000] + description: + Link speed in Mbits/sec. + + - enum: [0, 1] + description: + Pause configuration. 0 for no pause, 1 for pause + + - enum: [0, 1] + description: + Asymmetric pause configuration. 0 for no asymmetric + pause, 1 for asymmetric pause - if: diff --git a/Documentation/devicetree/bindings/nvmem/nvmem.yaml b/Documentation/devicetree/bindings/nvmem/nvmem.yaml index 456fb808100a..43ed7e32e5ac 100644 --- a/Documentation/devicetree/bindings/nvmem/nvmem.yaml +++ b/Documentation/devicetree/bindings/nvmem/nvmem.yaml @@ -50,16 +50,15 @@ patternProperties: Offset and size in bytes within the storage device. bits: - maxItems: 1 + $ref: /schemas/types.yaml#/definitions/uint32-array items: - items: - - minimum: 0 - maximum: 7 - description: - Offset in bit within the address range specified by reg. - - minimum: 1 - description: - Size in bit within the address range specified by reg. + - minimum: 0 + maximum: 7 + description: + Offset in bit within the address range specified by reg. + - minimum: 1 + description: + Size in bit within the address range specified by reg. required: - reg -- cgit v1.2.3-58-ga151 From 25e20b505e0e2454ff6713c0cef4bcdd66dffb95 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 18 Jan 2022 19:56:11 -0600 Subject: dt-bindings: mfd: cirrus,madera: Fix 'interrupts' in example The 'interrupts' properties takes an irq number, not a phandle, and 'interrupt-parent' isn't needed in examples. Signed-off-by: Rob Herring Acked-by: Charles Keepax Link: https://lore.kernel.org/r/20220119015611.2442819-1-robh@kernel.org --- Documentation/devicetree/bindings/mfd/cirrus,madera.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/mfd/cirrus,madera.yaml b/Documentation/devicetree/bindings/mfd/cirrus,madera.yaml index 5dce62a7eff2..68c75a517c92 100644 --- a/Documentation/devicetree/bindings/mfd/cirrus,madera.yaml +++ b/Documentation/devicetree/bindings/mfd/cirrus,madera.yaml @@ -245,8 +245,7 @@ examples: interrupt-controller; #interrupt-cells = <2>; - interrupts = <&host_irq1>; - interrupt-parent = <&gic>; + interrupts = <4 1 0>; gpio-controller; #gpio-cells = <2>; -- cgit v1.2.3-58-ga151 From 66a8f7f04979f4ad739085f01d99c8caf620b4f5 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 18 Jan 2022 18:35:02 +0100 Subject: of: base: make small of_parse_phandle() variants static inline Make all the smaller variants of the of_parse_phandle() static inline. This also let us remove the empty function stubs if CONFIG_OF is not defined. Suggested-by: Rob Herring Signed-off-by: Michael Walle [robh: move index < 0 check into __of_parse_phandle_with_args] Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220118173504.2867523-2-michael@walle.cc --- drivers/of/base.c | 131 ++++------------------------------------------- include/linux/of.h | 148 +++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 129 insertions(+), 150 deletions(-) diff --git a/drivers/of/base.c b/drivers/of/base.c index 8a24d37153b4..e7d92b67cb8a 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1420,15 +1420,18 @@ int of_phandle_iterator_args(struct of_phandle_iterator *it, return count; } -static int __of_parse_phandle_with_args(const struct device_node *np, - const char *list_name, - const char *cells_name, - int cell_count, int index, - struct of_phandle_args *out_args) +int __of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, + const char *cells_name, + int cell_count, int index, + struct of_phandle_args *out_args) { struct of_phandle_iterator it; int rc, cur_index = 0; + if (index < 0) + return -EINVAL; + /* Loop over the phandles until all the requested entry is found */ of_for_each_phandle(&it, rc, np, list_name, cells_name, cell_count) { /* @@ -1471,82 +1474,7 @@ static int __of_parse_phandle_with_args(const struct device_node *np, of_node_put(it.node); return rc; } - -/** - * of_parse_phandle - Resolve a phandle property to a device_node pointer - * @np: Pointer to device node holding phandle property - * @phandle_name: Name of property holding a phandle value - * @index: For properties holding a table of phandles, this is the index into - * the table - * - * Return: The device_node pointer with refcount incremented. Use - * of_node_put() on it when done. - */ -struct device_node *of_parse_phandle(const struct device_node *np, - const char *phandle_name, int index) -{ - struct of_phandle_args args; - - if (index < 0) - return NULL; - - if (__of_parse_phandle_with_args(np, phandle_name, NULL, 0, - index, &args)) - return NULL; - - return args.np; -} -EXPORT_SYMBOL(of_parse_phandle); - -/** - * of_parse_phandle_with_args() - Find a node pointed by phandle in a list - * @np: pointer to a device tree node containing a list - * @list_name: property name that contains a list - * @cells_name: property name that specifies phandles' arguments count - * @index: index of a phandle to parse out - * @out_args: optional pointer to output arguments structure (will be filled) - * - * This function is useful to parse lists of phandles and their arguments. - * Returns 0 on success and fills out_args, on error returns appropriate - * errno value. - * - * Caller is responsible to call of_node_put() on the returned out_args->np - * pointer. - * - * Example:: - * - * phandle1: node1 { - * #list-cells = <2>; - * }; - * - * phandle2: node2 { - * #list-cells = <1>; - * }; - * - * node3 { - * list = <&phandle1 1 2 &phandle2 3>; - * }; - * - * To get a device_node of the ``node2`` node you may call this: - * of_parse_phandle_with_args(node3, "list", "#list-cells", 1, &args); - */ -int of_parse_phandle_with_args(const struct device_node *np, const char *list_name, - const char *cells_name, int index, - struct of_phandle_args *out_args) -{ - int cell_count = -1; - - if (index < 0) - return -EINVAL; - - /* If cells_name is NULL we assume a cell count of 0 */ - if (!cells_name) - cell_count = 0; - - return __of_parse_phandle_with_args(np, list_name, cells_name, - cell_count, index, out_args); -} -EXPORT_SYMBOL(of_parse_phandle_with_args); +EXPORT_SYMBOL(__of_parse_phandle_with_args); /** * of_parse_phandle_with_args_map() - Find a node pointed by phandle in a list and remap it @@ -1732,47 +1660,6 @@ free: } EXPORT_SYMBOL(of_parse_phandle_with_args_map); -/** - * of_parse_phandle_with_fixed_args() - Find a node pointed by phandle in a list - * @np: pointer to a device tree node containing a list - * @list_name: property name that contains a list - * @cell_count: number of argument cells following the phandle - * @index: index of a phandle to parse out - * @out_args: optional pointer to output arguments structure (will be filled) - * - * This function is useful to parse lists of phandles and their arguments. - * Returns 0 on success and fills out_args, on error returns appropriate - * errno value. - * - * Caller is responsible to call of_node_put() on the returned out_args->np - * pointer. - * - * Example:: - * - * phandle1: node1 { - * }; - * - * phandle2: node2 { - * }; - * - * node3 { - * list = <&phandle1 0 2 &phandle2 2 3>; - * }; - * - * To get a device_node of the ``node2`` node you may call this: - * of_parse_phandle_with_fixed_args(node3, "list", 2, 1, &args); - */ -int of_parse_phandle_with_fixed_args(const struct device_node *np, - const char *list_name, int cell_count, - int index, struct of_phandle_args *out_args) -{ - if (index < 0) - return -EINVAL; - return __of_parse_phandle_with_args(np, list_name, NULL, cell_count, - index, out_args); -} -EXPORT_SYMBOL(of_parse_phandle_with_fixed_args); - /** * of_count_phandle_with_args() - Find the number of phandles references in a property * @np: pointer to a device tree node containing a list diff --git a/include/linux/of.h b/include/linux/of.h index ff143a027abc..16d76c92fbe0 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -364,18 +364,12 @@ extern const struct of_device_id *of_match_node( const struct of_device_id *matches, const struct device_node *node); extern int of_modalias_node(struct device_node *node, char *modalias, int len); extern void of_print_phandle_args(const char *msg, const struct of_phandle_args *args); -extern struct device_node *of_parse_phandle(const struct device_node *np, - const char *phandle_name, - int index); -extern int of_parse_phandle_with_args(const struct device_node *np, - const char *list_name, const char *cells_name, int index, - struct of_phandle_args *out_args); +extern int __of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, const char *cells_name, int cell_count, + int index, struct of_phandle_args *out_args); extern int of_parse_phandle_with_args_map(const struct device_node *np, const char *list_name, const char *stem_name, int index, struct of_phandle_args *out_args); -extern int of_parse_phandle_with_fixed_args(const struct device_node *np, - const char *list_name, int cells_count, int index, - struct of_phandle_args *out_args); extern int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name); @@ -865,18 +859,12 @@ static inline int of_property_read_string_helper(const struct device_node *np, return -ENOSYS; } -static inline struct device_node *of_parse_phandle(const struct device_node *np, - const char *phandle_name, - int index) -{ - return NULL; -} - -static inline int of_parse_phandle_with_args(const struct device_node *np, - const char *list_name, - const char *cells_name, - int index, - struct of_phandle_args *out_args) +static inline int __of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, + const char *cells_name, + int cell_count, + int index, + struct of_phandle_args *out_args) { return -ENOSYS; } @@ -890,13 +878,6 @@ static inline int of_parse_phandle_with_args_map(const struct device_node *np, return -ENOSYS; } -static inline int of_parse_phandle_with_fixed_args(const struct device_node *np, - const char *list_name, int cells_count, int index, - struct of_phandle_args *out_args) -{ - return -ENOSYS; -} - static inline int of_count_phandle_with_args(const struct device_node *np, const char *list_name, const char *cells_name) @@ -1077,6 +1058,117 @@ static inline bool of_node_is_type(const struct device_node *np, const char *typ return np && match && type && !strcmp(match, type); } +/** + * of_parse_phandle - Resolve a phandle property to a device_node pointer + * @np: Pointer to device node holding phandle property + * @phandle_name: Name of property holding a phandle value + * @index: For properties holding a table of phandles, this is the index into + * the table + * + * Return: The device_node pointer with refcount incremented. Use + * of_node_put() on it when done. + */ +static inline struct device_node *of_parse_phandle(const struct device_node *np, + const char *phandle_name, + int index) +{ + struct of_phandle_args args; + + if (__of_parse_phandle_with_args(np, phandle_name, NULL, 0, + index, &args)) + return NULL; + + return args.np; +} + +/** + * of_parse_phandle_with_args() - Find a node pointed by phandle in a list + * @np: pointer to a device tree node containing a list + * @list_name: property name that contains a list + * @cells_name: property name that specifies phandles' arguments count + * @index: index of a phandle to parse out + * @out_args: optional pointer to output arguments structure (will be filled) + * + * This function is useful to parse lists of phandles and their arguments. + * Returns 0 on success and fills out_args, on error returns appropriate + * errno value. + * + * Caller is responsible to call of_node_put() on the returned out_args->np + * pointer. + * + * Example:: + * + * phandle1: node1 { + * #list-cells = <2>; + * }; + * + * phandle2: node2 { + * #list-cells = <1>; + * }; + * + * node3 { + * list = <&phandle1 1 2 &phandle2 3>; + * }; + * + * To get a device_node of the ``node2`` node you may call this: + * of_parse_phandle_with_args(node3, "list", "#list-cells", 1, &args); + */ +static inline int of_parse_phandle_with_args(const struct device_node *np, + const char *list_name, + const char *cells_name, + int index, + struct of_phandle_args *out_args) +{ + int cell_count = -1; + + /* If cells_name is NULL we assume a cell count of 0 */ + if (!cells_name) + cell_count = 0; + + return __of_parse_phandle_with_args(np, list_name, cells_name, + cell_count, index, out_args); +} + +/** + * of_parse_phandle_with_fixed_args() - Find a node pointed by phandle in a list + * @np: pointer to a device tree node containing a list + * @list_name: property name that contains a list + * @cell_count: number of argument cells following the phandle + * @index: index of a phandle to parse out + * @out_args: optional pointer to output arguments structure (will be filled) + * + * This function is useful to parse lists of phandles and their arguments. + * Returns 0 on success and fills out_args, on error returns appropriate + * errno value. + * + * Caller is responsible to call of_node_put() on the returned out_args->np + * pointer. + * + * Example:: + * + * phandle1: node1 { + * }; + * + * phandle2: node2 { + * }; + * + * node3 { + * list = <&phandle1 0 2 &phandle2 2 3>; + * }; + * + * To get a device_node of the ``node2`` node you may call this: + * of_parse_phandle_with_fixed_args(node3, "list", 2, 1, &args); + */ +static inline int of_parse_phandle_with_fixed_args(const struct device_node *np, + const char *list_name, + int cell_count, + int index, + struct of_phandle_args *out_args) +{ + return __of_parse_phandle_with_args(np, list_name, NULL, cell_count, + index, out_args); +} + /** * of_property_count_u8_elems - Count the number of u8 elements in a property * -- cgit v1.2.3-58-ga151 From 2ca42c3ad9ed875b136065b010753a4caaaa1d38 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Tue, 18 Jan 2022 18:35:03 +0100 Subject: of: property: define of_property_read_u{8,16,32,64}_array() unconditionally We can get rid of all the empty stubs because all these functions call of_property_read_variable_u{8,16,32,64}_array() which already have an empty stub if CONFIG_OF is not defined. Signed-off-by: Michael Walle Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220118173504.2867523-3-michael@walle.cc --- include/linux/of.h | 274 ++++++++++++++++++++++++----------------------------- 1 file changed, 124 insertions(+), 150 deletions(-) diff --git a/include/linux/of.h b/include/linux/of.h index 16d76c92fbe0..2dc77430a91a 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -410,130 +410,6 @@ extern int of_detach_node(struct device_node *); #define of_match_ptr(_ptr) (_ptr) -/** - * of_property_read_u8_array - Find and read an array of u8 from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 8-bit value(s) from - * it. - * - * dts entry of array should be like: - * ``property = /bits/ 8 <0x50 0x60 0x70>;`` - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u8 value can be decoded. - */ -static inline int of_property_read_u8_array(const struct device_node *np, - const char *propname, - u8 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u8_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - -/** - * of_property_read_u16_array - Find and read an array of u16 from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 16-bit value(s) from - * it. - * - * dts entry of array should be like: - * ``property = /bits/ 16 <0x5000 0x6000 0x7000>;`` - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u16 value can be decoded. - */ -static inline int of_property_read_u16_array(const struct device_node *np, - const char *propname, - u16 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u16_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - -/** - * of_property_read_u32_array - Find and read an array of 32 bit integers - * from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 32-bit value(s) from - * it. - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u32 value can be decoded. - */ -static inline int of_property_read_u32_array(const struct device_node *np, - const char *propname, - u32 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u32_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - -/** - * of_property_read_u64_array - Find and read an array of 64 bit integers - * from a property. - * - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @out_values: pointer to return value, modified only if return value is 0. - * @sz: number of array elements to read - * - * Search for a property in a device node and read 64-bit value(s) from - * it. - * - * Return: 0 on success, -EINVAL if the property does not exist, - * -ENODATA if property does not have a value, and -EOVERFLOW if the - * property data isn't large enough. - * - * The out_values is modified only if a valid u64 value can be decoded. - */ -static inline int of_property_read_u64_array(const struct device_node *np, - const char *propname, - u64 *out_values, size_t sz) -{ - int ret = of_property_read_variable_u64_array(np, propname, out_values, - sz, 0); - if (ret >= 0) - return 0; - else - return ret; -} - /* * struct property *prop; * const __be32 *p; @@ -728,32 +604,6 @@ static inline int of_property_count_elems_of_size(const struct device_node *np, return -ENOSYS; } -static inline int of_property_read_u8_array(const struct device_node *np, - const char *propname, u8 *out_values, size_t sz) -{ - return -ENOSYS; -} - -static inline int of_property_read_u16_array(const struct device_node *np, - const char *propname, u16 *out_values, size_t sz) -{ - return -ENOSYS; -} - -static inline int of_property_read_u32_array(const struct device_node *np, - const char *propname, - u32 *out_values, size_t sz) -{ - return -ENOSYS; -} - -static inline int of_property_read_u64_array(const struct device_node *np, - const char *propname, - u64 *out_values, size_t sz) -{ - return -ENOSYS; -} - static inline int of_property_read_u32_index(const struct device_node *np, const char *propname, u32 index, u32 *out_value) { @@ -1328,6 +1178,130 @@ static inline bool of_property_read_bool(const struct device_node *np, return prop ? true : false; } +/** + * of_property_read_u8_array - Find and read an array of u8 from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 8-bit value(s) from + * it. + * + * dts entry of array should be like: + * ``property = /bits/ 8 <0x50 0x60 0x70>;`` + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u8 value can be decoded. + */ +static inline int of_property_read_u8_array(const struct device_node *np, + const char *propname, + u8 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u8_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + +/** + * of_property_read_u16_array - Find and read an array of u16 from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 16-bit value(s) from + * it. + * + * dts entry of array should be like: + * ``property = /bits/ 16 <0x5000 0x6000 0x7000>;`` + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u16 value can be decoded. + */ +static inline int of_property_read_u16_array(const struct device_node *np, + const char *propname, + u16 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u16_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + +/** + * of_property_read_u32_array - Find and read an array of 32 bit integers + * from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 32-bit value(s) from + * it. + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u32 value can be decoded. + */ +static inline int of_property_read_u32_array(const struct device_node *np, + const char *propname, + u32 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u32_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + +/** + * of_property_read_u64_array - Find and read an array of 64 bit integers + * from a property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_values: pointer to return value, modified only if return value is 0. + * @sz: number of array elements to read + * + * Search for a property in a device node and read 64-bit value(s) from + * it. + * + * Return: 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_values is modified only if a valid u64 value can be decoded. + */ +static inline int of_property_read_u64_array(const struct device_node *np, + const char *propname, + u64 *out_values, size_t sz) +{ + int ret = of_property_read_variable_u64_array(np, propname, out_values, + sz, 0); + if (ret >= 0) + return 0; + else + return ret; +} + static inline int of_property_read_u8(const struct device_node *np, const char *propname, u8 *out_value) -- cgit v1.2.3-58-ga151 From 9b22c17a3cc5f61b195da624cbb48634b4e42055 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 18 Jan 2022 11:34:04 -0600 Subject: of: Check 'of_node_reused' flag on of_match_device() Commit 0f153a1b8193 ("usb: chipidea: Set the DT node on the child device") caused the child device to match on the parent driver instead of the child's driver since the child's DT node pointer matched. The worst case result is a loop of the parent driver probing another instance and creating yet another child device eventually exhausting the stack. If the child driver happens to match first, then everything works fine. A device sharing the DT node should never do DT based driver matching, so let's simply check of_node_reused in of_match_device() to prevent that. Fixes: 0f153a1b8193 ("usb: chipidea: Set the DT node on the child device") Link: https://lore.kernel.org/all/20220114105620.GK18506@ediswmail.ad.cirrus.com/ Reported-by: Charles Keepax Cc: Frank Rowand Cc: Arnd Bergmann Cc: Tony Lindgren Cc: Greg Kroah-Hartman Cc: Peter Chen Tested-by: Charles Keepax Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220118173404.1891800-1-robh@kernel.org --- drivers/of/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/of/device.c b/drivers/of/device.c index b0800c260f64..874f031442dc 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -28,7 +28,7 @@ const struct of_device_id *of_match_device(const struct of_device_id *matches, const struct device *dev) { - if ((!matches) || (!dev->of_node)) + if (!matches || !dev->of_node || dev->of_node_reused) return NULL; return of_match_node(matches, dev->of_node); } -- cgit v1.2.3-58-ga151 From bd25c378527f3fde38a496ac2744cbd3924f0803 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 14 Jan 2022 07:52:07 +0100 Subject: parisc: Use safer strscpy() in setup_cmdline() Signed-off-by: Helge Deller --- arch/parisc/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index cceb09855e03..456087a2350c 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -56,7 +56,7 @@ void __init setup_cmdline(char **cmdline_p) /* called from hpux boot loader */ boot_command_line[0] = '\0'; } else { - strlcpy(boot_command_line, (char *)__va(boot_args[1]), + strscpy(boot_command_line, (char *)__va(boot_args[1]), COMMAND_LINE_SIZE); #ifdef CONFIG_BLK_DEV_INITRD @@ -68,7 +68,7 @@ void __init setup_cmdline(char **cmdline_p) #endif } - strcpy(command_line, boot_command_line); + strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; } -- cgit v1.2.3-58-ga151 From 5f7ee6e37a3cadefe45378c17c4285fa41141d92 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 14 Jan 2022 07:57:20 +0100 Subject: parisc: Autodetect default output device and set console= kernel parameter Usually palo (the PA-RISC boot loader) will check at boot time if the machine/firmware was configured to use the serial line (ttyS0, SERIAL_x) or the graphical display (tty0, graph) as default output device and add the correct "console=ttyS0" or "console=tty0" Linux kernel parameter to the kernel command line when starting the Linux kernel. But the kernel could also have been started via the HP-UX boot loader or directly in qemu, in which cases the console parameter is missing. This patch fixes this problem by adding the correct console= parameter if it's missing in the current kernel command line. Signed-off-by: Helge Deller --- arch/parisc/kernel/setup.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 456087a2350c..b91cb45ffd4e 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -48,6 +48,7 @@ struct proc_dir_entry * proc_mckinley_root __read_mostly = NULL; void __init setup_cmdline(char **cmdline_p) { extern unsigned int boot_args[]; + char *p; /* Collect stuff passed in from the boot loader */ @@ -59,6 +60,16 @@ void __init setup_cmdline(char **cmdline_p) strscpy(boot_command_line, (char *)__va(boot_args[1]), COMMAND_LINE_SIZE); + /* autodetect console type (if not done by palo yet) */ + p = boot_command_line; + if (!str_has_prefix(p, "console=") && !strstr(p, " console=")) { + strlcat(p, " console=", COMMAND_LINE_SIZE); + if (PAGE0->mem_cons.cl_class == CL_DUPLEX) + strlcat(p, "ttyS0", COMMAND_LINE_SIZE); + else + strlcat(p, "tty0", COMMAND_LINE_SIZE); + } + #ifdef CONFIG_BLK_DEV_INITRD if (boot_args[2] != 0) /* did palo pass us a ramdisk? */ { -- cgit v1.2.3-58-ga151 From 30f308999426871e1b896384093e9a681099f521 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 17 Jan 2022 10:10:10 +0100 Subject: parisc: Fix missing prototype for 'toc_intr' warning in toc.c Fix a missing prototype warning noticed by the kernel test robot. Reported-by: kernel test robot Signed-off-by: Helge Deller --- arch/parisc/include/asm/processor.h | 1 + arch/parisc/kernel/toc.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index b669f4b9040b..3a3d05438408 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -289,6 +289,7 @@ extern int _parisc_requires_coherency; extern int running_on_qemu; +extern void __noreturn toc_intr(struct pt_regs *regs); extern void toc_handler(void); extern unsigned int toc_handler_size; extern unsigned int toc_handler_csum; diff --git a/arch/parisc/kernel/toc.c b/arch/parisc/kernel/toc.c index fa5a10eaf0aa..e4b48d07afbd 100644 --- a/arch/parisc/kernel/toc.c +++ b/arch/parisc/kernel/toc.c @@ -10,6 +10,7 @@ #include #include #include +#include static unsigned int __aligned(16) toc_lock = 1; DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack) __visible; -- cgit v1.2.3-58-ga151 From 5e547d60dae7c66fe0c33654474eedcc1ddace67 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jan 2022 11:40:08 +0100 Subject: dt-bindings: display: bridge: drop Enric Balletbo i Serra from maintainers Enric Balletbo i Serra emails bounce: : Recipient address rejected: User unknown in local recipient table so drop him from the maintainers, similarly to commit 3119c28634dd ("MAINTAINERS: Chrome: Drop Enric Balletbo i Serra"). Add generic DRM bridge maintainers to Analogix ANX7814. Signed-off-by: Krzysztof Kozlowski Acked-by: Neil Armstrong Acked-by: Enric Balletbo i Serra Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220120104009.159147-1-krzysztof.kozlowski@canonical.com --- .../devicetree/bindings/display/bridge/analogix,anx7814.yaml | 4 +++- .../devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml | 1 - Documentation/devicetree/bindings/display/bridge/ps8640.yaml | 1 - 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml b/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml index 8e13f27b28ed..bce96b5b0db0 100644 --- a/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml +++ b/Documentation/devicetree/bindings/display/bridge/analogix,anx7814.yaml @@ -7,7 +7,9 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Analogix ANX7814 SlimPort (Full-HD Transmitter) maintainers: - - Enric Balletbo i Serra + - Andrzej Hajda + - Neil Armstrong + - Robert Foss properties: compatible: diff --git a/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml b/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml index 9f7cc6b757cb..a88a5d8c7ba5 100644 --- a/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml +++ b/Documentation/devicetree/bindings/display/bridge/google,cros-ec-anx7688.yaml @@ -8,7 +8,6 @@ title: ChromeOS EC ANX7688 HDMI to DP Converter through Type-C Port maintainers: - Nicolas Boichat - - Enric Balletbo i Serra description: | ChromeOS EC ANX7688 is a display bridge that converts HDMI 2.0 to diff --git a/Documentation/devicetree/bindings/display/bridge/ps8640.yaml b/Documentation/devicetree/bindings/display/bridge/ps8640.yaml index cdaf7a7a8f88..186e17be51fb 100644 --- a/Documentation/devicetree/bindings/display/bridge/ps8640.yaml +++ b/Documentation/devicetree/bindings/display/bridge/ps8640.yaml @@ -8,7 +8,6 @@ title: MIPI DSI to eDP Video Format Converter Device Tree Bindings maintainers: - Nicolas Boichat - - Enric Balletbo i Serra description: | The PS8640 is a low power MIPI-to-eDP video format converter supporting -- cgit v1.2.3-58-ga151 From 18a86e5907f7160fb548d0d717e0f842b310708a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 20 Jan 2022 11:40:09 +0100 Subject: dt-bindings: google,cros-ec: drop Enric Balletbo i Serra from maintainers Enric Balletbo i Serra emails bounce: : Recipient address rejected: User unknown in local recipient table so drop him from the maintainers, similarly to commit 3119c28634dd ("MAINTAINERS: Chrome: Drop Enric Balletbo i Serra"). Signed-off-by: Krzysztof Kozlowski Acked-by: Lee Jones Acked-by: Enric Balletbo i Serra Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20220120104009.159147-2-krzysztof.kozlowski@canonical.com --- Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml | 1 - Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml | 1 - .../devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml | 1 - Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml | 1 - Documentation/devicetree/bindings/mfd/google,cros-ec.yaml | 1 - 5 files changed, 5 deletions(-) diff --git a/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml b/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml index 20e1ccfc8630..2d82b44268db 100644 --- a/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml +++ b/Documentation/devicetree/bindings/extcon/extcon-usbc-cros-ec.yaml @@ -8,7 +8,6 @@ title: ChromeOS EC USB Type-C cable and accessories detection maintainers: - Benson Leung - - Enric Balletbo i Serra description: | On ChromeOS systems with USB Type C ports, the ChromeOS Embedded Controller is diff --git a/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml b/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml index b386e4128a79..6e1c70e9275e 100644 --- a/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml +++ b/Documentation/devicetree/bindings/i2c/google,cros-ec-i2c-tunnel.yaml @@ -10,7 +10,6 @@ title: I2C bus that tunnels through the ChromeOS EC (cros-ec) maintainers: - Doug Anderson - Benson Leung - - Enric Balletbo i Serra description: | On some ChromeOS board designs we've got a connection to the EC diff --git a/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml b/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml index 099b4be927d4..00e3b59641d2 100644 --- a/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml +++ b/Documentation/devicetree/bindings/iio/proximity/google,cros-ec-mkbp-proximity.yaml @@ -10,7 +10,6 @@ title: ChromeOS EC MKBP Proximity Sensor maintainers: - Stephen Boyd - Benson Leung - - Enric Balletbo i Serra description: | Google's ChromeOS EC sometimes has the ability to detect user proximity. diff --git a/Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml b/Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml index 5377b232fa10..e8f137abb03c 100644 --- a/Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml +++ b/Documentation/devicetree/bindings/input/google,cros-ec-keyb.yaml @@ -10,7 +10,6 @@ title: ChromeOS EC Keyboard maintainers: - Simon Glass - Benson Leung - - Enric Balletbo i Serra description: | Google's ChromeOS EC Keyboard is a simple matrix keyboard diff --git a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml index d793dd0316b7..58a1a9405228 100644 --- a/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml +++ b/Documentation/devicetree/bindings/mfd/google,cros-ec.yaml @@ -8,7 +8,6 @@ title: ChromeOS Embedded Controller maintainers: - Benson Leung - - Enric Balletbo i Serra - Guenter Roeck description: -- cgit v1.2.3-58-ga151 From c59cd507fb640c2acc6b07cb60d7f765839e18c7 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Wed, 19 Jan 2022 10:39:37 -0800 Subject: RISC-V: nommu_virt: Drop unused SLAB_MERGE_DEFAULT Our nommu_virt_defconfig set SLOB=y and SLAB_MERGE_DEFAULT=n. As of eb52c0fc2331 ("mm: Make SLAB_MERGE_DEFAULT depend on SL[AU]B") it's no longer necessary to set the second, which appears to never have had any effect for SLOB=y anyway. This was suggested by savedefconfig. Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/nommu_virt_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig index 5f078156dfcd..e1c9864b6237 100644 --- a/arch/riscv/configs/nommu_virt_defconfig +++ b/arch/riscv/configs/nommu_virt_defconfig @@ -24,7 +24,6 @@ CONFIG_EXPERT=y # CONFIG_VM_EVENT_COUNTERS is not set # CONFIG_COMPAT_BRK is not set CONFIG_SLOB=y -# CONFIG_SLAB_MERGE_DEFAULT is not set # CONFIG_MMU is not set CONFIG_SOC_VIRT=y CONFIG_SMP=y -- cgit v1.2.3-58-ga151 From b0ac702f3329cdc8a06dcaac73183d4b5a2b942d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 18 Jan 2022 19:39:05 -0800 Subject: Documentation: fix firewire.rst ABI file path error Adjust the path of the ABI files for firewire.rst to prevent a documentation build error. Prevents this problem: Sphinx parallel build error: docutils.utils.SystemMessage: Documentation/driver-api/firewire.rst:22: (SEVERE/4) Problems with "include" directive path: InputError: [Errno 2] No such file or directory: '../Documentation/driver-api/ABI/stable/firewire-cdev'. Fixes: 2f4830ef96d2 ("FireWire: add driver-api Introduction section") Signed-off-by: Randy Dunlap Tested-by: Akira Yokosawa Link: https://lore.kernel.org/r/20220119033905.4779-1-rdunlap@infradead.org Signed-off-by: Jonathan Corbet --- Documentation/driver-api/firewire.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/driver-api/firewire.rst b/Documentation/driver-api/firewire.rst index 94a2d7f01d99..d3cfa73cbb2b 100644 --- a/Documentation/driver-api/firewire.rst +++ b/Documentation/driver-api/firewire.rst @@ -19,7 +19,7 @@ of kernel interfaces is available via exported symbols in `firewire-core` module Firewire char device data structures ==================================== -.. include:: /ABI/stable/firewire-cdev +.. include:: ../ABI/stable/firewire-cdev :literal: .. kernel-doc:: include/uapi/linux/firewire-cdev.h @@ -28,7 +28,7 @@ Firewire char device data structures Firewire device probing and sysfs interfaces ============================================ -.. include:: /ABI/stable/sysfs-bus-firewire +.. include:: ../ABI/stable/sysfs-bus-firewire :literal: .. kernel-doc:: drivers/firewire/core-device.c -- cgit v1.2.3-58-ga151 From a0af3d1104f752b6d0dba71788e3fddd67c857a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Jan 2022 18:54:52 +0100 Subject: PCI/MSI: Prevent UAF in error path When the core MSI allocation fails, then the PCI/MSI code uses an already freed MSI descriptor to unmask the MSI mask register in order to bring it back into reset state. Remove MSI_FLAG_FREE_MSI_DESCS from the PCI/MSI irqdomain flags and let the PCI/MSI code free the MSI descriptors after usage. Fixes: 0f62d941acf9 ("genirq/msi: Provide msi_domain_alloc/free_irqs_descs_locked()") Reported-by: Tong Zhang Signed-off-by: Thomas Gleixner Tested-by: Tong Zhang Acked-by: Bjorn Helgaas Link: https://lore.kernel.org/r/87r1938vbn.ffs@tglx --- drivers/pci/msi/irqdomain.c | 4 ++-- drivers/pci/msi/legacy.c | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 0d63541c4052..e9cf318e6670 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -28,6 +28,7 @@ void pci_msi_teardown_msi_irqs(struct pci_dev *dev) msi_domain_free_irqs_descs_locked(domain, &dev->dev); else pci_msi_legacy_teardown_msi_irqs(dev); + msi_free_msi_descs(&dev->dev); } /** @@ -171,8 +172,7 @@ struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) pci_msi_domain_update_chip_ops(info); - info->flags |= MSI_FLAG_ACTIVATE_EARLY | MSI_FLAG_DEV_SYSFS | - MSI_FLAG_FREE_MSI_DESCS; + info->flags |= MSI_FLAG_ACTIVATE_EARLY | MSI_FLAG_DEV_SYSFS; if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) info->flags |= MSI_FLAG_MUST_REACTIVATE; diff --git a/drivers/pci/msi/legacy.c b/drivers/pci/msi/legacy.c index cdbb4689db78..db761adef652 100644 --- a/drivers/pci/msi/legacy.c +++ b/drivers/pci/msi/legacy.c @@ -77,5 +77,4 @@ void pci_msi_legacy_teardown_msi_irqs(struct pci_dev *dev) { msi_device_destroy_sysfs(&dev->dev); arch_teardown_msi_irqs(dev); - msi_free_msi_descs(&dev->dev); } -- cgit v1.2.3-58-ga151 From b875b39e7373dcaccb19a600a52a956061c2c833 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 21 Jan 2022 17:19:33 +0900 Subject: ata: pata_octeon_cf: fix call to trace_ata_bmdma_stop() The first argument of trace_ata_bmdma_stop() must be a pointer to a struct ata_port, not to a struct ata_queued_cmd. Reported-by: Linux Kernel Functional Testing Fixes: d3e140f2b008 ("ata: pata_octeon_cf: Drop pointless VPRINTK() calls and convert the remaining one") Signed-off-by: Damien Le Moal Tested-by: Linux Kernel Functional Testing --- drivers/ata/pata_octeon_cf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c index 0912846bc1b0..05c2ab375756 100644 --- a/drivers/ata/pata_octeon_cf.c +++ b/drivers/ata/pata_octeon_cf.c @@ -595,7 +595,7 @@ static unsigned int octeon_cf_dma_finished(struct ata_port *ap, union cvmx_mio_boot_dma_intx dma_int; u8 status; - trace_ata_bmdma_stop(qc, &qc->tf, qc->tag); + trace_ata_bmdma_stop(ap, &qc->tf, qc->tag); if (ap->hsm_task_state != HSM_ST_LAST) return 0; -- cgit v1.2.3-58-ga151 From 546e41ac994cc185ef3de610ca849a294b5df3ba Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Tue, 18 Jan 2022 14:37:55 -0800 Subject: HID: wacom: Reset expected and received contact counts at the same time These two values go hand-in-hand and must be valid for the driver to behave correctly. We are currently lazy about updating the values and rely on the "expected" code flow to take care of making sure they're valid at the point they're needed. The "expected" flow changed somewhat with commit f8b6a74719b5 ("HID: wacom: generic: Support multiple tools per report"), however. This led to problems with the DTH-2452 due (in part) to *all* contacts being fully processed -- even those past the expected contact count. Specifically, the received count gets reset to 0 once all expected fingers are processed, but not the expected count. The rest of the contacts in the report are then *also* processed since now the driver thinks we've only processed 0 of N expected contacts. Later commits such as 7fb0413baa7f (HID: wacom: Use "Confidence" flag to prevent reporting invalid contacts) worked around the DTH-2452 issue by skipping the invalid contacts at the end of the report, but this is not a complete fix. The confidence flag cannot be relied on when a contact is removed (see the following patch), and dealing with that condition re-introduces the DTH-2452 issue unless we also address this contact count laziness. By resetting expected and received counts at the same time we ensure the driver understands that there are 0 more contacts expected in the report. Similarly, we also make sure to reset the received count if for some reason we're out of sync in the pre-report phase. Link: https://github.com/linuxwacom/input-wacom/issues/288 Fixes: f8b6a74719b5 ("HID: wacom: generic: Support multiple tools per report") CC: stable@vger.kernel.org Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 2a4cc39962e7..5978399ae7d2 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2692,11 +2692,14 @@ static void wacom_wac_finger_pre_report(struct hid_device *hdev, hid_data->cc_index >= 0) { struct hid_field *field = report->field[hid_data->cc_index]; int value = field->value[hid_data->cc_value_index]; - if (value) + if (value) { hid_data->num_expected = value; + hid_data->num_received = 0; + } } else { hid_data->num_expected = wacom_wac->features.touch_max; + hid_data->num_received = 0; } } @@ -2724,6 +2727,7 @@ static void wacom_wac_finger_report(struct hid_device *hdev, input_sync(input); wacom_wac->hid_data.num_received = 0; + wacom_wac->hid_data.num_expected = 0; /* keep touch state for pen event */ wacom_wac->shared->touch_down = wacom_wac_finger_count_touches(wacom_wac); -- cgit v1.2.3-58-ga151 From df03e9bd6d4806619b4cdc91a3d7695818a8e2b7 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Tue, 18 Jan 2022 14:37:56 -0800 Subject: HID: wacom: Ignore the confidence flag when a touch is removed AES hardware may internally re-classify a contact that it thought was intentional as a palm. Intentional contacts are reported as "down" with the confidence bit set. When this re-classification occurs, however, the state transitions to "up" with the confidence bit cleared. This kind of transition appears to be legal according to Microsoft docs, but we do not handle it correctly. Because the confidence bit is clear, we don't call `wacom_wac_finger_slot` and update userspace. This causes hung touches that confuse userspace and interfere with pen arbitration. This commit adds a special case to ignore the confidence flag if a contact is reported as removed. This ensures we do not leave a hung touch if one of these re-classification events occured. Ideally we'd have some way to also let userspace know that the touch has been re-classified as a palm and needs to be canceled, but that's not possible right now :) Link: https://github.com/linuxwacom/input-wacom/issues/288 Fixes: 7fb0413baa7f (HID: wacom: Use "Confidence" flag to prevent reporting invalid contacts) CC: stable@vger.kernel.org Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 5978399ae7d2..92b52b1de526 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2588,6 +2588,24 @@ static void wacom_wac_finger_slot(struct wacom_wac *wacom_wac, } } +static bool wacom_wac_slot_is_active(struct input_dev *dev, int key) +{ + struct input_mt *mt = dev->mt; + struct input_mt_slot *s; + + if (!mt) + return false; + + for (s = mt->slots; s != mt->slots + mt->num_slots; s++) { + if (s->key == key && + input_mt_get_value(s, ABS_MT_TRACKING_ID) >= 0) { + return true; + } + } + + return false; +} + static void wacom_wac_finger_event(struct hid_device *hdev, struct hid_field *field, struct hid_usage *usage, __s32 value) { @@ -2638,9 +2656,14 @@ static void wacom_wac_finger_event(struct hid_device *hdev, } if (usage->usage_index + 1 == field->report_count) { - if (equivalent_usage == wacom_wac->hid_data.last_slot_field && - wacom_wac->hid_data.confidence) - wacom_wac_finger_slot(wacom_wac, wacom_wac->touch_input); + if (equivalent_usage == wacom_wac->hid_data.last_slot_field) { + bool touch_removed = wacom_wac_slot_is_active(wacom_wac->touch_input, + wacom_wac->hid_data.id) && !wacom_wac->hid_data.tipswitch; + + if (wacom_wac->hid_data.confidence || touch_removed) { + wacom_wac_finger_slot(wacom_wac, wacom_wac->touch_input); + } + } } } -- cgit v1.2.3-58-ga151 From 20f3cf5f860f9f267a6a6e5642d3d0525edb1814 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Tue, 18 Jan 2022 14:38:41 -0800 Subject: HID: wacom: Avoid using stale array indicies to read contact count If we ever see a touch report with contact count data we initialize several variables used to read the contact count in the pre-report phase. These variables are never reset if we process a report which doesn't contain a contact count, however. This can cause the pre- report function to trigger a read of arbitrary memory (e.g. NULL if we're lucky) and potentially crash the driver. This commit restores resetting of the variables back to default "none" values that were used prior to the commit mentioned below. Link: https://github.com/linuxwacom/input-wacom/issues/276 Fixes: 003f50ab673c (HID: wacom: Update last_slot_field during pre_report phase) CC: stable@vger.kernel.org Signed-off-by: Jason Gerecke Reviewed-by: Ping Cheng Signed-off-by: Jiri Kosina --- drivers/hid/wacom_wac.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/hid/wacom_wac.c b/drivers/hid/wacom_wac.c index 92b52b1de526..a7176fc0635d 100644 --- a/drivers/hid/wacom_wac.c +++ b/drivers/hid/wacom_wac.c @@ -2682,6 +2682,10 @@ static void wacom_wac_finger_pre_report(struct hid_device *hdev, hid_data->confidence = true; + hid_data->cc_report = 0; + hid_data->cc_index = -1; + hid_data->cc_value_index = -1; + for (i = 0; i < report->maxfield; i++) { struct hid_field *field = report->field[i]; int j; -- cgit v1.2.3-58-ga151 From 80a00ab8344f0fe4d555a1f97960215b659436e9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 13:30:17 +0000 Subject: fscache: Fix the volume collision wait condition The condition that the waits in fscache_wait_on_volume_collision() are waiting until are inverted. This suddenly started happening on the upstream kernel with something like the following appearing in dmesg when running xfstests: CacheFiles: cachefiles: Inode already in use: Iafs,example.com,100055 Fix them by inverting the conditions. Fixes: 62ab63352350 ("fscache: Implement volume registration") Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251398010.3435901.943876048104930939.stgit@warthog.procyon.org.uk/ # v1 --- fs/fscache/volume.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fscache/volume.c b/fs/fscache/volume.c index a57c6cbee858..f2aa7dbad766 100644 --- a/fs/fscache/volume.c +++ b/fs/fscache/volume.c @@ -142,12 +142,12 @@ static void fscache_wait_on_volume_collision(struct fscache_volume *candidate, unsigned int collidee_debug_id) { wait_var_event_timeout(&candidate->flags, - fscache_is_acquire_pending(candidate), 20 * HZ); + !fscache_is_acquire_pending(candidate), 20 * HZ); if (!fscache_is_acquire_pending(candidate)) { pr_notice("Potential volume collision new=%08x old=%08x", candidate->debug_id, collidee_debug_id); fscache_stat(&fscache_n_volumes_collision); - wait_var_event(&candidate->flags, fscache_is_acquire_pending(candidate)); + wait_var_event(&candidate->flags, !fscache_is_acquire_pending(candidate)); } } -- cgit v1.2.3-58-ga151 From 5638b067d370583c6c455f019129ce33340b4142 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 14:13:59 +0000 Subject: cachefiles: Calculate the blockshift in terms of bytes, not pages Cachefiles keeps track of how much space is available on the backing filesystem and refuses new writes permission to start if there isn't enough (we especially don't want ENOSPC happening). It also tracks the amount of data pending in DIO writes (cache->b_writing) and reduces the amount of free space available by this amount before deciding if it can set up a new write. However, the old fscache I/O API was very much page-granularity dependent and, as such, cachefiles's cache->bshift was meant to be a multiplier to get from PAGE_SIZE to block size (ie. a blocksize of 512 would give a shift of 3 for a 4KiB page) - and this was incorrectly being used to turn the number of bytes in a DIO write into a number of blocks, leading to a massive over estimation of the amount of data in flight. Fix this by changing cache->bshift to be a multiplier from bytes to blocksize and deal with quantities of blocks, not quantities of pages. Fix also the rounding in the calculation in cachefiles_write() which needs a "- 1" inserting. Fixes: 047487c947e8 ("cachefiles: Implement the I/O routines") Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251398954.3435901.7138806620218474123.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/cache.c | 7 ++----- fs/cachefiles/internal.h | 2 +- fs/cachefiles/io.c | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index ce4d4785003c..1e9c71666c6a 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -84,9 +84,7 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) goto error_unsupported; cache->bsize = stats.f_bsize; - cache->bshift = 0; - if (stats.f_bsize < PAGE_SIZE) - cache->bshift = PAGE_SHIFT - ilog2(stats.f_bsize); + cache->bshift = ilog2(stats.f_bsize); _debug("blksize %u (shift %u)", cache->bsize, cache->bshift); @@ -106,7 +104,6 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) (unsigned long long) cache->fcull, (unsigned long long) cache->fstop); - stats.f_blocks >>= cache->bshift; do_div(stats.f_blocks, 100); cache->bstop = stats.f_blocks * cache->bstop_percent; cache->bcull = stats.f_blocks * cache->bcull_percent; @@ -209,7 +206,7 @@ int cachefiles_has_space(struct cachefiles_cache *cache, return ret; } - b_avail = stats.f_bavail >> cache->bshift; + b_avail = stats.f_bavail; b_writing = atomic_long_read(&cache->b_writing); if (b_avail > b_writing) b_avail -= b_writing; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 8dd54d9375b6..c793d33b0224 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -86,7 +86,7 @@ struct cachefiles_cache { unsigned bcull_percent; /* when to start culling (% blocks) */ unsigned bstop_percent; /* when to stop allocating (% blocks) */ unsigned bsize; /* cache's block size */ - unsigned bshift; /* min(ilog2(PAGE_SIZE / bsize), 0) */ + unsigned bshift; /* ilog2(bsize) */ uint64_t frun; /* when to stop culling */ uint64_t fcull; /* when to start culling */ uint64_t fstop; /* when to stop allocating */ diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 60b1eac2ce78..04eb52736990 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -264,7 +264,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, ki->term_func = term_func; ki->term_func_priv = term_func_priv; ki->was_async = true; - ki->b_writing = (len + (1 << cache->bshift)) >> cache->bshift; + ki->b_writing = (len + (1 << cache->bshift) - 1) >> cache->bshift; if (ki->term_func) ki->iocb.ki_complete = cachefiles_write_complete; -- cgit v1.2.3-58-ga151 From c7ca73155762684a896ba57edf48519b645ea528 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Fri, 14 Jan 2022 15:30:29 +0800 Subject: cachefiles: set default tag name if it's unspecified fscache_acquire_cache() requires a non-empty name, while 'tag ' command is optional for cachefilesd. Thus set default tag name if it's unspecified to avoid the regression of cachefilesd. The logic is the same with that before rewritten. Signed-off-by: Jeffle Xu Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251399914.3435901.4761991152407411408.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/daemon.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 40a792421fc1..7ac04ee2c0a0 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -703,6 +703,17 @@ static int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) return -EBUSY; } + /* Make sure we have copies of the tag string */ + if (!cache->tag) { + /* + * The tag string is released by the fops->release() + * function, so we don't release it on error here + */ + cache->tag = kstrdup("CacheFiles", GFP_KERNEL); + if (!cache->tag) + return -ENOMEM; + } + return cachefiles_add_cache(cache); } -- cgit v1.2.3-58-ga151 From 8c39b8bc82aafcc8dd378bd79c76fac8e8a89c8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 11:44:54 +0000 Subject: cachefiles: Make some tracepoint adjustments Make some adjustments to tracepoints to make the tracing a bit more followable: (1) Standardise on displaying the backing inode number as "B=" with no leading zeros. (2) Make the cachefiles_lookup tracepoint log the directory inode number as well as the looked-up inode number. (3) Add a cachefiles_lookup tracepoint into cachefiles_get_directory() to log directory lookup. (4) Add a new cachefiles_mkdir tracepoint and use that to log a successful mkdir from cachefiles_get_directory(). (5) Make the cachefiles_unlink and cachefiles_rename tracepoints log the inode number of the affected file/dir rather than dentry struct pointers. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251403694.3435901.9797725381831316715.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/namei.c | 8 ++-- include/trace/events/cachefiles.h | 82 ++++++++++++++++++++++++--------------- 2 files changed, 56 insertions(+), 34 deletions(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 9bd692870617..52c9f0864a87 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -101,6 +101,7 @@ retry: subdir = lookup_one_len(dirname, dir, strlen(dirname)); else subdir = ERR_PTR(ret); + trace_cachefiles_lookup(NULL, dir, subdir); if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_backing_inode(dir), PTR_ERR(subdir), @@ -135,6 +136,7 @@ retry: cachefiles_trace_mkdir_error); goto mkdir_error; } + trace_cachefiles_mkdir(dir, subdir); if (unlikely(d_unhashed(subdir))) { cachefiles_put_directory(subdir); @@ -233,7 +235,7 @@ static int cachefiles_unlink(struct cachefiles_cache *cache, }; int ret; - trace_cachefiles_unlink(object, dentry, why); + trace_cachefiles_unlink(object, d_inode(dentry)->i_ino, why); ret = security_path_unlink(&path, dentry); if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); @@ -386,7 +388,7 @@ try_again: .new_dir = d_inode(cache->graveyard), .new_dentry = grave, }; - trace_cachefiles_rename(object, rep, grave, why); + trace_cachefiles_rename(object, d_inode(rep)->i_ino, why); ret = cachefiles_inject_read_error(); if (ret == 0) ret = vfs_rename(&rd); @@ -617,7 +619,7 @@ bool cachefiles_look_up_object(struct cachefiles_object *object) object->d_name_len); else dentry = ERR_PTR(ret); - trace_cachefiles_lookup(object, dentry); + trace_cachefiles_lookup(object, fan, dentry); if (IS_ERR(dentry)) { if (dentry == ERR_PTR(-ENOENT)) goto new_file; diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index 1172529b5b49..093c4acb7a3a 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -233,25 +233,48 @@ TRACE_EVENT(cachefiles_ref, TRACE_EVENT(cachefiles_lookup, TP_PROTO(struct cachefiles_object *obj, + struct dentry *dir, struct dentry *de), - TP_ARGS(obj, de), + TP_ARGS(obj, dir, de), TP_STRUCT__entry( __field(unsigned int, obj ) __field(short, error ) + __field(unsigned long, dino ) __field(unsigned long, ino ) ), TP_fast_assign( - __entry->obj = obj->debug_id; + __entry->obj = obj ? obj->debug_id : 0; + __entry->dino = d_backing_inode(dir)->i_ino; __entry->ino = (!IS_ERR(de) && d_backing_inode(de) ? d_backing_inode(de)->i_ino : 0); __entry->error = IS_ERR(de) ? PTR_ERR(de) : 0; ), - TP_printk("o=%08x i=%lx e=%d", - __entry->obj, __entry->ino, __entry->error) + TP_printk("o=%08x dB=%lx B=%lx e=%d", + __entry->obj, __entry->dino, __entry->ino, __entry->error) + ); + +TRACE_EVENT(cachefiles_mkdir, + TP_PROTO(struct dentry *dir, struct dentry *subdir), + + TP_ARGS(dir, subdir), + + TP_STRUCT__entry( + __field(unsigned int, dir ) + __field(unsigned int, subdir ) + ), + + TP_fast_assign( + __entry->dir = d_backing_inode(dir)->i_ino; + __entry->subdir = d_backing_inode(subdir)->i_ino; + ), + + TP_printk("dB=%x sB=%x", + __entry->dir, + __entry->subdir) ); TRACE_EVENT(cachefiles_tmpfile, @@ -269,7 +292,7 @@ TRACE_EVENT(cachefiles_tmpfile, __entry->backer = backer->i_ino; ), - TP_printk("o=%08x b=%08x", + TP_printk("o=%08x B=%x", __entry->obj, __entry->backer) ); @@ -289,61 +312,58 @@ TRACE_EVENT(cachefiles_link, __entry->backer = backer->i_ino; ), - TP_printk("o=%08x b=%08x", + TP_printk("o=%08x B=%x", __entry->obj, __entry->backer) ); TRACE_EVENT(cachefiles_unlink, TP_PROTO(struct cachefiles_object *obj, - struct dentry *de, + ino_t ino, enum fscache_why_object_killed why), - TP_ARGS(obj, de, why), + TP_ARGS(obj, ino, why), /* Note that obj may be NULL */ TP_STRUCT__entry( __field(unsigned int, obj ) - __field(struct dentry *, de ) + __field(unsigned int, ino ) __field(enum fscache_why_object_killed, why ) ), TP_fast_assign( __entry->obj = obj ? obj->debug_id : UINT_MAX; - __entry->de = de; + __entry->ino = ino; __entry->why = why; ), - TP_printk("o=%08x d=%p w=%s", - __entry->obj, __entry->de, + TP_printk("o=%08x B=%x w=%s", + __entry->obj, __entry->ino, __print_symbolic(__entry->why, cachefiles_obj_kill_traces)) ); TRACE_EVENT(cachefiles_rename, TP_PROTO(struct cachefiles_object *obj, - struct dentry *de, - struct dentry *to, + ino_t ino, enum fscache_why_object_killed why), - TP_ARGS(obj, de, to, why), + TP_ARGS(obj, ino, why), /* Note that obj may be NULL */ TP_STRUCT__entry( __field(unsigned int, obj ) - __field(struct dentry *, de ) - __field(struct dentry *, to ) + __field(unsigned int, ino ) __field(enum fscache_why_object_killed, why ) ), TP_fast_assign( __entry->obj = obj ? obj->debug_id : UINT_MAX; - __entry->de = de; - __entry->to = to; + __entry->ino = ino; __entry->why = why; ), - TP_printk("o=%08x d=%p t=%p w=%s", - __entry->obj, __entry->de, __entry->to, + TP_printk("o=%08x B=%x w=%s", + __entry->obj, __entry->ino, __print_symbolic(__entry->why, cachefiles_obj_kill_traces)) ); @@ -370,7 +390,7 @@ TRACE_EVENT(cachefiles_coherency, __entry->ino = ino; ), - TP_printk("o=%08x %s i=%llx c=%u", + TP_printk("o=%08x %s B=%llx c=%u", __entry->obj, __print_symbolic(__entry->why, cachefiles_coherency_traces), __entry->ino, @@ -397,7 +417,7 @@ TRACE_EVENT(cachefiles_vol_coherency, __entry->ino = ino; ), - TP_printk("V=%08x %s i=%llx", + TP_printk("V=%08x %s B=%llx", __entry->vol, __print_symbolic(__entry->why, cachefiles_coherency_traces), __entry->ino) @@ -435,7 +455,7 @@ TRACE_EVENT(cachefiles_prep_read, __entry->cache_inode = cache_inode; ), - TP_printk("R=%08x[%u] %s %s f=%02x s=%llx %zx ni=%x b=%x", + TP_printk("R=%08x[%u] %s %s f=%02x s=%llx %zx ni=%x B=%x", __entry->rreq, __entry->index, __print_symbolic(__entry->source, netfs_sreq_sources), __print_symbolic(__entry->why, cachefiles_prepare_read_traces), @@ -466,7 +486,7 @@ TRACE_EVENT(cachefiles_read, __entry->len = len; ), - TP_printk("o=%08x b=%08x s=%llx l=%zx", + TP_printk("o=%08x B=%x s=%llx l=%zx", __entry->obj, __entry->backer, __entry->start, @@ -495,7 +515,7 @@ TRACE_EVENT(cachefiles_write, __entry->len = len; ), - TP_printk("o=%08x b=%08x s=%llx l=%zx", + TP_printk("o=%08x B=%x s=%llx l=%zx", __entry->obj, __entry->backer, __entry->start, @@ -524,7 +544,7 @@ TRACE_EVENT(cachefiles_trunc, __entry->why = why; ), - TP_printk("o=%08x b=%08x %s l=%llx->%llx", + TP_printk("o=%08x B=%x %s l=%llx->%llx", __entry->obj, __entry->backer, __print_symbolic(__entry->why, cachefiles_trunc_traces), @@ -549,7 +569,7 @@ TRACE_EVENT(cachefiles_mark_active, __entry->inode = inode->i_ino; ), - TP_printk("o=%08x i=%lx", + TP_printk("o=%08x B=%lx", __entry->obj, __entry->inode) ); @@ -570,7 +590,7 @@ TRACE_EVENT(cachefiles_mark_inactive, __entry->inode = inode->i_ino; ), - TP_printk("o=%08x i=%lx", + TP_printk("o=%08x B=%lx", __entry->obj, __entry->inode) ); @@ -594,7 +614,7 @@ TRACE_EVENT(cachefiles_vfs_error, __entry->where = where; ), - TP_printk("o=%08x b=%08x %s e=%d", + TP_printk("o=%08x B=%x %s e=%d", __entry->obj, __entry->backer, __print_symbolic(__entry->where, cachefiles_error_traces), @@ -621,7 +641,7 @@ TRACE_EVENT(cachefiles_io_error, __entry->where = where; ), - TP_printk("o=%08x b=%08x %s e=%d", + TP_printk("o=%08x B=%x %s e=%d", __entry->obj, __entry->backer, __print_symbolic(__entry->where, cachefiles_error_traces), -- cgit v1.2.3-58-ga151 From b64a3314989df8e44c114f377808407f36dbf4f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 14 Jan 2022 11:05:13 +0000 Subject: cachefiles: Trace active-mark failure Add a tracepoint to log failure to apply an active mark to a file in addition to tracing successfully setting and unsetting the mark. Also include the backing file inode number in the message logged to dmesg. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/164251404666.3435901.17331742792401482190.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/namei.c | 4 +++- include/trace/events/cachefiles.h | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 52c9f0864a87..f256c8aff7bb 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -25,7 +25,9 @@ static bool __cachefiles_mark_inode_in_use(struct cachefiles_object *object, trace_cachefiles_mark_active(object, inode); can_use = true; } else { - pr_notice("cachefiles: Inode already in use: %pd\n", dentry); + trace_cachefiles_mark_failed(object, inode); + pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + dentry, inode->i_ino); } return can_use; diff --git a/include/trace/events/cachefiles.h b/include/trace/events/cachefiles.h index 093c4acb7a3a..c6f5aa74db89 100644 --- a/include/trace/events/cachefiles.h +++ b/include/trace/events/cachefiles.h @@ -573,6 +573,27 @@ TRACE_EVENT(cachefiles_mark_active, __entry->obj, __entry->inode) ); +TRACE_EVENT(cachefiles_mark_failed, + TP_PROTO(struct cachefiles_object *obj, + struct inode *inode), + + TP_ARGS(obj, inode), + + /* Note that obj may be NULL */ + TP_STRUCT__entry( + __field(unsigned int, obj ) + __field(ino_t, inode ) + ), + + TP_fast_assign( + __entry->obj = obj ? obj->debug_id : 0; + __entry->inode = inode->i_ino; + ), + + TP_printk("o=%08x B=%lx", + __entry->obj, __entry->inode) + ); + TRACE_EVENT(cachefiles_mark_inactive, TP_PROTO(struct cachefiles_object *obj, struct inode *inode), -- cgit v1.2.3-58-ga151 From 14b9d0902dfa25dac9c41bf346aa655fdeafe5b2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Jan 2022 10:51:13 +0000 Subject: cachefiles: Explain checks in a comment Add a comment to explain the checks that cachefiles is making of the backing filesystem[1]. Suggested-by: Jeff Layton Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/568749bd7cc02908ecf6f3d6a611b6f9cf5c4afd.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/164251405621.3435901.771439791811515914.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/cache.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index 1e9c71666c6a..2b2879c5d1d2 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -49,7 +49,13 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) goto error_unsupported; } - /* check parameters */ + /* Check features of the backing filesystem: + * - Directories must support looking up and directory creation + * - We use xattrs to store metadata + * - We need to be able to query the amount of space available + * - We want to be able to sync the filesystem when stopping the cache + * - We use DIO to/from pages, so the blocksize mustn't be too big. + */ ret = -EOPNOTSUPP; if (d_is_negative(root) || !d_backing_inode(root)->i_op->lookup || -- cgit v1.2.3-58-ga151 From 6633213139d827fb9abf9a9a280f3d9e89fc7091 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Jan 2022 10:57:45 +0000 Subject: cachefiles: Check that the backing filesystem supports tmpfiles Add a check that the backing filesystem supports the creation of tmpfiles[1]. Suggested-by: Jeff Layton Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/568749bd7cc02908ecf6f3d6a611b6f9cf5c4afd.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/164251406558.3435901.1249023136670058162.stgit@warthog.procyon.org.uk/ # v1 --- fs/cachefiles/cache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index 2b2879c5d1d2..7077f72e6f47 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -51,6 +51,7 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) /* Check features of the backing filesystem: * - Directories must support looking up and directory creation + * - We create tmpfiles to handle invalidation * - We use xattrs to store metadata * - We need to be able to query the amount of space available * - We want to be able to sync the filesystem when stopping the cache @@ -60,6 +61,7 @@ int cachefiles_add_cache(struct cachefiles_cache *cache) if (d_is_negative(root) || !d_backing_inode(root)->i_op->lookup || !d_backing_inode(root)->i_op->mkdir || + !d_backing_inode(root)->i_op->tmpfile || !(d_backing_inode(root)->i_opflags & IOP_XATTR) || !root->d_sb->s_op->statfs || !root->d_sb->s_op->sync_fs || -- cgit v1.2.3-58-ga151 From c522e3ad296b7b692ed3960dfde467f2a34b434f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Jan 2022 09:28:41 +0000 Subject: fscache: Add a comment explaining how page-release optimisation works Add a comment into fscache_note_page_release() to explain how the page-release optimisation logic works[1]. It's not entirely obvious as it has nothing to do with whether or not the netfs file contains data. FSCACHE_COOKIE_NO_DATA_TO_READ is set if we have no data in the cache yet (ie. the backing file lookup was negative, the file is 0 length or the cookie got invalidated). It means that we have no data in the cache, not that the file is necessarily empty on the server. FSCACHE_COOKIE_HAVE_DATA is set once we've stored data in the backing file. From that point on, we have data we *could* read - however, it's covered by pages in the netfs pagecache until at such time one of those covering pages is released. So if we've written data to the cache (HAVE_DATA) and there wasn't any data in the cache when we started (NO_DATA_TO_READ), it may no longer be true that we can skip reading from the cache. Read skipping is done by cachefiles_prepare_read(). Note that tracking is not done on a per-page basis, but only on a per-file basis. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/043a206f03929c2667a465314144e518070a9b2d.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/164251408479.3435901.9540165422908194636.stgit@warthog.procyon.org.uk/ # v1 --- include/linux/fscache.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/fscache.h b/include/linux/fscache.h index ede50406bcb0..296c5f1d9f35 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -665,6 +665,11 @@ static inline void fscache_clear_inode_writeback(struct fscache_cookie *cookie, static inline void fscache_note_page_release(struct fscache_cookie *cookie) { + /* If we've written data to the cache (HAVE_DATA) and there wasn't any + * data in the cache when we started (NO_DATA_TO_READ), it may no + * longer be true that we can skip reading from the cache - so clear + * the flag that causes reads to be skipped. + */ if (cookie && test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) -- cgit v1.2.3-58-ga151 From cef0223191452b3c493a1070baad9ffe806babac Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Tue, 28 Dec 2021 20:44:19 +0800 Subject: netfs: Make ops->init_rreq() optional Make the ops->init_rreq() callback optional. This isn't required for the erofs changes I'm implementing to do on-demand read through fscache[1]. Further, ceph has an empty init_rreq method that can then be removed and it's marked optional in the documentation. Signed-off-by: Jeffle Xu Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/20211227125444.21187-1-jefflexu@linux.alibaba.com/ [1] Link: https://lore.kernel.org/r/20211228124419.103020-1-jefflexu@linux.alibaba.com Link: https://lore.kernel.org/r/164251410387.3435901.2504600788262093313.stgit@warthog.procyon.org.uk/ # v1 --- fs/ceph/addr.c | 5 ----- fs/netfs/read_helper.c | 3 ++- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b3d9459c9bbd..c98e5238a1b6 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -297,10 +297,6 @@ out: dout("%s: result %d\n", __func__, err); } -static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file) -{ -} - static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) { struct inode *inode = mapping->host; @@ -312,7 +308,6 @@ static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) } static const struct netfs_read_request_ops ceph_netfs_read_ops = { - .init_rreq = ceph_init_rreq, .is_cache_enabled = ceph_is_cache_enabled, .begin_cache_operation = ceph_begin_cache_operation, .issue_op = ceph_netfs_issue_op, diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 6169659857b3..501da990c259 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -55,7 +55,8 @@ static struct netfs_read_request *netfs_alloc_read_request( INIT_WORK(&rreq->work, netfs_rreq_work); refcount_set(&rreq->usage, 1); __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - ops->init_rreq(rreq, file); + if (ops->init_rreq) + ops->init_rreq(rreq, file); netfs_stat(&netfs_n_rh_rreq); } -- cgit v1.2.3-58-ga151 From d24846a4246b6e61ecbd036880a4adf61681d241 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 20 Jan 2022 12:18:12 +0000 Subject: parisc: pdc_stable: Fix memory leak in pdcs_register_pathentries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kobject_init_and_add() takes reference even when it fails. According to the doc of kobject_init_and_add(): If this function returns an error, kobject_put() must be called to properly clean up the memory associated with the object. Fix memory leak by calling kobject_put(). Fixes: 73f368cf679b ("Kobject: change drivers/parisc/pdc_stable.c to use kobject_init_and_add") Signed-off-by: Miaoqian Lin Signed-off-by: Helge Deller --- drivers/parisc/pdc_stable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index 9513c39719d1..d9e51036a4fa 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -980,8 +980,10 @@ pdcs_register_pathentries(void) entry->kobj.kset = paths_kset; err = kobject_init_and_add(&entry->kobj, &ktype_pdcspath, NULL, "%s", entry->name); - if (err) + if (err) { + kobject_put(&entry->kobj); return err; + } /* kobject is now registered */ write_lock(&entry->rw_lock); -- cgit v1.2.3-58-ga151 From ffa65753c43142f3b803486442813744da71cff2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 21 Jan 2022 22:10:46 -0800 Subject: mm/migrate.c: rework migration_entry_wait() to not take a pageref This fixes the FIXME in migrate_vma_check_page(). Before migrating a page migration code will take a reference and check there are no unexpected page references, failing the migration if there are. When a thread faults on a migration entry it will take a temporary reference to the page to wait for the page to become unlocked signifying the migration entry has been removed. This reference is dropped just prior to waiting on the page lock, however the extra reference can cause migration failures so it is desirable to avoid taking it. As migration code already has a reference to the migrating page an extra reference to wait on PG_locked is unnecessary so long as the reference can't be dropped whilst setting up the wait. When faulting on a migration entry the ptl is taken to check the migration entry. Removing a migration entry also requires the ptl, and migration code won't drop its page reference until after the migration entry has been removed. Therefore retaining the ptl of a migration entry is sufficient to ensure the page has a reference. Reworking migration_entry_wait() to hold the ptl until the wait setup is complete means the extra page reference is no longer needed. [apopple@nvidia.com: v5] Link: https://lkml.kernel.org/r/20211213033848.1973946-1-apopple@nvidia.com Link: https://lkml.kernel.org/r/20211118020754.954425-1-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 ++ mm/filemap.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++ mm/migrate.c | 38 +++------------------ 3 files changed, 97 insertions(+), 34 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 4850cc5bf813..db96e10eb8da 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -40,6 +40,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); +void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, + spinlock_t *ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); void folio_migrate_copy(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 2fd9b2f24025..60866ae711e2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1388,6 +1390,95 @@ repeat: return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } +#ifdef CONFIG_MIGRATION +/** + * migration_entry_wait_on_locked - Wait for a migration entry to be removed + * @entry: migration swap entry. + * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required + * for pte entries, pass NULL for pmd entries. + * @ptl: already locked ptl. This function will drop the lock. + * + * Wait for a migration entry referencing the given page to be removed. This is + * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except + * this can be called without taking a reference on the page. Instead this + * should be called while holding the ptl for the migration entry referencing + * the page. + * + * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock(). + * + * This follows the same logic as folio_wait_bit_common() so see the comments + * there. + */ +void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, + spinlock_t *ptl) +{ + struct wait_page_queue wait_page; + wait_queue_entry_t *wait = &wait_page.wait; + bool thrashing = false; + bool delayacct = false; + unsigned long pflags; + wait_queue_head_t *q; + struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); + + q = folio_waitqueue(folio); + if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) { + if (!folio_test_swapbacked(folio)) { + delayacct_thrashing_start(); + delayacct = true; + } + psi_memstall_enter(&pflags); + thrashing = true; + } + + init_wait(wait); + wait->func = wake_page_function; + wait_page.folio = folio; + wait_page.bit_nr = PG_locked; + wait->flags = 0; + + spin_lock_irq(&q->lock); + folio_set_waiters(folio); + if (!folio_trylock_flag(folio, PG_locked, wait)) + __add_wait_queue_entry_tail(q, wait); + spin_unlock_irq(&q->lock); + + /* + * If a migration entry exists for the page the migration path must hold + * a valid reference to the page, and it must take the ptl to remove the + * migration entry. So the page is valid until the ptl is dropped. + */ + if (ptep) + pte_unmap_unlock(ptep, ptl); + else + spin_unlock(ptl); + + for (;;) { + unsigned int flags; + + set_current_state(TASK_UNINTERRUPTIBLE); + + /* Loop until we've been woken or interrupted */ + flags = smp_load_acquire(&wait->flags); + if (!(flags & WQ_FLAG_WOKEN)) { + if (signal_pending_state(TASK_UNINTERRUPTIBLE, current)) + break; + + io_schedule(); + continue; + } + break; + } + + finish_wait(q, wait); + + if (thrashing) { + if (delayacct) + delayacct_thrashing_end(); + psi_memstall_leave(&pflags); + } +} +#endif + void folio_wait_bit(struct folio *folio, int bit_nr) { folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED); diff --git a/mm/migrate.c b/mm/migrate.c index 18ce840914f0..c7da064b4781 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -291,7 +291,6 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, { pte_t pte; swp_entry_t entry; - struct folio *folio; spin_lock(ptl); pte = *ptep; @@ -302,17 +301,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!is_migration_entry(entry)) goto out; - folio = page_folio(pfn_swap_entry_to_page(entry)); - - /* - * Once page cache replacement of page migration started, page_count - * is zero; but we must not call folio_put_wait_locked() without - * a ref. Use folio_try_get(), and just fault again if it fails. - */ - if (!folio_try_get(folio)) - goto out; - pte_unmap_unlock(ptep, ptl); - folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); + migration_entry_wait_on_locked(entry, ptep, ptl); return; out: pte_unmap_unlock(ptep, ptl); @@ -337,16 +326,11 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - struct folio *folio; ptl = pmd_lock(mm, pmd); if (!is_pmd_migration_entry(*pmd)) goto unlock; - folio = page_folio(pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd))); - if (!folio_try_get(folio)) - goto unlock; - spin_unlock(ptl); - folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE); + migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl); return; unlock: spin_unlock(ptl); @@ -2431,22 +2415,8 @@ static bool migrate_vma_check_page(struct page *page) return false; /* Page from ZONE_DEVICE have one extra reference */ - if (is_zone_device_page(page)) { - /* - * Private page can never be pin as they have no valid pte and - * GUP will fail for those. Yet if there is a pending migration - * a thread might try to wait on the pte migration entry and - * will bump the page reference count. Sadly there is no way to - * differentiate a regular pin from migration wait. Hence to - * avoid 2 racing thread trying to migrate back to CPU to enter - * infinite loop (one stopping migration because the other is - * waiting on pte migration entry). We always return true here. - * - * FIXME proper solution is to rework migration_entry_wait() so - * it does not need to take a reference on page. - */ - return is_device_private_page(page); - } + if (is_zone_device_page(page)) + extra++; /* For file back page */ if (page_mapping(page)) -- cgit v1.2.3-58-ga151 From 3ddd9a808cee7284931312f2f3e854c9617f44b2 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:10:50 -0800 Subject: sysctl: add a new register_sysctl_init() interface Patch series "sysctl: first set of kernel/sysctl cleanups", v2. Finally had time to respin the series of the work we had started last year on cleaning up the kernel/sysct.c kitchen sink. People keeps stuffing their sysctls in that file and this creates a maintenance burden. So this effort is aimed at placing sysctls where they actually belong. I'm going to split patches up into series as there is quite a bit of work. This first set adds register_sysctl_init() for uses of registerting a sysctl on the init path, adds const where missing to a few places, generalizes common values so to be more easy to share, and starts the move of a few kernel/sysctl.c out where they belong. The majority of rework on v2 in this first patch set is 0-day fixes. Eric Biederman's feedback is later addressed in subsequent patch sets. I'll only post the first two patch sets for now. We can address the rest once the first two patch sets get completely reviewed / Acked. This patch (of 9): The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Today though folks heavily rely on tables on kernel/sysctl.c so they can easily just extend this table with their needed sysctls. In order to help users move their sysctls out we need to provide a helper which can be used during code initialization. We special-case the initialization use of register_sysctl() since it *is* safe to fail, given all that sysctls do is provide a dynamic interface to query or modify at runtime an existing variable. So the use case of register_sysctl() on init should *not* stop if the sysctls don't end up getting registered. It would be counter productive to stop boot if a simple sysctl registration failed. Provide a helper for init then, and document the recommended init levels to use for callers of this routine. We will later use this in subsequent patches to start slimming down kernel/sysctl.c tables and moving sysctl registration to the code which actually needs these sysctls. [mcgrof@kernel.org: major commit log and documentation rephrasing also moved to fs/proc/proc_sysctl.c ] Link: https://lkml.kernel.org/r/20211123202347.818157-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202347.818157-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Iurii Zaikin Cc: "Eric W. Biederman" Cc: Peter Zijlstra Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: Andy Shevchenko Cc: Sebastian Reichel Cc: Tetsuo Handa Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Qing Wang Cc: Benjamin LaHaise Cc: Al Viro Cc: Jan Kara Cc: Amir Goldstein Cc: Stephen Kitt Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 33 +++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 3 +++ 2 files changed, 36 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 389e1e42e7d9..55a54973f061 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1383,6 +1384,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab } EXPORT_SYMBOL(register_sysctl); +/** + * __register_sysctl_init() - register sysctl table to path + * @path: path name for sysctl base + * @table: This is the sysctl table that needs to be registered to the path + * @table_name: The name of sysctl table, only used for log printing when + * registration fails + * + * The sysctl interface is used by userspace to query or modify at runtime + * a predefined value set on a variable. These variables however have default + * values pre-set. Code which depends on these variables will always work even + * if register_sysctl() fails. If register_sysctl() fails you'd just loose the + * ability to query or modify the sysctls dynamically at run time. Chances of + * register_sysctl() failing on init are extremely low, and so for both reasons + * this function does not return any error as it is used by initialization code. + * + * Context: Can only be called after your respective sysctl base path has been + * registered. So for instance, most base directories are registered early on + * init before init levels are processed through proc_sys_init() and + * sysctl_init(). + */ +void __init __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name) +{ + struct ctl_table_header *hdr = register_sysctl(path, table); + + if (unlikely(!hdr)) { + pr_err("failed when register_sysctl %s to %s\n", table_name, path); + return; + } + kmemleak_not_leak(hdr); +} + static char *append_path(const char *path, char *pos, const char *name) { int namelen; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 1fa2b69c6fc3..d3ab7969b6b5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -199,6 +199,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +extern void __register_sysctl_init(const char *path, struct ctl_table *table, + const char *table_name); +#define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) void do_sysctl_args(void); extern int pwrsw_enabled; -- cgit v1.2.3-58-ga151 From 78e36f3b0dae586f623c4a37ec5eb5496f5abbe1 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:10:55 -0800 Subject: sysctl: move some boundary constants from sysctl.c to sysctl_vals sysctl has helpers which let us specify boundary values for a min or max int value. Since these are used for a boundary check only they don't change, so move these variables to sysctl_vals to avoid adding duplicate variables. This will help with our cleanup of kernel/sysctl.c. [akpm@linux-foundation.org: update it for "mm/pagealloc: sysctl: change watermark_scale_factor max limit to 30%"] [mcgrof@kernel.org: major rebase] Link: https://lkml.kernel.org/r/20211123202347.818157-3-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 13 ++++++++++--- kernel/sysctl.c | 45 +++++++++++++++++++-------------------------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 55a54973f061..cd8bbf13d0f0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); /* Support for permanently empty directories */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d3ab7969b6b5..47cf70c8eb93 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,9 +38,16 @@ struct ctl_table_header; struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_ZERO ((void *)&sysctl_vals[0]) -#define SYSCTL_ONE ((void *)&sysctl_vals[1]) -#define SYSCTL_INT_MAX ((void *)&sysctl_vals[2]) +#define SYSCTL_NEG_ONE ((void *)&sysctl_vals[0]) +#define SYSCTL_ZERO ((void *)&sysctl_vals[1]) +#define SYSCTL_ONE ((void *)&sysctl_vals[2]) +#define SYSCTL_TWO ((void *)&sysctl_vals[3]) +#define SYSCTL_FOUR ((void *)&sysctl_vals[4]) +#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) +#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) +#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7]) +#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) +#define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) extern const int sysctl_vals[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ef77be575d87..901a9dfe4d62 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -114,16 +114,9 @@ static int sixty = 60; #endif -static int __maybe_unused neg_one = -1; -static int __maybe_unused two = 2; -static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; -static int one_hundred = 100; -static int two_hundred = 200; -static int one_thousand = 1000; -static int three_thousand = 3000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -1964,7 +1957,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, + .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif @@ -2306,7 +2299,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, #endif { @@ -2566,7 +2559,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, + .extra1 = SYSCTL_NEG_ONE, }, #endif #ifdef CONFIG_RT_MUTEXES @@ -2628,7 +2621,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "perf_event_max_stack", @@ -2646,7 +2639,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_thousand, + .extra2 = SYSCTL_ONE_THOUSAND, }, #endif { @@ -2677,7 +2670,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = bpf_unpriv_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "bpf_stats_enabled", @@ -2731,7 +2724,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "panic_on_oom", @@ -2740,7 +2733,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "oom_kill_allocating_task", @@ -2785,7 +2778,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes", @@ -2802,7 +2795,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_bytes", @@ -2842,7 +2835,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two_hundred, + .extra2 = SYSCTL_TWO_HUNDRED, }, #ifdef CONFIG_HUGETLB_PAGE { @@ -2899,7 +2892,7 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &four, + .extra2 = SYSCTL_FOUR, }, #ifdef CONFIG_COMPACTION { @@ -2916,7 +2909,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "extfrag_threshold", @@ -2961,7 +2954,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &three_thousand, + .extra2 = SYSCTL_THREE_THOUSAND, }, { .procname = "percpu_pagelist_high_fraction", @@ -3040,7 +3033,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "min_slab_ratio", @@ -3049,7 +3042,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_slab_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, #endif #ifdef CONFIG_SMP @@ -3339,7 +3332,7 @@ static struct ctl_table fs_table[] = { .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", @@ -3348,7 +3341,7 @@ static struct ctl_table fs_table[] = { .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "suid_dumpable", @@ -3357,7 +3350,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) { -- cgit v1.2.3-58-ga151 From bbe7a10ed83a5fa0b0ff6161ecdc4e65a0e9c993 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:00 -0800 Subject: hung_task: move hung_task sysctl interface to hung_task.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move hung_task sysctl interface to hung_task.c and use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: commit log refresh and fixed 2-3 0day reported compile issues] Link: https://lkml.kernel.org/r/20211123202347.818157-4-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/sysctl.h | 14 +------- kernel/hung_task.c | 81 ++++++++++++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 61 --------------------------------- 3 files changed, 79 insertions(+), 77 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 304f431178fd..c19dd5a2c05c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,20 +7,8 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK - -#ifdef CONFIG_SMP -extern unsigned int sysctl_hung_task_all_cpu_backtrace; -#else -#define sysctl_hung_task_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - -extern int sysctl_hung_task_check_count; -extern unsigned int sysctl_hung_task_panic; +/* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_check_interval_secs; -extern int sysctl_hung_task_warnings; -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9888e2bc8c76..52501e5f7655 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,7 +63,9 @@ static struct task_struct *watchdog_task; * Should we dump all CPUs backtraces in a hung task event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ /* @@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked, MAX_SCHEDULE_TIMEOUT; } +#ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); +static struct ctl_table hung_task_sysctls[] = { +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_ONE, + }, + {} +}; + +static void __init hung_task_sysctl_init(void) +{ + register_sysctl_init("kernel", hung_task_sysctls); +} +#else +#define hung_task_sysctl_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + + static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) @@ -310,6 +384,7 @@ static int __init hung_task_init(void) pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + hung_task_sysctl_init(); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 901a9dfe4d62..a4e00abe6f78 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -134,13 +134,6 @@ static int minolduid; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/* - * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs - * and hung_task_check_interval_secs - */ -#ifdef CONFIG_DETECT_HUNG_TASK -static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); -#endif #ifdef CONFIG_INOTIFY_USER #include @@ -2508,60 +2501,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_HUNG_TASK -#ifdef CONFIG_SMP - { - .procname = "hung_task_all_cpu_backtrace", - .data = &sysctl_hung_task_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ - { - .procname = "hung_task_panic", - .data = &sysctl_hung_task_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_check_interval_secs", - .data = &sysctl_hung_task_check_interval_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_NEG_ONE, - }, -#endif #ifdef CONFIG_RT_MUTEXES { .procname = "max_lock_depth", -- cgit v1.2.3-58-ga151 From dd0693fdf054f2ed37202ed56649ae2cccd29474 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:05 -0800 Subject: watchdog: move watchdog sysctl interface to watchdog.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic of proc sysctl. So, move the watchdog syscl interface to watchdog.c. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. [mcgrof@kernel.org: justify the move on the commit log] Link: https://lkml.kernel.org/r/20211123202347.818157-5-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 96 --------------------------------------------------- kernel/watchdog.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 96 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a4e00abe6f78..55e25996e3cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -103,16 +103,10 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #endif -#ifdef CONFIG_LOCKUP_DETECTOR -#include -#endif #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ -#ifdef CONFIG_LOCKUP_DETECTOR -static int sixty = 60; -#endif static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -2309,96 +2303,6 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = &watchdog_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = &sixty, - }, - { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = &sysctl_hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ad912511a0c0..99afb88d2e85 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -740,6 +740,106 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_unlock(&watchdog_mutex); return err; } + +static const int sixty = 60; + +static struct ctl_table watchdog_sysctls[] = { + { + .procname = "watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog_thresh, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&sixty, + }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + { + .procname = "soft_watchdog", + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif + {} +}; + +static void __init watchdog_sysctl_init(void) +{ + register_sysctl_init("kernel", watchdog_sysctls); +} +#else +#define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) @@ -753,4 +853,5 @@ void __init lockup_detector_init(void) if (!watchdog_nmi_probe()) nmi_watchdog_available = true; lockup_detector_setup(); + watchdog_sysctl_init(); } -- cgit v1.2.3-58-ga151 From f628867da46f8867e1854e43d7200e42ec22eee2 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 21 Jan 2022 22:11:09 -0800 Subject: sysctl: make ngroups_max const ngroups_max is a read-only sysctl entry, reflecting NGROUPS_MAX. Make it const, in the same way as cap_last_cap. Link: https://lkml.kernel.org/r/20211123202347.818157-6-mcgrof@kernel.org Signed-off-by: Stephen Kitt Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 55e25996e3cf..29babf1f0d4a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -125,7 +125,7 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static int maxolduid = 65535; static int minolduid; -static int ngroups_max = NGROUPS_MAX; +static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2291,7 +2291,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "ngroups_max", - .data = &ngroups_max, + .data = (void *)&ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, -- cgit v1.2.3-58-ga151 From d73840ec2f747b860331bbba53677d0ce38fb9c1 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:14 -0800 Subject: sysctl: use const for typically used max/min proc sysctls When proc_dointvec_minmax() or proc_doulongvec_minmax() are used we are using the extra1 and extra2 parameters on the sysctl table only for a min and max boundary, these extra1 and extra2 arguments are then used for read-only operations. So make them const to reflect this. [mcgrof@kernel.org: commit log love] Link: https://lkml.kernel.org/r/20211123202347.818157-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29babf1f0d4a..134c2cb9ed4e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -108,27 +108,26 @@ /* Constants used for minimum and maximum */ -static unsigned long zero_ul; -static unsigned long one_ul = 1; -static unsigned long long_max = LONG_MAX; +static const unsigned long zero_ul; +static const unsigned long one_ul = 1; +static const unsigned long long_max = LONG_MAX; #ifdef CONFIG_PRINTK -static int ten_thousand = 10000; +static const int ten_thousand = 10000; #endif #ifdef CONFIG_PERF_EVENTS -static int six_hundred_forty_kb = 640 * 1024; +static const int six_hundred_forty_kb = 640 * 1024; #endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static int maxolduid = 65535; -static int minolduid; +static const int maxolduid = 65535; +static const int minolduid; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; - #ifdef CONFIG_INOTIFY_USER #include #endif @@ -172,8 +171,8 @@ int sysctl_legacy_va_layout; #endif #ifdef CONFIG_COMPACTION -static int min_extfrag_threshold; -static int max_extfrag_threshold = 1000; +static const int min_extfrag_threshold; +static const int max_extfrag_threshold = 1000; #endif #endif /* CONFIG_SYSCTL */ @@ -2177,8 +2176,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, { .procname = "overflowgid", @@ -2186,8 +2185,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_S390 { @@ -2261,7 +2260,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &ten_thousand, + .extra2 = (void *)&ten_thousand, }, { .procname = "printk_devkmsg", @@ -2473,7 +2472,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &six_hundred_forty_kb, + .extra2 = (void *)&six_hundred_forty_kb, }, { .procname = "perf_event_max_contexts_per_stack", @@ -2629,7 +2628,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, - .extra1 = &one_ul, + .extra1 = (void *)&one_ul, }, { .procname = "dirty_ratio", @@ -2646,7 +2645,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, .proc_handler = dirty_bytes_handler, - .extra1 = &dirty_bytes_min, + .extra1 = (void *)&dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", @@ -2760,8 +2759,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &min_extfrag_threshold, - .extra2 = &max_extfrag_threshold, + .extra1 = (void *)&min_extfrag_threshold, + .extra2 = (void *)&max_extfrag_threshold, }, { .procname = "compact_unevictable_allowed", @@ -3047,8 +3046,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero_ul, - .extra2 = &long_max, + .extra1 = (void *)&zero_ul, + .extra2 = (void *)&long_max, }, { .procname = "nr_open", @@ -3072,8 +3071,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, { .procname = "overflowgid", @@ -3081,8 +3080,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_FILE_LOCKING { -- cgit v1.2.3-58-ga151 From 2452dcb9f7f2bab5b47344148bc23a732be9195c Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:19 -0800 Subject: sysctl: use SYSCTL_ZERO to replace some static int zero uses Use the variable SYSCTL_ZERO to replace some static int boundary variables with a value of 0 (minolduid, min_extfrag_threshold, min_wakeup_granularity_ns). Link: https://lkml.kernel.org/r/20211123202347.818157-8-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 134c2cb9ed4e..a2175a305f10 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -123,7 +123,7 @@ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static const int maxolduid = 65535; -static const int minolduid; +/* minolduid is SYSCTL_ZERO */ static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -171,7 +171,7 @@ int sysctl_legacy_va_layout; #endif #ifdef CONFIG_COMPACTION -static const int min_extfrag_threshold; +/* min_extfrag_threshold is SYSCTL_ZERO */; static const int max_extfrag_threshold = 1000; #endif @@ -2176,7 +2176,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, { @@ -2185,7 +2185,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_S390 @@ -2759,7 +2759,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&min_extfrag_threshold, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&max_extfrag_threshold, }, { @@ -3071,7 +3071,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, { @@ -3080,7 +3080,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_FILE_LOCKING -- cgit v1.2.3-58-ga151 From 86b12b6c5d6b46e64bf2e8080528781032e4bd90 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:24 -0800 Subject: aio: move aio sysctl to aio.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Move aio sysctl to aio.c and use the new register_sysctl_init() to register the sysctl interface for aio. [mcgrof@kernel.org: adjust commit log to justify the move] Link: https://lkml.kernel.org/r/20211123202347.818157-9-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 31 +++++++++++++++++++++++++++++-- include/linux/aio.h | 4 ---- kernel/sysctl.c | 17 ----------------- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index f6f1cbffef9e..4ceba13a7db0 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -220,9 +220,35 @@ struct aio_kiocb { /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); -unsigned long aio_nr; /* current system wide number of aio requests */ -unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +static unsigned long aio_nr; /* current system wide number of aio requests */ +static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ +#ifdef CONFIG_SYSCTL +static struct ctl_table aio_sysctls[] = { + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + {} +}; + +static void __init aio_sysctl_init(void) +{ + register_sysctl_init("fs", aio_sysctls); +} +#else +#define aio_sysctl_init() do { } while (0) +#endif static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; @@ -275,6 +301,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); + aio_sysctl_init(); return 0; } __initcall(aio_setup); diff --git a/include/linux/aio.h b/include/linux/aio.h index b83e68dd006f..86892a4fe7c8 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -20,8 +20,4 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ -/* for sysctl: */ -extern unsigned long aio_nr; -extern unsigned long aio_max_nr; - #endif /* __LINUX__AIO_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a2175a305f10..822555a0ec76 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,7 +20,6 @@ */ #include -#include #include #include #include @@ -3111,22 +3110,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_AIO - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = sizeof(aio_nr), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { .procname = "inotify", -- cgit v1.2.3-58-ga151 From 49a4de75719b6c0f1f375df9908a95cef1e34945 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:29 -0800 Subject: dnotify: move dnotify sysctl to dnotify.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move dnotify sysctls to dnotify.c and use the new register_sysctl_init() to register the sysctl interface. [mcgrof@kernel.org: adjust the commit log to justify the move] Link: https://lkml.kernel.org/r/20211123202347.818157-10-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Acked-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 21 ++++++++++++++++++++- include/linux/dnotify.h | 1 - kernel/sysctl.c | 10 ---------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index d5ebebb034ff..829dd4a61b66 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -19,7 +19,25 @@ #include #include -int dir_notify_enable __read_mostly = 1; +static int dir_notify_enable __read_mostly = 1; +#ifdef CONFIG_SYSCTL +static struct ctl_table dnotify_sysctls[] = { + { + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +static void __init dnotify_sysctl_init(void) +{ + register_sysctl_init("fs", dnotify_sysctls); +} +#else +#define dnotify_sysctl_init() do { } while (0) +#endif static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; @@ -386,6 +404,7 @@ static int __init dnotify_init(void) dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); + dnotify_sysctl_init(); return 0; } diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h index b87c3b85a166..b1d26f9f1c9f 100644 --- a/include/linux/dnotify.h +++ b/include/linux/dnotify.h @@ -29,7 +29,6 @@ struct dnotify_struct { FS_CREATE | FS_RENAME |\ FS_MOVED_FROM | FS_MOVED_TO) -extern int dir_notify_enable; extern void dnotify_flush(struct file *, fl_owner_t); extern int fcntl_dirnotify(int, struct file *, unsigned long); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 822555a0ec76..fb37825cd3cd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -3091,15 +3090,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DNOTIFY - { - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_MMU #ifdef CONFIG_FILE_LOCKING { -- cgit v1.2.3-58-ga151 From c8dd55410ba07de602169e037cbb62e495ad1880 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:34 -0800 Subject: hpet: simplify subdirectory registration with register_sysctl() Patch series "sysctl: second set of kernel/sysctl cleanups", v2. This is the 2nd set of kernel/sysctl.c cleanups. The diff stat should reflect how this is a much better way to deal with theses. Fortunately coccinelle can be used to ensure correctness for most of these and/or future merge conflicts. Note that since this is part of a larger effort to cleanup kernel/sysctl.c I think we have no other option but to go with merging these patches in either Andrew's tree or keep them staged in a separate tree and send a merge request later. Otherwise kernel/sysctl.c will end up becoming a sore spot for the next merge window. This patch (of 8): There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci drivers/char/hpet.c @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211123202422.819032-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: "Eric W. Biederman" Cc: Clemens Ladisch Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: David Airlie Cc: Benjamin Herrenschmidt Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: Jan Kara Cc: Amir Goldstein Cc: Phillip Potter Cc: Al Viro Cc: Julia Lawall Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/hpet.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 4e5431f01450..563dfae3b8da 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -746,26 +746,6 @@ static struct ctl_table hpet_table[] = { {} }; -static struct ctl_table hpet_root[] = { - { - .procname = "hpet", - .maxlen = 0, - .mode = 0555, - .child = hpet_table, - }, - {} -}; - -static struct ctl_table dev_root[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = hpet_root, - }, - {} -}; - static struct ctl_table_header *sysctl_header; /* @@ -1061,7 +1041,7 @@ static int __init hpet_init(void) if (result < 0) return -ENODEV; - sysctl_header = register_sysctl_table(dev_root); + sysctl_header = register_sysctl("dev/hpet", hpet_table); result = acpi_bus_register_driver(&hpet_acpi_driver); if (result < 0) { -- cgit v1.2.3-58-ga151 From e5a1fd997cc2deda1b08d5faae04625de0440a1e Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:39 -0800 Subject: i915: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci PATH @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Acked-by: Jani Nikula Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_perf.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 170bba913c30..e27f3b7cf094 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -4273,26 +4273,6 @@ static struct ctl_table oa_table[] = { {} }; -static struct ctl_table i915_root[] = { - { - .procname = "i915", - .maxlen = 0, - .mode = 0555, - .child = oa_table, - }, - {} -}; - -static struct ctl_table dev_root[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = i915_root, - }, - {} -}; - static void oa_init_supported_formats(struct i915_perf *perf) { struct drm_i915_private *i915 = perf->i915; @@ -4488,7 +4468,7 @@ static int destroy_config(int id, void *p, void *data) int i915_perf_sysctl_register(void) { - sysctl_header = register_sysctl_table(dev_root); + sysctl_header = register_sysctl("dev/i915", oa_table); return 0; } -- cgit v1.2.3-58-ga151 From e99f5e7479110868ac04c5f6593a15c2da61f969 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:44 -0800 Subject: macintosh/mac_hid.c: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci PATH @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/macintosh/mac_hid.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 28b8581b44dd..d8c4d5664145 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -239,33 +239,11 @@ static struct ctl_table mac_hid_files[] = { { } }; -/* dir in /proc/sys/dev */ -static struct ctl_table mac_hid_dir[] = { - { - .procname = "mac_hid", - .maxlen = 0, - .mode = 0555, - .child = mac_hid_files, - }, - { } -}; - -/* /proc/sys/dev itself, in case that is not there yet */ -static struct ctl_table mac_hid_root_dir[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = mac_hid_dir, - }, - { } -}; - static struct ctl_table_header *mac_hid_sysctl_header; static int __init mac_hid_init(void) { - mac_hid_sysctl_header = register_sysctl_table(mac_hid_root_dir); + mac_hid_sysctl_header = register_sysctl("dev/mac_hid", mac_hid_files); if (!mac_hid_sysctl_header) return -ENOMEM; -- cgit v1.2.3-58-ga151 From c42ff46f97c1c25577a84fbfb111710d25a129e0 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:49 -0800 Subject: ocfs2: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci PATH @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-5-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Reviewed-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 16f1bfc407f2..731558a6f27d 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -672,31 +672,8 @@ static struct ctl_table ocfs2_mod_table[] = { { } }; -static struct ctl_table ocfs2_kern_table[] = { - { - .procname = "ocfs2", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_mod_table - }, - { } -}; - -static struct ctl_table ocfs2_root_table[] = { - { - .procname = "fs", - .data = NULL, - .maxlen = 0, - .mode = 0555, - .child = ocfs2_kern_table - }, - { } -}; - static struct ctl_table_header *ocfs2_table_header; - /* * Initialization */ @@ -705,7 +682,7 @@ static int __init ocfs2_stack_glue_init(void) { strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); - ocfs2_table_header = register_sysctl_table(ocfs2_root_table); + ocfs2_table_header = register_sysctl("fs/ocfs2", ocfs2_mod_table); if (!ocfs2_table_header) { printk(KERN_ERR "ocfs2 stack glue: unable to register sysctl\n"); -- cgit v1.2.3-58-ga151 From 04bc883c986d9c8a64fc1f1cc2cbc328c2b2a496 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:11:54 -0800 Subject: test_sysctl: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci lib/test_sysctl.c @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-6-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_sysctl.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 3750323973f4..a5a3d6c27e1f 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -128,26 +128,6 @@ static struct ctl_table test_table[] = { { } }; -static struct ctl_table test_sysctl_table[] = { - { - .procname = "test_sysctl", - .maxlen = 0, - .mode = 0555, - .child = test_table, - }, - { } -}; - -static struct ctl_table test_sysctl_root_table[] = { - { - .procname = "debug", - .maxlen = 0, - .mode = 0555, - .child = test_sysctl_table, - }, - { } -}; - static struct ctl_table_header *test_sysctl_header; static int __init test_sysctl_init(void) @@ -155,7 +135,7 @@ static int __init test_sysctl_init(void) test_data.bitmap_0001 = kzalloc(SYSCTL_TEST_BITMAP_SIZE/8, GFP_KERNEL); if (!test_data.bitmap_0001) return -ENOMEM; - test_sysctl_header = register_sysctl_table(test_sysctl_root_table); + test_sysctl_header = register_sysctl("debug/test_sysctl", test_table); if (!test_sysctl_header) { kfree(test_data.bitmap_0001); return -ENOMEM; -- cgit v1.2.3-58-ga151 From 7b9ad122b52c9839e1f68f16c907990a6ad6f793 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:59 -0800 Subject: inotify: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. Move inotify_user sysctl to inotify_user.c while at it to remove clutter from kernel/sysctl.c. [mcgrof@kernel.org: remember to register fanotify_table] Link: https://lkml.kernel.org/r/YZ5A6iWLb0h3N3RC@bombadil.infradead.org [mcgrof@kernel.org: update commit log to reflect new path we decided to take] Link: https://lkml.kernel.org/r/20211123202422.819032-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 10 +++++++++- fs/notify/inotify/inotify_user.c | 11 ++++++++++- include/linux/fanotify.h | 2 -- include/linux/inotify.h | 3 --- kernel/sysctl.c | 21 --------------------- 5 files changed, 19 insertions(+), 28 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 73a3e939c921..73b1615f9d96 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -59,7 +59,7 @@ static int fanotify_max_queued_events __read_mostly; static long ft_zero = 0; static long ft_int_max = INT_MAX; -struct ctl_table fanotify_table[] = { +static struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], @@ -88,6 +88,13 @@ struct ctl_table fanotify_table[] = { }, { } }; + +static void __init fanotify_sysctls_init(void) +{ + register_sysctl("fs/fanotify", fanotify_table); +} +#else +#define fanotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ /* @@ -1743,6 +1750,7 @@ static int __init fanotify_user_setup(void) init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = FANOTIFY_DEFAULT_MAX_GROUPS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; + fanotify_sysctls_init(); return 0; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 29fca3284bb5..54583f62dc44 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; static long it_zero = 0; static long it_int_max = INT_MAX; -struct ctl_table inotify_table[] = { +static struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], @@ -87,6 +87,14 @@ struct ctl_table inotify_table[] = { }, { } }; + +static void __init inotify_sysctls_init(void) +{ + register_sysctl("fs/inotify", inotify_table); +} + +#else +#define inotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) @@ -849,6 +857,7 @@ static int __init inotify_user_setup(void) inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max; + inotify_sysctls_init(); return 0; } diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 3afdf339d53c..419cadcd7ff5 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -5,8 +5,6 @@ #include #include -extern struct ctl_table fanotify_table[]; /* for sysctl */ - #define FAN_GROUP_FLAG(group, flag) \ ((group)->fanotify_data.flags & (flag)) diff --git a/include/linux/inotify.h b/include/linux/inotify.h index 6a24905f6e1e..8d20caa1b268 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -7,11 +7,8 @@ #ifndef _LINUX_INOTIFY_H #define _LINUX_INOTIFY_H -#include #include -extern struct ctl_table inotify_table[]; /* for sysctl */ - #define ALL_INOTIFY_BITS (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \ IN_MOVED_TO | IN_CREATE | IN_DELETE | \ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fb37825cd3cd..2ee0e76888d8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,13 +126,6 @@ static const int maxolduid = 65535; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -#ifdef CONFIG_INOTIFY_USER -#include -#endif -#ifdef CONFIG_FANOTIFY -#include -#endif - #ifdef CONFIG_PROC_SYSCTL /** @@ -3100,20 +3093,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_INOTIFY_USER - { - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_FANOTIFY - { - .procname = "fanotify", - .mode = 0555, - .child = fanotify_table, - }, -#endif #ifdef CONFIG_EPOLL { .procname = "epoll", -- cgit v1.2.3-58-ga151 From ad8f74315b335c55f3d8bd7c37d21d6c74d1be20 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:04 -0800 Subject: cdrom: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. // pycocci sysctl-subdir-register-sysctl-simplify.cocci PATH @c1@ expression E1; identifier subdir, sysctls; @@ static struct ctl_table subdir[] = { { .procname = E1, .maxlen = 0, .mode = 0555, .child = sysctls, }, { } }; @c2@ identifier c1.subdir; expression E2; identifier base; @@ static struct ctl_table base[] = { { .procname = E2, .maxlen = 0, .mode = 0555, .child = subdir, }, { } }; @c3@ identifier c2.base; identifier header; @@ header = register_sysctl_table(base); @r1 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.subdir, c1.sysctls; @@ -static struct ctl_table subdir[] = { - { - .procname = E1, - .maxlen = 0, - .mode = 0555, - .child = sysctls, - }, - { } -}; @r2 depends on c1 && c2 && c3@ identifier c1.subdir; expression c2.E2; identifier c2.base; @@ -static struct ctl_table base[] = { - { - .procname = E2, - .maxlen = 0, - .mode = 0555, - .child = subdir, - }, - { } -}; @initialize:python@ @@ def make_my_fresh_expression(s1, s2): return '"' + s1.strip('"') + "/" + s2.strip('"') + '"' @r3 depends on c1 && c2 && c3@ expression c1.E1; identifier c1.sysctls; expression c2.E2; identifier c2.base; identifier c3.header; fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) }; @@ header = -register_sysctl_table(base); +register_sysctl(E3, sysctls); Generated-by: Coccinelle SmPL Link: https://lkml.kernel.org/r/20211123202422.819032-8-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/cdrom/cdrom.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 9877e413fce3..1b57d4666e43 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3691,27 +3691,6 @@ static struct ctl_table cdrom_table[] = { }, { } }; - -static struct ctl_table cdrom_cdrom_table[] = { - { - .procname = "cdrom", - .maxlen = 0, - .mode = 0555, - .child = cdrom_table, - }, - { } -}; - -/* Make sure that /proc/sys/dev is there */ -static struct ctl_table cdrom_root_table[] = { - { - .procname = "dev", - .maxlen = 0, - .mode = 0555, - .child = cdrom_cdrom_table, - }, - { } -}; static struct ctl_table_header *cdrom_sysctl_header; static void cdrom_sysctl_register(void) @@ -3721,7 +3700,7 @@ static void cdrom_sysctl_register(void) if (!atomic_add_unless(&initialized, 1, 1)) return; - cdrom_sysctl_header = register_sysctl_table(cdrom_root_table); + cdrom_sysctl_header = register_sysctl("dev/cdrom", cdrom_table); /* set the defaults */ cdrom_sysctl_settings.autoclose = autoclose; -- cgit v1.2.3-58-ga151 From a8f5de894f76f1c73f4a068d04897a5e2f873825 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:09 -0800 Subject: eventpoll: simplify sysctl declaration with register_sysctl() The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the epoll_table sysctl to fs/eventpoll.c and use register_sysctl(). Link: https://lkml.kernel.org/r/20211123202422.819032-9-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 10 +++++++++- include/linux/poll.h | 2 -- include/linux/sysctl.h | 1 - kernel/sysctl.c | 7 ------- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 06f4c5ae1451..e2daa940ebce 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -307,7 +307,7 @@ static void unlist_file(struct epitems_head *head) static long long_zero; static long long_max = LONG_MAX; -struct ctl_table epoll_table[] = { +static struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, @@ -319,6 +319,13 @@ struct ctl_table epoll_table[] = { }, { } }; + +static void __init epoll_sysctls_init(void) +{ + register_sysctl("fs/epoll", epoll_table); +} +#else +#define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; @@ -2378,6 +2385,7 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); diff --git a/include/linux/poll.h b/include/linux/poll.h index 1cdc32b1f1b0..a9e0e1c2d1f2 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -8,12 +8,10 @@ #include #include #include -#include #include #include #include -extern struct ctl_table epoll_table[]; /* for sysctl */ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ #ifdef __clang__ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 47cf70c8eb93..6dd0f277f844 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -219,7 +219,6 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; extern struct ctl_table random_table[]; extern struct ctl_table firmware_config_table[]; -extern struct ctl_table epoll_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2ee0e76888d8..09fa72299f18 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3093,13 +3093,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif #endif { .procname = "protected_symlinks", -- cgit v1.2.3-58-ga151 From 6aad36d421d8bfe156508fa4edfe67827234cf0f Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:13 -0800 Subject: firmware_loader: move firmware sysctl to its own files Patch series "sysctl: 3rd set of kernel/sysctl cleanups", v2. This is the third set of patches to help address cleaning the kitchen seink in kernel/sysctl.c and to move sysctls away to where they are actually implemented / used. This patch (of 8): kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the firmware configuration sysctl table to the only place where it is used, and make it clear that if sysctls are disabled this is not used. [akpm@linux-foundation.org: export register_firmware_config_sysctl and unregister_firmware_config_sysctl to modules] [akpm@linux-foundation.org: use EXPORT_SYMBOL_NS_GPL instead] [sfr@canb.auug.org.au: fix that so it compiles] Link: https://lkml.kernel.org/r/20211201160626.401d828d@canb.auug.org.au [mcgrof@kernel.org: major commit log update to justify the move] Link: https://lkml.kernel.org/r/20211124231435.1445213-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211124231435.1445213-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Signed-off-by: Stephen Rothwell Cc: Kees Cook Cc: Iurii Zaikin Cc: Eric Biederman Cc: Stephen Kitt Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: "Theodore Ts'o" Cc: Al Viro Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt (VMware) Cc: John Ogness Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Martin K. Petersen Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Suren Baghdasaryan Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/firmware_loader/fallback.c | 7 ++++++- drivers/base/firmware_loader/fallback.h | 11 +++++++++++ drivers/base/firmware_loader/fallback_table.c | 25 +++++++++++++++++++++++-- include/linux/sysctl.h | 1 - kernel/sysctl.c | 7 ------- 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c index d7d63c1aa993..4afb0e9312c0 100644 --- a/drivers/base/firmware_loader/fallback.c +++ b/drivers/base/firmware_loader/fallback.c @@ -199,11 +199,16 @@ static struct class firmware_class = { int register_sysfs_loader(void) { - return class_register(&firmware_class); + int ret = class_register(&firmware_class); + + if (ret != 0) + return ret; + return register_firmware_config_sysctl(); } void unregister_sysfs_loader(void) { + unregister_firmware_config_sysctl(); class_unregister(&firmware_class); } diff --git a/drivers/base/firmware_loader/fallback.h b/drivers/base/firmware_loader/fallback.h index 3af7205b302f..9f3055d3b4ca 100644 --- a/drivers/base/firmware_loader/fallback.h +++ b/drivers/base/firmware_loader/fallback.h @@ -42,6 +42,17 @@ void fw_fallback_set_default_timeout(void); int register_sysfs_loader(void); void unregister_sysfs_loader(void); +#ifdef CONFIG_SYSCTL +extern int register_firmware_config_sysctl(void); +extern void unregister_firmware_config_sysctl(void); +#else +static inline int register_firmware_config_sysctl(void) +{ + return 0; +} +static inline void unregister_firmware_config_sysctl(void) { } +#endif /* CONFIG_SYSCTL */ + #else /* CONFIG_FW_LOADER_USER_HELPER */ static inline int firmware_fallback_sysfs(struct firmware *fw, const char *name, struct device *device, diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index 46a731dede6f..e5ac098d0742 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,7 @@ struct firmware_fallback_config fw_fallback_config = { EXPORT_SYMBOL_NS_GPL(fw_fallback_config, FIRMWARE_LOADER_PRIVATE); #ifdef CONFIG_SYSCTL -struct ctl_table firmware_config_table[] = { +static struct ctl_table firmware_config_table[] = { { .procname = "force_sysfs_fallback", .data = &fw_fallback_config.force_sysfs_fallback, @@ -45,4 +46,24 @@ struct ctl_table firmware_config_table[] = { }, { } }; -#endif + +static struct ctl_table_header *firmware_config_sysct_table_header; +int register_firmware_config_sysctl(void) +{ + firmware_config_sysct_table_header = + register_sysctl("kernel/firmware_config", + firmware_config_table); + if (!firmware_config_sysct_table_header) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_NS_GPL(register_firmware_config_sysctl, FIRMWARE_LOADER_PRIVATE); + +void unregister_firmware_config_sysctl(void) +{ + unregister_sysctl_table(firmware_config_sysct_table_header); + firmware_config_sysct_table_header = NULL; +} +EXPORT_SYMBOL_NS_GPL(unregister_firmware_config_sysctl, FIRMWARE_LOADER_PRIVATE); + +#endif /* CONFIG_SYSCTL */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 6dd0f277f844..3985e9c80155 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -218,7 +218,6 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; extern struct ctl_table random_table[]; -extern struct ctl_table firmware_config_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09fa72299f18..5fc3ae16e995 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2154,13 +2154,6 @@ static struct ctl_table kern_table[] = { .mode = 0555, .child = usermodehelper_table, }, -#ifdef CONFIG_FW_LOADER_USER_HELPER - { - .procname = "firmware_config", - .mode = 0555, - .child = firmware_config_table, - }, -#endif { .procname = "overflowuid", .data = &overflowuid, -- cgit v1.2.3-58-ga151 From 5475e8f03c80bbce7b43a57d861f5acc44a60b22 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:18 -0800 Subject: random: move the random sysctl declarations to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the random sysctls to their own file and use register_sysctl_init(). [mcgrof@kernel.org: commit log update to justify the move] Link: https://lkml.kernel.org/r/20211124231435.1445213-3-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 14 ++++++++++++-- include/linux/sysctl.h | 1 - kernel/sysctl.c | 5 ----- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index b411182df6f6..68613f0b6887 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1992,8 +1992,7 @@ static int proc_do_entropy(struct ctl_table *table, int write, void *buffer, } static int sysctl_poolsize = POOL_BITS; -extern struct ctl_table random_table[]; -struct ctl_table random_table[] = { +static struct ctl_table random_table[] = { { .procname = "poolsize", .data = &sysctl_poolsize, @@ -2055,6 +2054,17 @@ struct ctl_table random_table[] = { #endif { } }; + +/* + * rand_initialize() is called before sysctl_init(), + * so we cannot call register_sysctl_init() in rand_initialize() + */ +static int __init random_sysctls_init(void) +{ + register_sysctl_init("kernel/random", random_table); + return 0; +} +device_initcall(random_sysctls_init); #endif /* CONFIG_SYSCTL */ struct batched_entropy { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 3985e9c80155..fce05a060bc5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -217,7 +217,6 @@ extern int unaligned_dump_stack; extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; -extern struct ctl_table random_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5fc3ae16e995..67a5b04f2f84 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2144,11 +2144,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_max_threads, }, - { - .procname = "random", - .mode = 0555, - .child = random_table, - }, { .procname = "usermodehelper", .mode = 0555, -- cgit v1.2.3-58-ga151 From ee9efac48a082904d17a20131aa73d82f058cdd6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:23 -0800 Subject: sysctl: add helper to register a sysctl mount point The way to create a subdirectory on top of sysctl_mount_point is a bit obscure, and *why* we do that even so more. Provide a helper which makes it clear why we do this. [akpm@linux-foundation.org: export register_sysctl_mount_point() to modules] Link: https://lkml.kernel.org/r/20211124231435.1445213-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Suggested-by: "Eric W. Biederman" Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 14 ++++++++++++++ include/linux/sysctl.h | 7 +++++++ 2 files changed, 21 insertions(+) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index cd8bbf13d0f0..6bb2d9a3513d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -35,6 +35,20 @@ struct ctl_table sysctl_mount_point[] = { { } }; +/** + * register_sysctl_mount_point() - registers a sysctl mount point + * @path: path for the mount point + * + * Used to create a permanently empty directory to serve as mount point. + * There are some subtle but important permission checks this allows in the + * case of unprivileged mounts. + */ +struct ctl_table_header *register_sysctl_mount_point(const char *path) +{ + return register_sysctl(path, sysctl_mount_point); +} +EXPORT_SYMBOL(register_sysctl_mount_point); + static bool is_empty_dir(struct ctl_table_header *head) { return head->ctl_table[0].child == sysctl_mount_point; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index fce05a060bc5..746c098a6ff5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -209,6 +209,8 @@ extern int sysctl_init(void); extern void __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name); #define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) +extern struct ctl_table_header *register_sysctl_mount_point(const char *path); + void do_sysctl_args(void); extern int pwrsw_enabled; @@ -224,6 +226,11 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * return NULL; } +static inline struct sysctl_header *register_sysctl_mount_point(const char *path) +{ + return NULL; +} + static inline struct ctl_table_header *register_sysctl_paths( const struct ctl_path *path, struct ctl_table *table) { -- cgit v1.2.3-58-ga151 From 3ba442d5331ff4d4339faf4986d0841386196f92 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:28 -0800 Subject: fs: move binfmt_misc sysctl to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. This moves the binfmt_misc sysctl to its own file to help remove clutter from kernel/sysctl.c. Link: https://lkml.kernel.org/r/20211124231435.1445213-5-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 6 +++++- kernel/sysctl.c | 7 ------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e1eae7ea823a..ddea6acbddde 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -822,7 +822,11 @@ static int __init init_misc_binfmt(void) int err = register_filesystem(&bm_fs_type); if (!err) insert_binfmt(&misc_format); - return err; + if (!register_sysctl_mount_point("fs/binfmt_misc")) { + pr_warn("Failed to create fs/binfmt_misc sysctl mount point"); + return -ENOMEM; + } + return 0; } static void __exit exit_misc_binfmt(void) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 67a5b04f2f84..fb3cc7a1d4ad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3127,13 +3127,6 @@ static struct ctl_table fs_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .procname = "binfmt_misc", - .mode = 0555, - .child = sysctl_mount_point, - }, -#endif { .procname = "pipe-max-size", .data = &pipe_max_size, -- cgit v1.2.3-58-ga151 From faaa357a55e03490fb280ac211be2298e635b220 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:33 -0800 Subject: printk: move printk sysctl to printk/sysctl.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move printk sysctl from kernel/sysctl.c to kernel/printk/sysctl.c. Use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: fixed compile issues when PRINTK is not set, commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-6-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/Makefile | 5 ++- kernel/printk/internal.h | 6 ++++ kernel/printk/printk.c | 1 + kernel/printk/sysctl.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 68 -------------------------------------- 5 files changed, 96 insertions(+), 69 deletions(-) create mode 100644 kernel/printk/sysctl.c diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index d118739874c0..f5b388e810b9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,5 +2,8 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o -obj-$(CONFIG_PRINTK) += printk_ringbuffer.o obj-$(CONFIG_PRINTK_INDEX) += index.o + +obj-$(CONFIG_PRINTK) += printk_support.o +printk_support-y := printk_ringbuffer.o +printk_support-$(CONFIG_SYSCTL) += sysctl.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 9f3ed2fdb721..6b1c4b399845 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -4,6 +4,12 @@ */ #include +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) +void __init printk_sysctl_init(void); +#else +#define printk_sysctl_init() do { } while (0) +#endif + #ifdef CONFIG_PRINTK /* Flags for a single printk record. */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 155229f0cf0f..363a493bded8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3211,6 +3211,7 @@ static int __init printk_late_init(void) ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); + printk_sysctl_init(); return 0; } late_initcall(printk_late_init); diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c new file mode 100644 index 000000000000..653ae04aab7f --- /dev/null +++ b/kernel/printk/sysctl.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sysctl.c: General linux system control interface + */ + +#include +#include +#include +#include +#include "internal.h" + +static const int ten_thousand = 10000; + +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} + +static struct ctl_table printk_sysctls[] = { + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&ten_thousand, + }, + { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + {} +}; + +void __init printk_sysctl_init(void) +{ + register_sysctl_init("kernel", printk_sysctls); +} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fb3cc7a1d4ad..509f782483db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -908,17 +908,6 @@ static int proc_taint(struct ctl_table *table, int write, return err; } -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} -#endif - /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value @@ -2210,63 +2199,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, -#if defined CONFIG_PRINTK - { - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_delay", - .data = &printk_delay_msec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&ten_thousand, - }, - { - .procname = "printk_devkmsg", - .data = devkmsg_log_str, - .maxlen = DEVKMSG_STR_MAX_SIZE, - .mode = 0644, - .proc_handler = devkmsg_sysctl_set_loglvl, - }, - { - .procname = "dmesg_restrict", - .data = &dmesg_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "kptr_restrict", - .data = &kptr_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, -#endif { .procname = "ngroups_max", .data = (void *)&ngroups_max, -- cgit v1.2.3-58-ga151 From 26d1c80fd61e59d5e2eb6fda00e0148a6704ddeb Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:38 -0800 Subject: scsi/sg: move sg-big-buff sysctl to scsi/sg.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the sg-big-buff sysctl from kernel/sysctl.c to drivers/scsi/sg.c and use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/scsi/sg.c | 35 ++++++++++++++++++++++++++++++++++- include/scsi/sg.h | 4 ---- kernel/sysctl.c | 12 ------------ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ad12b3261845..6b43e97bd417 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -77,7 +77,7 @@ static int sg_proc_init(void); #define SG_DEFAULT_TIMEOUT mult_frac(SG_DEFAULT_TIMEOUT_USER, HZ, USER_HZ) -int sg_big_buff = SG_DEF_RESERVED_SIZE; +static int sg_big_buff = SG_DEF_RESERVED_SIZE; /* N.B. This variable is readable and writeable via /proc/scsi/sg/def_reserved_size . Each time sg_open() is called a buffer of this size (or less if there is not enough memory) will be reserved @@ -1634,6 +1634,37 @@ MODULE_PARM_DESC(scatter_elem_sz, "scatter gather element " MODULE_PARM_DESC(def_reserved_size, "size of buffer reserved for each fd"); MODULE_PARM_DESC(allow_dio, "allow direct I/O (default: 0 (disallow))"); +#ifdef CONFIG_SYSCTL +#include + +static struct ctl_table sg_sysctls[] = { + { + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + {} +}; + +static struct ctl_table_header *hdr; +static void register_sg_sysctls(void) +{ + if (!hdr) + hdr = register_sysctl("kernel", sg_sysctls); +} + +static void unregister_sg_sysctls(void) +{ + if (hdr) + unregister_sysctl_table(hdr); +} +#else +#define register_sg_sysctls() do { } while (0) +#define unregister_sg_sysctls() do { } while (0) +#endif /* CONFIG_SYSCTL */ + static int __init init_sg(void) { @@ -1666,6 +1697,7 @@ init_sg(void) return 0; } class_destroy(sg_sysfs_class); + register_sg_sysctls(); err_out: unregister_chrdev_region(MKDEV(SCSI_GENERIC_MAJOR, 0), SG_MAX_DEVS); return rc; @@ -1674,6 +1706,7 @@ err_out: static void __exit exit_sg(void) { + unregister_sg_sysctls(); #ifdef CONFIG_SCSI_PROC_FS remove_proc_subtree("scsi/sg", NULL); #endif /* CONFIG_SCSI_PROC_FS */ diff --git a/include/scsi/sg.h b/include/scsi/sg.h index 843cefb8efce..068e35d36557 100644 --- a/include/scsi/sg.h +++ b/include/scsi/sg.h @@ -29,10 +29,6 @@ * For utility and test programs see: http://sg.danny.cz/sg/sg3_utils.html */ -#ifdef __KERNEL__ -extern int sg_big_buff; /* for sysctl */ -#endif - typedef struct sg_iovec /* same structure as used by readv() Linux system */ { /* call. It defines one scatter-gather element. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 509f782483db..b0cced3808d4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,9 +95,6 @@ #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) #include #endif -#ifdef CONFIG_CHR_DEV_SG -#include -#endif #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #endif @@ -2090,15 +2087,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_CHR_DEV_SG - { - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_BSD_PROCESS_ACCT { .procname = "acct", -- cgit v1.2.3-58-ga151 From 0df8bdd5e3b3e557ce2c2575fce0c64c5dd1045a Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:43 -0800 Subject: stackleak: move stack_erasing sysctl to stackleak.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the stack_erasing sysctl from kernel/sysctl.c to kernel/stackleak.c and use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-8-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackleak.h | 5 ----- kernel/stackleak.c | 26 ++++++++++++++++++++++++-- kernel/sysctl.c | 14 -------------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index a59db2f08e76..ccaab2043fcd 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -23,11 +23,6 @@ static inline void stackleak_task_init(struct task_struct *t) # endif } -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -int stack_erasing_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif - #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ static inline void stackleak_task_init(struct task_struct *t) { } #endif diff --git a/kernel/stackleak.c b/kernel/stackleak.c index ce161a8e8d97..66b8af394e58 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -16,11 +16,13 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #include +#include static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); -int stack_erasing_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SYSCTL +static int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); @@ -42,6 +44,26 @@ int stack_erasing_sysctl(struct ctl_table *table, int write, state ? "enabled" : "disabled"); return ret; } +static struct ctl_table stackleak_sysctls[] = { + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init stackleak_sysctls_init(void) +{ + register_sysctl_init("kernel", stackleak_sysctls); + return 0; +} +late_initcall(stackleak_sysctls_init); +#endif /* CONFIG_SYSCTL */ #define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) #else diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b0cced3808d4..3e06dafdd2c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,9 +95,6 @@ #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) #include #endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -#include -#endif #if defined(CONFIG_SYSCTL) @@ -2442,17 +2439,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, -#endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE - { - .procname = "stack_erasing", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = stack_erasing_sysctl, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { } }; -- cgit v1.2.3-58-ga151 From b1f2aff888af54a057c2c3c0d88a13ef5d37b52a Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:48 -0800 Subject: sysctl: share unsigned long const values Provide a way to share unsigned long values. This will allow others to not have to re-invent these values. Link: https://lkml.kernel.org/r/20211124231435.1445213-9-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 3 +++ include/linux/sysctl.h | 6 ++++++ kernel/sysctl.c | 9 +++------ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 6bb2d9a3513d..2eb5987091ea 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -29,6 +29,9 @@ static const struct inode_operations proc_sys_dir_operations; const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); +const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; +EXPORT_SYMBOL_GPL(sysctl_long_vals); + /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 746c098a6ff5..2de6d20d191b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -51,6 +51,12 @@ struct ctl_dir; extern const int sysctl_vals[]; +#define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) +#define SYSCTL_LONG_ONE ((void *)&sysctl_long_vals[1]) +#define SYSCTL_LONG_MAX ((void *)&sysctl_long_vals[2]) + +extern const unsigned long sysctl_long_vals[]; + typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e06dafdd2c3..1b8a1cb8aac9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -100,9 +100,6 @@ /* Constants used for minimum and maximum */ -static const unsigned long zero_ul; -static const unsigned long one_ul = 1; -static const unsigned long long_max = LONG_MAX; #ifdef CONFIG_PRINTK static const int ten_thousand = 10000; #endif @@ -2513,7 +2510,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, - .extra1 = (void *)&one_ul, + .extra1 = SYSCTL_LONG_ONE, }, { .procname = "dirty_ratio", @@ -2931,8 +2928,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = (void *)&zero_ul, - .extra2 = (void *)&long_max, + .extra1 = SYSCTL_LONG_ZERO, + .extra2 = SYSCTL_LONG_MAX, }, { .procname = "nr_open", -- cgit v1.2.3-58-ga151 From 1d67fe585049d3e2448b997af78c68cbf90ada09 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:52 -0800 Subject: fs: move inode sysctls to its own file Patch series "sysctl: 4th set of kernel/sysctl cleanups". This is slimming down the fs uses of kernel/sysctl.c to the point that the next step is to just get rid of the fs base directory for it and move that elsehwere, so that next patch series starts dealing with that to demo how we can end up cleaning up a full base directory from kernel/sysctl.c, one at a time. This patch (of 9): kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the inode sysctls to its own file. Since we are no longer using this outside of fs/ remove the extern declaration of its respective proc helper. We use early_initcall() as it is the earliest we can use. [arnd@arndb.de: avoid unused-variable warning] Link: https://lkml.kernel.org/r/20211203190123.874239-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: Eric Biederman Cc: Stephen Kitt Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Andy Shevchenko Cc: Jeff Layton Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 39 ++++++++++++++++++++++++++++++++------- include/linux/fs.h | 3 --- kernel/sysctl.c | 14 -------------- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 980e7b7a5460..63324df6fa27 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -67,11 +67,6 @@ const struct address_space_operations empty_aops = { }; EXPORT_SYMBOL(empty_aops); -/* - * Statistics gathering.. - */ -struct inodes_stat_t inodes_stat; - static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); @@ -106,13 +101,43 @@ long get_nr_dirty_inodes(void) * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL -int proc_nr_inodes(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +/* + * Statistics gathering.. + */ +static struct inodes_stat_t inodes_stat; + +static int proc_nr_inodes(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } + +static struct ctl_table inodes_sysctls[] = { + { + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { } +}; + +static int __init init_fs_inode_sysctls(void) +{ + register_sysctl_init("fs", inodes_sysctls); + return 0; +} +early_initcall(init_fs_inode_sysctls); #endif static int no_open(struct inode *inode, struct file *file) diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6d..044de67c8167 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -82,7 +82,6 @@ extern void __init files_maxfiles_init(void); extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; -extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; @@ -3537,8 +3536,6 @@ int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_nr_inodes(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1b8a1cb8aac9..402c9d3246bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2901,20 +2901,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, { .procname = "file-nr", .data = &files_stat, -- cgit v1.2.3-58-ga151 From 204d5a24e15562b2816825c0f9b49d26814b77be Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:56 -0800 Subject: fs: move fs stat sysctls to file_table.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. We can create the sysctl dynamically on early init for fs stat to help with this clutter. This dusts off the fs stat syctls knobs and puts them into where they are declared. Link: https://lkml.kernel.org/r/20211129205548.605569-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 47 +++++++++++++++++++++++++++++++++++++++-------- include/linux/fs.h | 3 --- kernel/sysctl.c | 25 ------------------------- 3 files changed, 39 insertions(+), 36 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 45437f8e1003..57edef16dce4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -33,7 +33,7 @@ #include "internal.h" /* sysctl tunables... */ -struct files_stat_struct files_stat = { +static struct files_stat_struct files_stat = { .max_files = NR_FILE }; @@ -75,22 +75,53 @@ unsigned long get_max_files(void) } EXPORT_SYMBOL_GPL(get_max_files); +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) + /* * Handle nr_files sysctl */ -#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_nr_files(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -#else -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) + +static struct ctl_table fs_stat_sysctls[] = { + { + .procname = "file-nr", + .data = &files_stat, + .maxlen = sizeof(files_stat), + .mode = 0444, + .proc_handler = proc_nr_files, + }, + { + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(files_stat.max_files), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = SYSCTL_LONG_ZERO, + .extra2 = SYSCTL_LONG_MAX, + }, + { + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, + }, + { } +}; + +static int __init init_fs_stat_sysctls(void) { - return -ENOSYS; + register_sysctl_init("fs", fs_stat_sysctls); + return 0; } +fs_initcall(init_fs_stat_sysctls); #endif static struct file *__alloc_file(int flags, const struct cred *cred) diff --git a/include/linux/fs.h b/include/linux/fs.h index 044de67c8167..1e6761966120 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -79,7 +79,6 @@ extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); -extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern int leases_enable, lease_break_time; @@ -3532,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 402c9d3246bf..d2c411b15854 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2901,31 +2901,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = sizeof(files_stat), - .mode = 0444, - .proc_handler = proc_nr_files, - }, - { - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(files_stat.max_files), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = SYSCTL_LONG_ZERO, - .extra2 = SYSCTL_LONG_MAX, - }, - { - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, { .procname = "dentry-state", .data = &dentry_stat, -- cgit v1.2.3-58-ga151 From c8c0c239d5ab1e3e8d2bb0453ce642fe2c6357ec Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:59 -0800 Subject: fs: move dcache sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the dcache sysctl clutter out of kernel/sysctl.c. This is a small one-off entry, perhaps later we can simplify this representation, but for now we use the helpers we have. We won't know how we can simplify this further untl we're fully done with the cleanup. [arnd@arndb.de: avoid unused-function warning] Link: https://lkml.kernel.org/r/20211203190123.874239-2-arnd@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 37 +++++++++++++++++++++++++++++++------ include/linux/dcache.h | 10 ---------- include/linux/fs.h | 2 -- kernel/sysctl.c | 7 ------- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index cf871a81f4fd..c84269c6e8bf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -115,10 +115,13 @@ static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } - -/* Statistics gathering. */ -struct dentry_stat_t dentry_stat = { - .age_limit = 45, +struct dentry_stat_t { + long nr_dentry; + long nr_unused; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long nr_negative; /* # of unused negative dentries */ + long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); @@ -126,6 +129,10 @@ static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +/* Statistics gathering. */ +static struct dentry_stat_t dentry_stat = { + .age_limit = 45, +}; /* * Here we resort to our own counters instead of using generic per-cpu counters @@ -167,14 +174,32 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } + +static struct ctl_table fs_dcache_sysctls[] = { + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { } +}; + +static int __init init_fs_dcache_sysctls(void) +{ + register_sysctl_init("fs", fs_dcache_sysctls); + return 0; +} +fs_initcall(init_fs_dcache_sysctls); #endif /* diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9e23d33bb6f1..f5bba51480b2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -61,16 +61,6 @@ extern const struct qstr empty_name; extern const struct qstr slash_name; extern const struct qstr dotdot_name; -struct dentry_stat_t { - long nr_dentry; - long nr_unused; - long age_limit; /* age in seconds */ - long want_pages; /* pages requested by system */ - long nr_negative; /* # of unused negative dentries */ - long dummy; /* Reserved for future use */ -}; -extern struct dentry_stat_t dentry_stat; - /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the diff --git a/include/linux/fs.h b/include/linux/fs.h index 1e6761966120..9b856d5da9e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3531,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_dentry(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d2c411b15854..4d182fb5c22e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2901,13 +2901,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_dentry, - }, { .procname = "overflowuid", .data = &fs_overflowuid, -- cgit v1.2.3-58-ga151 From 54771613e8a7dbbba2a205ddf1b33e25a290b3fd Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:03 -0800 Subject: sysctl: move maxolduid as a sysctl specific const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The maxolduid value is only shared for sysctl purposes for use on a max range. Just stuff this into our shared const array. [akpm@linux-foundation.org: fix sysctl_vals[], per Mickaël] Link: https://lkml.kernel.org/r/20211129205548.605569-5-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Mickaël Salaün Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 12 ++++-------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 2eb5987091ea..b9d53b53d769 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX, 65535 }; EXPORT_SYMBOL(sysctl_vals); const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 2de6d20d191b..bb921eb8a02d 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -49,6 +49,9 @@ struct ctl_dir; #define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) #define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) +/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +#define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) + extern const int sysctl_vals[]; #define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4d182fb5c22e..5bbb4c59dd1a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -110,10 +110,6 @@ static const int six_hundred_forty_kb = 640 * 1024; /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static const int maxolduid = 65535; -/* minolduid is SYSCTL_ZERO */ - static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2127,7 +2123,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, { .procname = "overflowgid", @@ -2136,7 +2132,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, #ifdef CONFIG_S390 { @@ -2908,7 +2904,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, { .procname = "overflowgid", @@ -2917,7 +2913,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, #ifdef CONFIG_FILE_LOCKING { -- cgit v1.2.3-58-ga151 From d1d8ac9edf10e83f16d13f009439585a1f93ccb2 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:06 -0800 Subject: fs: move shared sysctls to fs/sysctls.c To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move sysctls which are shared between filesystems into a common file outside of kernel/sysctl.c. Link: https://lkml.kernel.org/r/20211129205548.605569-6-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 1 + fs/sysctls.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 18 ------------------ 3 files changed, 39 insertions(+), 18 deletions(-) create mode 100644 fs/sysctls.c diff --git a/fs/Makefile b/fs/Makefile index 84c5e4cdfee5..ea8770d124da 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,6 +28,7 @@ obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o +obj-$(CONFIG_SYSCTL) += sysctls.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/fs/sysctls.c b/fs/sysctls.c new file mode 100644 index 000000000000..54216cd1ecd7 --- /dev/null +++ b/fs/sysctls.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * /proc/sys/fs shared sysctls + * + * These sysctls are shared between different filesystems. + */ +#include +#include + +static struct ctl_table fs_shared_sysctls[] = { + { + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { } +}; + +static int __init init_fs_shared_sysctls(void) +{ + register_sysctl_init("fs", fs_shared_sysctls); + return 0; +} + +early_initcall(init_fs_shared_sysctls); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5bbb4c59dd1a..6d9d2001b790 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2897,24 +2897,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, - { - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, #ifdef CONFIG_FILE_LOCKING { .procname = "leases-enable", -- cgit v1.2.3-58-ga151 From dd81faa88340a1fe8cd81c8ecbadd8e95c58549c Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:10 -0800 Subject: fs: move locking sysctls where they are used kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. The locking fs sysctls are only used on fs/locks.c, so move them there. Link: https://lkml.kernel.org/r/20211129205548.605569-7-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 34 ++++++++++++++++++++++++++++++++-- include/linux/fs.h | 4 ---- kernel/sysctl.c | 20 -------------------- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 0fca9d680978..8c6df10cd9ed 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -62,6 +62,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -88,8 +89,37 @@ static int target_leasetype(struct file_lock *fl) return fl->fl_type; } -int leases_enable = 1; -int lease_break_time = 45; +static int leases_enable = 1; +static int lease_break_time = 45; + +#ifdef CONFIG_SYSCTL +static struct ctl_table locks_sysctls[] = { + { + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_MMU + { + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_MMU */ + {} +}; + +static int __init init_fs_locks_sysctls(void) +{ + register_sysctl_init("fs", locks_sysctls); + return 0; +} +early_initcall(init_fs_locks_sysctls); +#endif /* CONFIG_SYSCTL */ /* * The global file_lock_list is only used for displaying /proc/locks, so we diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b856d5da9e2..0e08c3dd8f75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -82,10 +82,6 @@ extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern int leases_enable, lease_break_time; -extern int sysctl_protected_symlinks; -extern int sysctl_protected_hardlinks; -extern int sysctl_protected_fifos; -extern int sysctl_protected_regular; typedef __kernel_rwf_t rwf_t; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6d9d2001b790..073948e9d165 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2897,26 +2897,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { -#ifdef CONFIG_FILE_LOCKING - { - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU -#ifdef CONFIG_FILE_LOCKING - { - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#endif { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, -- cgit v1.2.3-58-ga151 From 9c011be132972ff94bde2ae99064e29f94e85c68 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:13 -0800 Subject: fs: move namei sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move namei's own sysctl knobs to its own file. Other than the move we also avoid initializing two static variables to 0 as this is not needed: * sysctl_protected_symlinks * sysctl_protected_hardlinks Link: https://lkml.kernel.org/r/20211129205548.605569-8-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++---- include/linux/fs.h | 1 - kernel/sysctl.c | 36 --------------------------------- 3 files changed, 54 insertions(+), 41 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index d81f04f8d818..b867a92c078e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1020,10 +1020,60 @@ static inline void put_link(struct nameidata *nd) path_put(&last->link); } -int sysctl_protected_symlinks __read_mostly = 0; -int sysctl_protected_hardlinks __read_mostly = 0; -int sysctl_protected_fifos __read_mostly; -int sysctl_protected_regular __read_mostly; +static int sysctl_protected_symlinks __read_mostly; +static int sysctl_protected_hardlinks __read_mostly; +static int sysctl_protected_fifos __read_mostly; +static int sysctl_protected_regular __read_mostly; + +#ifdef CONFIG_SYSCTL +static struct ctl_table namei_sysctls[] = { + { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; + +static int __init init_fs_namei_sysctls(void) +{ + register_sysctl_init("fs", namei_sysctls); + return 0; +} +fs_initcall(init_fs_namei_sysctls); + +#endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations diff --git a/include/linux/fs.h b/include/linux/fs.h index 0e08c3dd8f75..9617dea24978 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -81,7 +81,6 @@ extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; -extern int leases_enable, lease_break_time; typedef __kernel_rwf_t rwf_t; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 073948e9d165..53fb4692facc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2897,42 +2897,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "protected_symlinks", - .data = &sysctl_protected_symlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_hardlinks", - .data = &sysctl_protected_hardlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_fifos", - .data = &sysctl_protected_fifos, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "protected_regular", - .data = &sysctl_protected_regular, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, { .procname = "suid_dumpable", .data = &suid_dumpable, -- cgit v1.2.3-58-ga151 From 66ad398634c21e0a42ce10002ae06c39352da0d1 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:17 -0800 Subject: fs: move fs/exec.c sysctls into its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the fs/exec.c respective sysctls to its own file. Since checkpatch complains about style issues with the old code, this move also fixes a few of those minor style issues: * Use pr_warn() instead of prink(WARNING * New empty lines are wanted at the beginning of routines Link: https://lkml.kernel.org/r/20211129205548.605569-9-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 66 ------------------------------------------ 2 files changed, 90 insertions(+), 66 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 3c3c366a9bcf..310750340e4a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -2099,3 +2100,92 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, argv, envp, flags); } #endif + +#ifdef CONFIG_SYSCTL + +static void validate_coredump_safety(void) +{ +#ifdef CONFIG_COREDUMP + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + pr_warn( +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); + } +#endif +} + +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table fs_exec_sysctls[] = { + { + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_coredump, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; + +#ifdef CONFIG_COREDUMP + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table kernel_exec_sysctls[] = { + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; +#endif + +static int __init init_fs_exec_sysctls(void) +{ + register_sysctl_init("fs", fs_exec_sysctls); +#ifdef CONFIG_COREDUMP + register_sysctl_init("kernel", kernel_exec_sysctls); +#endif + return 0; +} + +fs_initcall(init_fs_exec_sysctls); +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 53fb4692facc..8bab82aa0192 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1117,40 +1117,6 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, do_proc_dopipe_max_size_conv, NULL); } -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} - -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} -#endif - #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -1874,29 +1840,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_COREDUMP - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -2897,15 +2840,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, { .procname = "pipe-max-size", .data = &pipe_max_size, -- cgit v1.2.3-58-ga151 From 1998f19324d24df7de4e74d81503b4299eb99e7d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:20 -0800 Subject: fs: move pipe sysctls to is own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the pipe sysctls to its own file. Link: https://lkml.kernel.org/r/20211129205548.605569-10-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--- include/linux/pipe_fs_i.h | 4 --- include/linux/sysctl.h | 6 +++++ kernel/sysctl.c | 61 +++++--------------------------------------- 4 files changed, 73 insertions(+), 62 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 6d4342bad9f1..cc28623a67b6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -50,13 +51,13 @@ * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_size = 1048576; +static unsigned int pipe_max_size = 1048576; /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ -unsigned long pipe_user_pages_hard; -unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; +static unsigned long pipe_user_pages_hard; +static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* * We use head and tail indices that aren't masked off, except at the point of @@ -1428,6 +1429,60 @@ static struct file_system_type pipe_fs_type = { .kill_sb = kill_anon_super, }; +#ifdef CONFIG_SYSCTL +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + if (write) { + unsigned int val; + + val = round_pipe_size(*lvalp); + if (val == 0) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, NULL); +} + +static struct ctl_table fs_pipe_sysctls[] = { + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(pipe_max_size), + .mode = 0644, + .proc_handler = proc_dopipe_max_size, + }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { } +}; +#endif + static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); @@ -1439,6 +1494,9 @@ static int __init init_pipe_fs(void) unregister_filesystem(&pipe_fs_type); } } +#ifdef CONFIG_SYSCTL + register_sysctl_init("fs", fs_pipe_sysctls); +#endif return err; } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index fc5642431b92..c00c618ef290 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -238,10 +238,6 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_size; -extern unsigned long pipe_user_pages_hard; -extern unsigned long pipe_user_pages_soft; - /* Wait for a pipe to be readable/writable while dropping the pipe lock */ void pipe_wait_readable(struct pipe_inode_info *); void pipe_wait_writable(struct pipe_inode_info *); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bb921eb8a02d..4294e9668bd5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -221,6 +221,12 @@ extern void __register_sysctl_init(const char *path, struct ctl_table *table, extern struct ctl_table_header *register_sysctl_mount_point(const char *path); void do_sysctl_args(void); +int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data); extern int pwrsw_enabled; extern int unaligned_enabled; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8bab82aa0192..f79ec2cf8721 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include #include @@ -761,12 +760,12 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); } -static int do_proc_douintvec(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) +int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) { return __do_proc_douintvec(table->data, table, write, buffer, lenp, ppos, conv, data); @@ -1090,33 +1089,6 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - unsigned int val; - - val = round_pipe_size(*lvalp); - if (val == 0) - return -EINVAL; - - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; - } - - return 0; -} - -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, NULL); -} - #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -2840,27 +2812,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "pipe-max-size", - .data = &pipe_max_size, - .maxlen = sizeof(pipe_max_size), - .mode = 0644, - .proc_handler = proc_dopipe_max_size, - }, - { - .procname = "pipe-user-pages-hard", - .data = &pipe_user_pages_hard, - .maxlen = sizeof(pipe_user_pages_hard), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "pipe-user-pages-soft", - .data = &pipe_user_pages_soft, - .maxlen = sizeof(pipe_user_pages_soft), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, { .procname = "mount-max", .data = &sysctl_mount_max, -- cgit v1.2.3-58-ga151 From 51cb8dfc5a5c39e6c70376b9dc9a14d624a9d271 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:24 -0800 Subject: sysctl: add and use base directory declarer and registration helper Patch series "sysctl: add and use base directory declarer and registration helper". In this patch series we start addressing base directories, and so we start with the "fs" sysctls. The end goal is we end up completely moving all "fs" sysctl knobs out from kernel/sysctl. This patch (of 6): Add a set of helpers which can be used to declare and register base directory sysctls on their own. We do this so we can later move each of the base sysctl directories like "fs", "kernel", etc, to their own respective files instead of shoving the declarations and registrations all on kernel/sysctl.c. The lazy approach has caught up and with this, we just end up extending the list of base directories / sysctls on one file and this makes maintenance difficult due to merge conflicts from many developers. The declarations are used first by kernel/sysctl.c for registration its own base which over time we'll try to clean up. It will be used in the next patch to demonstrate how to cleanly deal with base sysctl directories. [mcgrof@kernel.org: null-terminate the ctl_table arrays] Link: https://lkml.kernel.org/r/YafJY3rXDYnjK/gs@bombadil.infradead.org Link: https://lkml.kernel.org/r/20211129211943.640266-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211129211943.640266-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: Eric Biederman Cc: Stephen Kitt Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Christian Brauner Cc: Eric Biggers Cc: "Naveen N. Rao" Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 9 +++++++++ include/linux/sysctl.h | 24 ++++++++++++++++++++++++ kernel/sysctl.c | 41 ++++++++++------------------------------- 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b9d53b53d769..95dfd195e04e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1646,6 +1646,15 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) } EXPORT_SYMBOL(register_sysctl_table); +int __register_sysctl_base(struct ctl_table *base_table) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_table(base_table); + kmemleak_not_leak(hdr); + return 0; +} + static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4294e9668bd5..02134a8abad7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -194,6 +194,20 @@ struct ctl_path { #ifdef CONFIG_SYSCTL +#define DECLARE_SYSCTL_BASE(_name, _table) \ +static struct ctl_table _name##_base_table[] = { \ + { \ + .procname = #_name, \ + .mode = 0555, \ + .child = _table, \ + }, \ + { }, \ +} + +extern int __register_sysctl_base(struct ctl_table *base_table); + +#define register_sysctl_base(_name) __register_sysctl_base(_name##_base_table) + void proc_sys_poll_notify(struct ctl_table_poll *poll); extern void setup_sysctl_set(struct ctl_table_set *p, @@ -236,6 +250,16 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; #else /* CONFIG_SYSCTL */ + +#define DECLARE_SYSCTL_BASE(_name, _table) + +static inline int __register_sysctl_base(struct ctl_table *base_table) +{ + return 0; +} + +#define register_sysctl_base(table) __register_sysctl_base(table) + static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) { return NULL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f79ec2cf8721..12456a535059 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2851,41 +2851,20 @@ static struct ctl_table dev_table[] = { { } }; -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; +DECLARE_SYSCTL_BASE(kernel, kern_table); +DECLARE_SYSCTL_BASE(vm, vm_table); +DECLARE_SYSCTL_BASE(fs, fs_table); +DECLARE_SYSCTL_BASE(debug, debug_table); +DECLARE_SYSCTL_BASE(dev, dev_table); int __init sysctl_init(void) { - struct ctl_table_header *hdr; + register_sysctl_base(kernel); + register_sysctl_base(vm); + register_sysctl_base(fs); + register_sysctl_base(debug); + register_sysctl_base(dev); - hdr = register_sysctl_table(sysctl_base_table); - kmemleak_not_leak(hdr); return 0; } #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3-58-ga151 From ab171b952c6e065779687b44041038efdadb3915 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:27 -0800 Subject: fs: move namespace sysctls and declare fs base directory This moves the namespace sysctls to its own file as part of the kernel/sysctl.c spring cleaning Since we have now removed all sysctls for "fs", we now have to declare it on the filesystem code, we do that using the new helper, which reduces boiler plate code. We rename init_fs_shared_sysctls() to init_fs_sysctls() to reflect that now fs/sysctls.c is taking on the burden of being the first to register the base directory as well. Lastly, since init code will load in the order in which we link it we have to move the sysctl code to be linked in early, so that its early init routine runs prior to other fs code. This way, other filesystem code can register their own sysctls using the helpers after this: * register_sysctl_init() * register_sysctl() Link: https://lkml.kernel.org/r/20211129211943.640266-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 3 ++- fs/namespace.c | 24 +++++++++++++++++++++++- fs/sysctls.c | 9 +++++---- include/linux/mount.h | 3 --- kernel/sysctl.c | 14 -------------- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/fs/Makefile b/fs/Makefile index ea8770d124da..dab324aea08f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -6,6 +6,8 @@ # Rewritten to use lists instead of if-statements. # +obj-$(CONFIG_SYSCTL) += sysctls.o + obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o dcache.o inode.o \ @@ -28,7 +30,6 @@ obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o -obj-$(CONFIG_SYSCTL) += sysctls.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/fs/namespace.c b/fs/namespace.c index dc31ad6b370f..40b994a29e90 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -37,7 +37,7 @@ #include "internal.h" /* Maximum number of mounts in a mount namespace */ -unsigned int sysctl_mount_max __read_mostly = 100000; +static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; @@ -4620,3 +4620,25 @@ const struct proc_ns_operations mntns_operations = { .install = mntns_install, .owner = mntns_owner, }; + +#ifdef CONFIG_SYSCTL +static struct ctl_table fs_namespace_sysctls[] = { + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { } +}; + +static int __init init_fs_namespace_sysctls(void) +{ + register_sysctl_init("fs", fs_namespace_sysctls); + return 0; +} +fs_initcall(init_fs_namespace_sysctls); + +#endif /* CONFIG_SYSCTL */ diff --git a/fs/sysctls.c b/fs/sysctls.c index 54216cd1ecd7..c701273c9432 100644 --- a/fs/sysctls.c +++ b/fs/sysctls.c @@ -29,10 +29,11 @@ static struct ctl_table fs_shared_sysctls[] = { { } }; -static int __init init_fs_shared_sysctls(void) +DECLARE_SYSCTL_BASE(fs, fs_shared_sysctls); + +static int __init init_fs_sysctls(void) { - register_sysctl_init("fs", fs_shared_sysctls); - return 0; + return register_sysctl_base(fs); } -early_initcall(init_fs_shared_sysctls); +early_initcall(init_fs_sysctls); diff --git a/include/linux/mount.h b/include/linux/mount.h index 5d92a7e1a742..7f18a7555dff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -113,9 +113,6 @@ extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(const char *name); - -extern unsigned int sysctl_mount_max; - extern bool path_is_mountpoint(const struct path *path); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 12456a535059..cdc3dc5f0005 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2811,18 +2811,6 @@ static struct ctl_table vm_table[] = { { } }; -static struct ctl_table fs_table[] = { - { - .procname = "mount-max", - .data = &sysctl_mount_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { } -}; - static struct ctl_table debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { @@ -2853,7 +2841,6 @@ static struct ctl_table dev_table[] = { DECLARE_SYSCTL_BASE(kernel, kern_table); DECLARE_SYSCTL_BASE(vm, vm_table); -DECLARE_SYSCTL_BASE(fs, fs_table); DECLARE_SYSCTL_BASE(debug, debug_table); DECLARE_SYSCTL_BASE(dev, dev_table); @@ -2861,7 +2848,6 @@ int __init sysctl_init(void) { register_sysctl_base(kernel); register_sysctl_base(vm); - register_sysctl_base(fs); register_sysctl_base(debug); register_sysctl_base(dev); -- cgit v1.2.3-58-ga151 From d8c0418aac78e661b5283c9d6a1dfc61d44f26fd Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:31 -0800 Subject: kernel/sysctl.c: rename sysctl_init() to sysctl_init_bases() Rename sysctl_init() to sysctl_init_bases() so to reflect exactly what this is doing. Link: https://lkml.kernel.org/r/20211129211943.640266-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/alignment.c | 2 +- arch/sh/mm/alignment.c | 2 +- fs/proc/proc_sysctl.c | 4 ++-- include/linux/sysctl.h | 2 +- kernel/sysctl.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index adbb3817d0be..6f499559d193 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -1005,7 +1005,7 @@ static int __init noalign_setup(char *__unused) __setup("noalign", noalign_setup); /* - * This needs to be done after sysctl_init, otherwise sys/ will be + * This needs to be done after sysctl_init_bases(), otherwise sys/ will be * overwritten. Actually, this shouldn't be in sys/ at all since * it isn't a sysctl, and it doesn't contain sysctl information. * We now locate it in /proc/cpu/alignment instead. diff --git a/arch/sh/mm/alignment.c b/arch/sh/mm/alignment.c index fb517b82a87b..d802801ceb93 100644 --- a/arch/sh/mm/alignment.c +++ b/arch/sh/mm/alignment.c @@ -161,7 +161,7 @@ static const struct proc_ops alignment_proc_ops = { }; /* - * This needs to be done after sysctl_init, otherwise sys/ will be + * This needs to be done after sysctl_init_bases(), otherwise sys/ will be * overwritten. Actually, this shouldn't be in sys/ at all since * it isn't a sysctl, and it doesn't contain sysctl information. * We now locate it in /proc/cpu/alignment instead. diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 95dfd195e04e..7d9cfc730bd4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1419,7 +1419,7 @@ EXPORT_SYMBOL(register_sysctl); * Context: Can only be called after your respective sysctl base path has been * registered. So for instance, most base directories are registered early on * init before init levels are processed through proc_sys_init() and - * sysctl_init(). + * sysctl_init_bases(). */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name) @@ -1768,7 +1768,7 @@ int __init proc_sys_init(void) proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return sysctl_init(); + return sysctl_init_bases(); } struct sysctl_alias { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 02134a8abad7..180adf7da785 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -228,7 +228,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); -extern int sysctl_init(void); +extern int sysctl_init_bases(void); extern void __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name); #define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cdc3dc5f0005..bb07183cd305 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2844,7 +2844,7 @@ DECLARE_SYSCTL_BASE(vm, vm_table); DECLARE_SYSCTL_BASE(debug, debug_table); DECLARE_SYSCTL_BASE(dev, dev_table); -int __init sysctl_init(void) +int __init sysctl_init_bases(void) { register_sysctl_base(kernel); register_sysctl_base(vm); -- cgit v1.2.3-58-ga151 From fdcd4073fccc6f989308be3f1d61d8a68cd990ce Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:34 -0800 Subject: printk: fix build warning when CONFIG_PRINTK=n build warning when CONFIG_PRINTK=n kernel/printk/printk.c:175:5: warning: no previous prototype for 'devkmsg_sysctl_set_loglvl' [-Wmissing-prototypes] devkmsg_sysctl_set_loglvl() is only used in sysctl.c when CONFIG_PRINTK=y, but it participates in the build when CONFIG_PRINTK=n. So add compile dependency CONFIG_PRINTK=y && CONFIG_SYSCTL=y to fix the build warning. Link: https://lkml.kernel.org/r/20211129211943.640266-5-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 4 ---- kernel/printk/internal.h | 2 ++ kernel/printk/printk.c | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index 9497f6b98339..1522df223c0f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -183,10 +183,6 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, extern int printk_delay_msec; extern int dmesg_restrict; -extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, - size_t *lenp, loff_t *ppos); - extern void wake_up_klogd(void); char *log_buf_addr_get(void); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 6b1c4b399845..d947ca6c84f9 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,6 +6,8 @@ #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) void __init printk_sysctl_init(void); +int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #else #define printk_sysctl_init() do { } while (0) #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 363a493bded8..82abfaf3c2aa 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -171,7 +171,7 @@ static int __init control_devkmsg(char *str) __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; - +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -210,6 +210,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return 0; } +#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; -- cgit v1.2.3-58-ga151 From f0bc21b268c1464603192a00851cdbbf7c2cdc36 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:38 -0800 Subject: fs/coredump: move coredump sysctls into its own file This moves the fs/coredump.c respective sysctls to its own file. Link: https://lkml.kernel.org/r/20211129211943.640266-6-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 66 ++++++++++++++++++++++++++++++++++++++++++++---- fs/exec.c | 55 ---------------------------------------- include/linux/coredump.h | 10 +++++--- kernel/sysctl.c | 2 -- 4 files changed, 67 insertions(+), 66 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 7dece20b162b..1c060c0a2d72 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -52,9 +53,9 @@ #include -int core_uses_pid; -unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; +static int core_uses_pid; +static unsigned int core_pipe_limit; +static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -62,8 +63,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -893,6 +892,63 @@ int dump_align(struct coredump_params *cprm, int align) } EXPORT_SYMBOL(dump_align); +#ifdef CONFIG_SYSCTL + +void validate_coredump_safety(void) +{ + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + pr_warn( +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); + } +} + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table coredump_sysctls[] = { + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static int __init init_fs_coredump_sysctls(void) +{ + register_sysctl_init("kernel", coredump_sysctls); + return 0; +} +fs_initcall(init_fs_coredump_sysctls); +#endif /* CONFIG_SYSCTL */ + /* * The purpose of always_dump_vma() is to make sure that special kernel mappings * that are useful for post-mortem analysis are included in every core dump. diff --git a/fs/exec.c b/fs/exec.c index 310750340e4a..79f2c9483302 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2103,20 +2103,6 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, #ifdef CONFIG_SYSCTL -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - pr_warn( -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -2140,50 +2126,9 @@ static struct ctl_table fs_exec_sysctls[] = { { } }; -#ifdef CONFIG_COREDUMP - -static int proc_dostring_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - - if (!error) - validate_coredump_safety(); - return error; -} - -static struct ctl_table kernel_exec_sysctls[] = { - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; -#endif - static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); -#ifdef CONFIG_COREDUMP - register_sysctl_init("kernel", kernel_exec_sysctls); -#endif return 0; } diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 78fcd776b185..248a68c668b4 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -14,10 +14,6 @@ struct core_vma_metadata { unsigned long dump_size; }; -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; - /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. @@ -37,4 +33,10 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); static inline void do_coredump(const kernel_siginfo_t *siginfo) {} #endif +#if defined(CONFIG_COREDUMP) && defined(CONFIG_SYSCTL) +extern void validate_coredump_safety(void); +#else +static inline void validate_coredump_safety(void) {} +#endif + #endif /* _LINUX_COREDUMP_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bb07183cd305..c78585292005 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -62,12 +62,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include -- cgit v1.2.3-58-ga151 From a737a3c6744bc822d1e6a837fef550e665ddf877 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:41 -0800 Subject: kprobe: move sysctl_kprobes_optimization to kprobes.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Move sysctl_kprobes_optimization from kernel/sysctl.c to kernel/kprobes.c. Use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: fix compile issue when CONFIG_OPTPROBES is disabled] Link: https://lkml.kernel.org/r/20211129211943.640266-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 6 ------ kernel/kprobes.c | 30 ++++++++++++++++++++++++++---- kernel/sysctl.c | 12 ------------ 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8c8f7a4d93af..19b884353b15 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -348,12 +348,6 @@ extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); DEFINE_INSN_CACHE_OPS(optinsn); -#ifdef CONFIG_SYSCTL -extern int sysctl_kprobes_optimization; -extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void *buffer, - size_t *length, loff_t *ppos); -#endif /* CONFIG_SYSCTL */ extern void wait_for_kprobe_optimizer(void); #else /* !CONFIG_OPTPROBES */ static inline void wait_for_kprobe_optimizer(void) { } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 21eccc961bba..94cab8c9ce56 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -48,6 +48,9 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) +#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL) +#define kprobe_sysctls_init() do { } while (0) +#endif static int kprobes_initialized; /* kprobe_table can be accessed by @@ -938,10 +941,10 @@ static void unoptimize_all_kprobes(void) } static DEFINE_MUTEX(kprobe_sysctl_mutex); -int sysctl_kprobes_optimization; -int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos) +static int sysctl_kprobes_optimization; +static int proc_kprobes_optimization_handler(struct ctl_table *table, + int write, void *buffer, + size_t *length, loff_t *ppos) { int ret; @@ -957,6 +960,24 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, return ret; } + +static struct ctl_table kprobe_sysctls[] = { + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static void __init kprobe_sysctls_init(void) +{ + register_sysctl_init("debug", kprobe_sysctls); +} #endif /* CONFIG_SYSCTL */ /* Put a breakpoint for a probe. */ @@ -2584,6 +2605,7 @@ static int __init init_kprobes(void) err = register_module_notifier(&kprobe_module_nb); kprobes_initialized = (err == 0); + kprobe_sysctls_init(); return err; } early_initcall(init_kprobes); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c78585292005..c322bb629d10 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -2818,17 +2817,6 @@ static struct ctl_table debug_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -#endif -#if defined(CONFIG_OPTPROBES) - { - .procname = "kprobes-optimization", - .data = &sysctl_kprobes_optimization, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_kprobes_optimization_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { } }; -- cgit v1.2.3-58-ga151 From e565a8ed1ee4b481539b66cd6f54df9ecf1e9861 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 21 Jan 2022 22:13:45 -0800 Subject: kernel/sysctl.c: remove unused variable ten_thousand The const variable ten_thousand is not used, it is redundant and can be removed. Cleans up clang warning: kernel/sysctl.c:99:18: warning: unused variable 'ten_thousand' [-Wunused-const-variable] static const int ten_thousand = 10000; Link: https://lkml.kernel.org/r/20211221184501.574670-1-colin.i.king@gmail.com Fixes: c26da54dc8ca ("printk: move printk sysctl to printk/sysctl.c") Signed-off-by: Colin Ian King Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c322bb629d10..e92e8d21d6bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,9 +96,6 @@ /* Constants used for minimum and maximum */ -#ifdef CONFIG_PRINTK -static const int ten_thousand = 10000; -#endif #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; #endif -- cgit v1.2.3-58-ga151 From 1622ed7d0743201293094162c26019d2573ecacb Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Jan 2022 22:13:48 -0800 Subject: sysctl: returns -EINVAL when a negative value is passed to proc_doulongvec_minmax When we pass a negative value to the proc_doulongvec_minmax() function, the function returns 0, but the corresponding interface value does not change. we can easily reproduce this problem with the following commands: cd /proc/sys/fs/epoll echo -1 > max_user_watches; echo $?; cat max_user_watches This function requires a non-negative number to be passed in, so when a negative number is passed in, -EINVAL is returned. Link: https://lkml.kernel.org/r/20211220092627.3744624-1-libaokun1@huawei.com Signed-off-by: Baokun Li Reported-by: Hulk Robot Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e92e8d21d6bf..5ae443b2882e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1145,10 +1145,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); - if (err) + if (err || neg) { + err = -EINVAL; break; - if (neg) - continue; + } + val = convmul * val / convdiv; if ((min && val < *min) || (max && val > *max)) { err = -EINVAL; -- cgit v1.2.3-58-ga151 From 67f1c9cd0c561d25354c1e289cdd0d77d3112513 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:13:51 -0800 Subject: zsmalloc: introduce some helper functions Patch series "zsmalloc: remove bit_spin_lock", v2. zsmalloc uses bit_spin_lock to minimize space overhead since it's zpage granularity lock. However, it causes zsmalloc non-working under PREEMPT_RT as well as adding too much complication. This patchset tries to replace the bit_spin_lock with per-pool rwlock. It also removes unnecessary zspage isolation logic from class, which was the other part too much complication added into zsmalloc. Last patch changes the get_cpu_var to local_lock to make it work in PREEMPT_RT. This patch (of 9): get_zspage_mapping returns fullness as well as class_idx. However, the fullness is usually not used since it could be stale in some contexts. It causes misleading as well as unnecessary instructions so this patch introduces zspage_class. obj_to_location also produces page and index but we don't need always the index, either so this patch introduces obj_to_page. Link: https://lkml.kernel.org/r/20211115185909.3949505-1-minchan@kernel.org Link: https://lkml.kernel.org/r/20211115185909.3949505-2-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 54 +++++++++++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0d3b65939016..dfa58511fa5b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -517,6 +517,12 @@ static void get_zspage_mapping(struct zspage *zspage, *class_idx = zspage->class; } +static struct size_class *zspage_class(struct zs_pool *pool, + struct zspage *zspage) +{ + return pool->size_class[zspage->class]; +} + static void set_zspage_mapping(struct zspage *zspage, unsigned int class_idx, enum fullness_group fullness) @@ -844,6 +850,12 @@ static void obj_to_location(unsigned long obj, struct page **page, *obj_idx = (obj & OBJ_INDEX_MASK); } +static void obj_to_page(unsigned long obj, struct page **page) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); +} + /** * location_to_obj - get obj value encoded from (, ) * @page: page object resides in zspage @@ -1246,8 +1258,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; struct page *pages[2]; @@ -1270,8 +1280,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, /* migration cannot move any subpage in this zspage */ migrate_read_lock(zspage); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; area = &get_cpu_var(zs_map_area); @@ -1304,16 +1313,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; area = this_cpu_ptr(&zs_map_area); @@ -1491,8 +1497,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle) struct zspage *zspage; struct page *f_page; unsigned long obj; - unsigned int f_objidx; - int class_idx; struct size_class *class; enum fullness_group fullness; bool isolated; @@ -1502,13 +1506,11 @@ void zs_free(struct zs_pool *pool, unsigned long handle) pin_tag(handle); obj = handle_to_obj(handle); - obj_to_location(obj, &f_page, &f_objidx); + obj_to_page(obj, &f_page); zspage = get_zspage(f_page); migrate_read_lock(zspage); - - get_zspage_mapping(zspage, &class_idx, &fullness); - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); spin_lock(&class->lock); obj_free(class, obj); @@ -1866,8 +1868,6 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { struct zs_pool *pool; struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; struct address_space *mapping; @@ -1880,15 +1880,10 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) zspage = get_zspage(page); - /* - * Without class lock, fullness could be stale while class_idx is okay - * because class_idx is constant unless page is freed so we should get - * fullness again under class lock. - */ - get_zspage_mapping(zspage, &class_idx, &fullness); mapping = page_mapping(page); pool = mapping->private_data; - class = pool->size_class[class_idx]; + + class = zspage_class(pool, zspage); spin_lock(&class->lock); if (get_zspage_inuse(zspage) == 0) { @@ -1907,6 +1902,9 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) * size_class to prevent further object allocation from the zspage. */ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { + enum fullness_group fullness; + unsigned int class_idx; + get_zspage_mapping(zspage, &class_idx, &fullness); atomic_long_inc(&pool->isolated_pages); remove_zspage(class, zspage, fullness); @@ -1923,8 +1921,6 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, { struct zs_pool *pool; struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; @@ -1949,9 +1945,8 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, /* Concurrent compactor cannot migrate any subpage in zspage */ migrate_write_lock(zspage); - get_zspage_mapping(zspage, &class_idx, &fullness); pool = mapping->private_data; - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); offset = get_first_obj_offset(page); spin_lock(&class->lock); @@ -2049,8 +2044,6 @@ static void zs_page_putback(struct page *page) { struct zs_pool *pool; struct size_class *class; - int class_idx; - enum fullness_group fg; struct address_space *mapping; struct zspage *zspage; @@ -2058,10 +2051,9 @@ static void zs_page_putback(struct page *page) VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); mapping = page_mapping(page); pool = mapping->private_data; - class = pool->size_class[class_idx]; + class = zspage_class(pool, zspage); spin_lock(&class->lock); dec_zspage_isolation(zspage); -- cgit v1.2.3-58-ga151 From 3828a76470792aaa5f2de5c0d7fce497187c1e35 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:13:54 -0800 Subject: zsmalloc: rename zs_stat_type to class_stat_type The stat aims for class stat, not zspage so rename it. Link: https://lkml.kernel.org/r/20211115185909.3949505-3-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index dfa58511fa5b..e219593cb9de 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -158,7 +158,7 @@ enum fullness_group { NR_ZS_FULLNESS, }; -enum zs_stat_type { +enum class_stat_type { CLASS_EMPTY, CLASS_ALMOST_EMPTY, CLASS_ALMOST_FULL, @@ -549,21 +549,21 @@ static int get_size_class_index(int size) return min_t(int, ZS_SIZE_CLASSES - 1, idx); } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_inc(struct size_class *class, +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_inc(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] += cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_dec(struct size_class *class, +/* type can be of enum type class_stat_type or fullness_group */ +static inline void class_stat_dec(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] -= cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ +/* type can be of enum type class_stat_type or fullness_group */ static inline unsigned long zs_stat_get(struct size_class *class, int type) { @@ -725,7 +725,7 @@ static void insert_zspage(struct size_class *class, { struct zspage *head; - zs_stat_inc(class, fullness, 1); + class_stat_inc(class, fullness, 1); head = list_first_entry_or_null(&class->fullness_list[fullness], struct zspage, list); /* @@ -750,7 +750,7 @@ static void remove_zspage(struct size_class *class, VM_BUG_ON(is_zspage_isolated(zspage)); list_del_init(&zspage->list); - zs_stat_dec(class, fullness, 1); + class_stat_dec(class, fullness, 1); } /* @@ -964,7 +964,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, cache_free_zspage(pool, zspage); - zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); } @@ -1394,7 +1394,7 @@ static unsigned long obj_malloc(struct size_class *class, kunmap_atomic(vaddr); mod_zspage_inuse(zspage, 1); - zs_stat_inc(class, OBJ_USED, 1); + class_stat_inc(class, OBJ_USED, 1); obj = location_to_obj(m_page, obj); @@ -1458,7 +1458,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); - zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); @@ -1489,7 +1489,7 @@ static void obj_free(struct size_class *class, unsigned long obj) kunmap_atomic(vaddr); set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); - zs_stat_dec(class, OBJ_USED, 1); + class_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) -- cgit v1.2.3-58-ga151 From 0a5f079b810765be3bd931fce0f88154035af897 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:13:57 -0800 Subject: zsmalloc: decouple class actions from zspage works This patch moves class stat update out of obj_malloc since it's not related to zspage operation. This is a preparation to introduce new lock scheme in next patch. Link: https://lkml.kernel.org/r/20211115185909.3949505-4-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e219593cb9de..c45807f170e8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1360,17 +1360,19 @@ size_t zs_huge_class_size(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_huge_class_size); -static unsigned long obj_malloc(struct size_class *class, +static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { int i, nr_page, offset; unsigned long obj; struct link_free *link; + struct size_class *class; struct page *m_page; unsigned long m_offset; void *vaddr; + class = pool->size_class[zspage->class]; handle |= OBJ_ALLOCATED_TAG; obj = get_freeobj(zspage); @@ -1394,7 +1396,6 @@ static unsigned long obj_malloc(struct size_class *class, kunmap_atomic(vaddr); mod_zspage_inuse(zspage, 1); - class_stat_inc(class, OBJ_USED, 1); obj = location_to_obj(m_page, obj); @@ -1433,10 +1434,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) spin_lock(&class->lock); zspage = find_get_zspage(class); if (likely(zspage)) { - obj = obj_malloc(class, zspage, handle); + obj = obj_malloc(pool, zspage, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, zspage); record_obj(handle, obj); + class_stat_inc(class, OBJ_USED, 1); spin_unlock(&class->lock); return handle; @@ -1451,7 +1453,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } spin_lock(&class->lock); - obj = obj_malloc(class, zspage, handle); + obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class->index, newfg); @@ -1459,6 +1461,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, OBJ_USED, 1); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); @@ -1468,7 +1471,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(struct size_class *class, unsigned long obj) +static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; @@ -1478,7 +1481,7 @@ static void obj_free(struct size_class *class, unsigned long obj) void *vaddr; obj_to_location(obj, &f_page, &f_objidx); - f_offset = (class->size * f_objidx) & ~PAGE_MASK; + f_offset = (class_size * f_objidx) & ~PAGE_MASK; zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); @@ -1489,7 +1492,6 @@ static void obj_free(struct size_class *class, unsigned long obj) kunmap_atomic(vaddr); set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); - class_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) @@ -1513,7 +1515,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) class = zspage_class(pool, zspage); spin_lock(&class->lock); - obj_free(class, obj); + obj_free(class->size, obj); + class_stat_dec(class, OBJ_USED, 1); fullness = fix_fullness_group(class, zspage); if (fullness != ZS_EMPTY) { migrate_read_unlock(zspage); @@ -1671,7 +1674,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(class, get_zspage(d_page), handle); + free_obj = obj_malloc(pool, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; /* @@ -1683,7 +1686,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); unpin_tag(handle); - obj_free(class, used_obj); + obj_free(class->size, used_obj); } /* Remember last position in this iteration */ -- cgit v1.2.3-58-ga151 From 3ae92ac23bd88ed18c43a5b138d5a91252c31d2a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:01 -0800 Subject: zsmalloc: introduce obj_allocated The usage pattern for obj_to_head is to check whether the zpage is allocated or not. Thus, introduce obj_allocated. Link: https://lkml.kernel.org/r/20211115185909.3949505-5-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c45807f170e8..5c72fd5f8fac 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -877,13 +877,21 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static unsigned long obj_to_head(struct page *page, void *obj) +static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) { + unsigned long handle; + if (unlikely(PageHugeObject(page))) { VM_BUG_ON_PAGE(!is_first_page(page), page); - return page->index; + handle = page->index; } else - return *(unsigned long *)obj; + handle = *(unsigned long *)obj; + + if (!(handle & OBJ_ALLOCATED_TAG)) + return false; + + *phandle = handle & ~OBJ_ALLOCATED_TAG; + return true; } static inline int testpin_tag(unsigned long handle) @@ -1606,7 +1614,6 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { - unsigned long head; int offset = 0; int index = *obj_idx; unsigned long handle = 0; @@ -1616,9 +1623,7 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - head = obj_to_head(page, addr + offset); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; + if (obj_allocated(page, addr + offset, &handle)) { if (trypin_tag(handle)) break; handle = 0; @@ -1928,7 +1933,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, struct page *dummy; void *s_addr, *d_addr, *addr; int offset, pos; - unsigned long handle, head; + unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; int ret = -EAGAIN; @@ -1964,9 +1969,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, pos = offset; s_addr = kmap_atomic(page); while (pos < PAGE_SIZE) { - head = obj_to_head(page, s_addr + pos); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; + if (obj_allocated(page, s_addr + pos, &handle)) { if (!trypin_tag(handle)) goto unpin_objects; } @@ -1982,9 +1985,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, for (addr = s_addr + offset; addr < s_addr + pos; addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; + if (obj_allocated(page, addr, &handle)) { BUG_ON(!testpin_tag(handle)); old_obj = handle_to_obj(handle); @@ -2029,9 +2030,7 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, unpin_objects: for (addr = s_addr + offset; addr < s_addr + pos; addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; + if (obj_allocated(page, addr, &handle)) { BUG_ON(!testpin_tag(handle)); unpin_tag(handle); } -- cgit v1.2.3-58-ga151 From a41ec880aa7beeabcb3bfa476ef35b23f376133b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:04 -0800 Subject: zsmalloc: move huge compressed obj from page to zspage The flag aims for zspage, not per page. Let's move it to zspage. Link: https://lkml.kernel.org/r/20211115185909.3949505-6-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5c72fd5f8fac..648306245d51 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -121,6 +121,7 @@ #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) +#define HUGE_BITS 1 #define FULLNESS_BITS 2 #define CLASS_BITS 8 #define ISOLATED_BITS 3 @@ -213,22 +214,6 @@ struct size_class { struct zs_size_stat stats; }; -/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ -static void SetPageHugeObject(struct page *page) -{ - SetPageOwnerPriv1(page); -} - -static void ClearPageHugeObject(struct page *page) -{ - ClearPageOwnerPriv1(page); -} - -static int PageHugeObject(struct page *page) -{ - return PageOwnerPriv1(page); -} - /* * Placed within free objects to form a singly linked list. * For every zspage, zspage->freeobj gives head of this list. @@ -278,6 +263,7 @@ struct zs_pool { struct zspage { struct { + unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; unsigned int isolated:ISOLATED_BITS; @@ -298,6 +284,17 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +static void SetZsHugePage(struct zspage *zspage) +{ + zspage->huge = 1; +} + +static bool ZsHugePage(struct zspage *zspage) +{ + return zspage->huge; +} + #ifdef CONFIG_COMPACTION static int zs_register_migration(struct zs_pool *pool); static void zs_unregister_migration(struct zs_pool *pool); @@ -830,7 +827,9 @@ static struct zspage *get_zspage(struct page *page) static struct page *get_next_page(struct page *page) { - if (unlikely(PageHugeObject(page))) + struct zspage *zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) return NULL; return (struct page *)page->index; @@ -880,8 +879,9 @@ static unsigned long handle_to_obj(unsigned long handle) static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) { unsigned long handle; + struct zspage *zspage = get_zspage(page); - if (unlikely(PageHugeObject(page))) { + if (unlikely(ZsHugePage(zspage))) { VM_BUG_ON_PAGE(!is_first_page(page), page); handle = page->index; } else @@ -920,7 +920,6 @@ static void reset_page(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); page_mapcount_reset(page); - ClearPageHugeObject(page); page->index = 0; } @@ -1062,7 +1061,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, SetPagePrivate(page); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) - SetPageHugeObject(page); + SetZsHugePage(zspage); } else { prev_page->index = (unsigned long)page; } @@ -1307,7 +1306,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, ret = __zs_map_object(area, pages, off, class->size); out: - if (likely(!PageHugeObject(page))) + if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; return ret; @@ -1395,7 +1394,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); - if (likely(!PageHugeObject(m_page))) + if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle; else @@ -1496,7 +1495,10 @@ static void obj_free(int class_size, unsigned long obj) /* Insert this object in containing zspage's freelist */ link = (struct link_free *)(vaddr + f_offset); - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; kunmap_atomic(vaddr); set_freeobj(zspage, f_objidx); mod_zspage_inuse(zspage, -1); @@ -1867,7 +1869,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, create_page_chain(class, zspage, pages); set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); - if (unlikely(PageHugeObject(oldpage))) + if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; __SetPageMovable(newpage, page_mapping(oldpage)); } -- cgit v1.2.3-58-ga151 From c4549b871102db01ba23cce4bd0cdac511f8991d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:07 -0800 Subject: zsmalloc: remove zspage isolation for migration zspage isolation for migration introduced additional exceptions to be dealt with since the zspage was isolated from class list. The reason why I isolated zspage from class list was to prevent race between obj_malloc and page migration via allocating zpage from the zspage further. However, it couldn't prevent object freeing from zspage so it needed corner case handling. This patch removes the whole mess. Now, we are fine since class->lock and zspage->lock can prevent the race. Link: https://lkml.kernel.org/r/20211115185909.3949505-7-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 157 +++------------------------------------------------------- 1 file changed, 8 insertions(+), 149 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 648306245d51..c150cea15a6c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -254,10 +254,6 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct inode *inode; struct work_struct free_work; - /* A wait queue for when migration races with async_free_zspage() */ - struct wait_queue_head migration_wait; - atomic_long_t isolated_pages; - bool destroying; #endif }; @@ -454,11 +450,6 @@ MODULE_ALIAS("zpool-zsmalloc"); /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); -static bool is_zspage_isolated(struct zspage *zspage) -{ - return zspage->isolated; -} - static __maybe_unused int is_first_page(struct page *page) { return PagePrivate(page); @@ -744,7 +735,6 @@ static void remove_zspage(struct size_class *class, enum fullness_group fullness) { VM_BUG_ON(list_empty(&class->fullness_list[fullness])); - VM_BUG_ON(is_zspage_isolated(zspage)); list_del_init(&zspage->list); class_stat_dec(class, fullness, 1); @@ -770,13 +760,9 @@ static enum fullness_group fix_fullness_group(struct size_class *class, if (newfg == currfg) goto out; - if (!is_zspage_isolated(zspage)) { - remove_zspage(class, zspage, currfg); - insert_zspage(class, zspage, newfg); - } - + remove_zspage(class, zspage, currfg); + insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class_idx, newfg); - out: return newfg; } @@ -1511,7 +1497,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle) unsigned long obj; struct size_class *class; enum fullness_group fullness; - bool isolated; if (unlikely(!handle)) return; @@ -1533,11 +1518,9 @@ void zs_free(struct zs_pool *pool, unsigned long handle) goto out; } - isolated = is_zspage_isolated(zspage); migrate_read_unlock(zspage); /* If zspage is isolated, zs_page_putback will free the zspage */ - if (likely(!isolated)) - free_zspage(pool, class, zspage); + free_zspage(pool, class, zspage); out: spin_unlock(&class->lock); @@ -1718,7 +1701,6 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source) zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], struct zspage, list); if (zspage) { - VM_BUG_ON(is_zspage_isolated(zspage)); remove_zspage(class, zspage, fg[i]); return zspage; } @@ -1739,8 +1721,6 @@ static enum fullness_group putback_zspage(struct size_class *class, { enum fullness_group fullness; - VM_BUG_ON(is_zspage_isolated(zspage)); - fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); set_zspage_mapping(zspage, class->index, fullness); @@ -1822,35 +1802,10 @@ static void inc_zspage_isolation(struct zspage *zspage) static void dec_zspage_isolation(struct zspage *zspage) { + VM_BUG_ON(zspage->isolated == 0); zspage->isolated--; } -static void putback_zspage_deferred(struct zs_pool *pool, - struct size_class *class, - struct zspage *zspage) -{ - enum fullness_group fg; - - fg = putback_zspage(class, zspage); - if (fg == ZS_EMPTY) - schedule_work(&pool->free_work); - -} - -static inline void zs_pool_dec_isolated(struct zs_pool *pool) -{ - VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); - atomic_long_dec(&pool->isolated_pages); - /* - * Checking pool->destroying must happen after atomic_long_dec() - * for pool->isolated_pages above. Paired with the smp_mb() in - * zs_unregister_migration(). - */ - smp_mb__after_atomic(); - if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) - wake_up_all(&pool->migration_wait); -} - static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1876,10 +1831,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { - struct zs_pool *pool; - struct size_class *class; struct zspage *zspage; - struct address_space *mapping; /* * Page is locked so zspage couldn't be destroyed. For detail, look at @@ -1889,39 +1841,9 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); - - mapping = page_mapping(page); - pool = mapping->private_data; - - class = zspage_class(pool, zspage); - - spin_lock(&class->lock); - if (get_zspage_inuse(zspage) == 0) { - spin_unlock(&class->lock); - return false; - } - - /* zspage is isolated for object migration */ - if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - spin_unlock(&class->lock); - return false; - } - - /* - * If this is first time isolation for the zspage, isolate zspage from - * size_class to prevent further object allocation from the zspage. - */ - if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - enum fullness_group fullness; - unsigned int class_idx; - - get_zspage_mapping(zspage, &class_idx, &fullness); - atomic_long_inc(&pool->isolated_pages); - remove_zspage(class, zspage, fullness); - } - + migrate_write_lock(zspage); inc_zspage_isolation(zspage); - spin_unlock(&class->lock); + migrate_write_unlock(zspage); return true; } @@ -2004,21 +1926,6 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, dec_zspage_isolation(zspage); - /* - * Page migration is done so let's putback isolated zspage to - * the list if @page is final isolated subpage in the zspage. - */ - if (!is_zspage_isolated(zspage)) { - /* - * We cannot race with zs_destroy_pool() here because we wait - * for isolation to hit zero before we start destroying. - * Also, we ensure that everyone can see pool->destroying before - * we start waiting. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } - if (page_zone(newpage) != page_zone(page)) { dec_zone_page_state(page, NR_ZSPAGES); inc_zone_page_state(newpage, NR_ZSPAGES); @@ -2046,30 +1953,15 @@ unpin_objects: static void zs_page_putback(struct page *page) { - struct zs_pool *pool; - struct size_class *class; - struct address_space *mapping; struct zspage *zspage; VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); - mapping = page_mapping(page); - pool = mapping->private_data; - class = zspage_class(pool, zspage); - - spin_lock(&class->lock); + migrate_write_lock(zspage); dec_zspage_isolation(zspage); - if (!is_zspage_isolated(zspage)) { - /* - * Due to page_lock, we cannot free zspage immediately - * so let's defer. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } - spin_unlock(&class->lock); + migrate_write_unlock(zspage); } static const struct address_space_operations zsmalloc_aops = { @@ -2091,36 +1983,8 @@ static int zs_register_migration(struct zs_pool *pool) return 0; } -static bool pool_isolated_are_drained(struct zs_pool *pool) -{ - return atomic_long_read(&pool->isolated_pages) == 0; -} - -/* Function for resolving migration */ -static void wait_for_isolated_drain(struct zs_pool *pool) -{ - - /* - * We're in the process of destroying the pool, so there are no - * active allocations. zs_page_isolate() fails for completely free - * zspages, so we need only wait for the zs_pool's isolated - * count to hit zero. - */ - wait_event(pool->migration_wait, - pool_isolated_are_drained(pool)); -} - static void zs_unregister_migration(struct zs_pool *pool) { - pool->destroying = true; - /* - * We need a memory barrier here to ensure global visibility of - * pool->destroying. Thus pool->isolated pages will either be 0 in which - * case we don't care, or it will be > 0 and pool->destroying will - * ensure that we wake up once isolation hits 0. - */ - smp_mb(); - wait_for_isolated_drain(pool); /* This can block */ flush_work(&pool->free_work); iput(pool->inode); } @@ -2150,7 +2014,6 @@ static void async_free_zspage(struct work_struct *work) spin_unlock(&class->lock); } - list_for_each_entry_safe(zspage, tmp, &free_pages, list) { list_del(&zspage->list); lock_zspage(zspage); @@ -2363,10 +2226,6 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto err; -#ifdef CONFIG_COMPACTION - init_waitqueue_head(&pool->migration_wait); -#endif - if (create_cache(pool)) goto err; -- cgit v1.2.3-58-ga151 From 4a57d6bbaecd28c8175dc5da013009e4158018c2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:10 -0800 Subject: locking/rwlocks: introduce write_lock_nested In preparation for converting bit_spin_lock to rwlock in zsmalloc so that multiple writers of zspages can run at the same time but those zspages are supposed to be different zspage instance. Thus, it's not deadlock. This patch adds write_lock_nested to support the case for LOCKDEP. [minchan@kernel.org: fix write_lock_nested for RT] Link: https://lkml.kernel.org/r/YZfrMTAXV56HFWJY@google.com [bigeasy@linutronix.de: fixup write_lock_nested() implementation] Link: https://lkml.kernel.org/r/20211123170134.y6xb7pmpgdn4m3bn@linutronix.de Link: https://lkml.kernel.org/r/20211115185909.3949505-8-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rwlock.h | 6 ++++++ include/linux/rwlock_api_smp.h | 8 ++++++++ include/linux/rwlock_rt.h | 10 ++++++++++ include/linux/spinlock_api_up.h | 1 + kernel/locking/spinlock.c | 10 ++++++++++ kernel/locking/spinlock_rt.c | 12 ++++++++++++ 6 files changed, 47 insertions(+) diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 2c0ad417ce3c..8f416c5e929e 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -55,6 +55,12 @@ do { \ #define write_lock(lock) _raw_write_lock(lock) #define read_lock(lock) _raw_read_lock(lock) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define write_lock_nested(lock, subclass) _raw_write_lock_nested(lock, subclass) +#else +#define write_lock_nested(lock, subclass) _raw_write_lock(lock) +#endif + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define read_lock_irqsave(lock, flags) \ diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index f1db6f17c4fb..dceb0a59b692 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -17,6 +17,7 @@ void __lockfunc _raw_read_lock(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock(rwlock_t *lock) __acquires(lock); +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) __acquires(lock); void __lockfunc _raw_read_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires(lock); @@ -209,6 +210,13 @@ static inline void __raw_write_lock(rwlock_t *lock) LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); } +static inline void __raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + preempt_disable(); + rwlock_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); +} + #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ static inline void __raw_write_unlock(rwlock_t *lock) diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h index 49c1f3842ed5..8544ff05e594 100644 --- a/include/linux/rwlock_rt.h +++ b/include/linux/rwlock_rt.h @@ -28,6 +28,7 @@ extern void rt_read_lock(rwlock_t *rwlock); extern int rt_read_trylock(rwlock_t *rwlock); extern void rt_read_unlock(rwlock_t *rwlock); extern void rt_write_lock(rwlock_t *rwlock); +extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass); extern int rt_write_trylock(rwlock_t *rwlock); extern void rt_write_unlock(rwlock_t *rwlock); @@ -83,6 +84,15 @@ static __always_inline void write_lock(rwlock_t *rwlock) rt_write_lock(rwlock); } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static __always_inline void write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rt_write_lock_nested(rwlock, subclass); +} +#else +#define write_lock_nested(lock, subclass) rt_write_lock(((void)(subclass), (lock))) +#endif + static __always_inline void write_lock_bh(rwlock_t *rwlock) { local_bh_disable(); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d0d188861ad6..b8ba00ccccde 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -59,6 +59,7 @@ #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) +#define _raw_write_lock_nested(lock, subclass) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) #define _raw_read_lock_bh(lock) __LOCK_BH(lock) #define _raw_write_lock_bh(lock) __LOCK_BH(lock) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index b562f9289372..7f49baaa4979 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -300,6 +300,16 @@ void __lockfunc _raw_write_lock(rwlock_t *lock) __raw_write_lock(lock); } EXPORT_SYMBOL(_raw_write_lock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock))) +#endif + +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + __raw_write_lock_nested(lock, subclass); +} +EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 9e396a09fe0f..48a19ed8486d 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -239,6 +239,18 @@ void __sched rt_write_lock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock_nested); +#endif + void __sched rt_read_unlock(rwlock_t *rwlock) { rwlock_release(&rwlock->dep_map, _RET_IP_); -- cgit v1.2.3-58-ga151 From b475d42d2c43321d8bea685f54916220cb76b511 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:13 -0800 Subject: zsmalloc: replace per zpage lock with pool->migrate_lock The zsmalloc has used a bit for spin_lock in zpage handle to keep zpage object alive during several operations. However, it causes the problem for PREEMPT_RT as well as introducing too complicated. This patch replaces the bit spin_lock with pool->migrate_lock rwlock. It could make the code simple as well as zsmalloc work under PREEMPT_RT. The drawback is the pool->migrate_lock is bigger granuarity than per zpage lock so the contention would be higher than old when both IO-related operations(i.e., zsmalloc, zsfree, zs_[map|unmap]) and compaction(page/zpage migration) are going in parallel(*, the migrate_lock is rwlock and IO related functions are all read side lock so there is no contention). However, the write-side is fast enough(dominant overhead is just page copy) so it wouldn't affect much. If the lock granurity becomes more problem later, we could introduce table locks based on handle as a hash value. Link: https://lkml.kernel.org/r/20211115185909.3949505-9-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 205 +++++++++++++++++++++++++++------------------------------- 1 file changed, 96 insertions(+), 109 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c150cea15a6c..1b26fa6b9fa8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -30,6 +30,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +/* + * lock ordering: + * page_lock + * pool->migrate_lock + * class->lock + * zspage->lock + */ + #include #include #include @@ -100,15 +108,6 @@ #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) -/* - * Memory for allocating for handle keeps object position by - * encoding and the encoded value has a room - * in least bit(ie, look at obj_to_location). - * We use the bit to synchronize between object access by - * user and migration. - */ -#define HANDLE_PIN_BIT 0 - /* * Head in allocated object should have OBJ_ALLOCATED_TAG * to identify the object was allocated or not. @@ -255,6 +254,8 @@ struct zs_pool { struct inode *inode; struct work_struct free_work; #endif + /* protect page/zspage migration */ + rwlock_t migrate_lock; }; struct zspage { @@ -297,6 +298,9 @@ static void zs_unregister_migration(struct zs_pool *pool); static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); +static void migrate_write_lock(struct zspage *zspage); +static void migrate_write_lock_nested(struct zspage *zspage); +static void migrate_write_unlock(struct zspage *zspage); static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); @@ -308,6 +312,9 @@ static void zs_unregister_migration(struct zs_pool *pool) {} static void migrate_lock_init(struct zspage *zspage) {} static void migrate_read_lock(struct zspage *zspage) {} static void migrate_read_unlock(struct zspage *zspage) {} +static void migrate_write_lock(struct zspage *zspage) {} +static void migrate_write_lock_nested(struct zspage *zspage) {} +static void migrate_write_unlock(struct zspage *zspage) {} static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} @@ -359,14 +366,10 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } +/* class->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { - /* - * lsb of @obj represents handle lock while other bits - * represent object value the handle is pointing so - * updating shouldn't do store tearing. - */ - WRITE_ONCE(*(unsigned long *)handle, obj); + *(unsigned long *)handle = obj; } /* zpool driver */ @@ -880,26 +883,6 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) return true; } -static inline int testpin_tag(unsigned long handle) -{ - return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static inline int trypin_tag(unsigned long handle) -{ - return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static void pin_tag(unsigned long handle) __acquires(bitlock) -{ - bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static void unpin_tag(unsigned long handle) __releases(bitlock) -{ - bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - static void reset_page(struct page *page) { __ClearPageMovable(page); @@ -968,6 +951,11 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(list_empty(&zspage->list)); + /* + * Since zs_free couldn't be sleepable, this function cannot call + * lock_page. The page locks trylock_zspage got will be released + * by __free_zspage. + */ if (!trylock_zspage(zspage)) { kick_deferred_free(pool); return; @@ -1263,15 +1251,20 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); - /* From now on, migration cannot move the object */ - pin_tag(handle); - + /* It guarantees it can get zspage from handle safely */ + read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - /* migration cannot move any subpage in this zspage */ + /* + * migration cannot move any zpages in this zspage. Here, class->lock + * is too heavy since callers would take some time until they calls + * zs_unmap_object API so delegate the locking from class to zspage + * which is smaller granularity. + */ migrate_read_lock(zspage); + read_unlock(&pool->migrate_lock); class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; @@ -1330,7 +1323,6 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) put_cpu_var(zs_map_area); migrate_read_unlock(zspage); - unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); @@ -1424,6 +1416,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; + /* class->lock effectively protects the zpage migration */ spin_lock(&class->lock); zspage = find_get_zspage(class); if (likely(zspage)) { @@ -1501,30 +1494,27 @@ void zs_free(struct zs_pool *pool, unsigned long handle) if (unlikely(!handle)) return; - pin_tag(handle); + /* + * The pool->migrate_lock protects the race with zpage's migration + * so it's safe to get the page from handle. + */ + read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); obj_to_page(obj, &f_page); zspage = get_zspage(f_page); - - migrate_read_lock(zspage); class = zspage_class(pool, zspage); - spin_lock(&class->lock); + read_unlock(&pool->migrate_lock); + obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); fullness = fix_fullness_group(class, zspage); - if (fullness != ZS_EMPTY) { - migrate_read_unlock(zspage); + if (fullness != ZS_EMPTY) goto out; - } - migrate_read_unlock(zspage); - /* If zspage is isolated, zs_page_putback will free the zspage */ free_zspage(pool, class, zspage); out: - spin_unlock(&class->lock); - unpin_tag(handle); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1608,11 +1598,8 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_allocated(page, addr + offset, &handle)) { - if (trypin_tag(handle)) - break; - handle = 0; - } + if (obj_allocated(page, addr + offset, &handle)) + break; offset += class->size; index++; @@ -1658,7 +1645,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, /* Stop if there is no more space */ if (zspage_full(class, get_zspage(d_page))) { - unpin_tag(handle); ret = -ENOMEM; break; } @@ -1667,15 +1653,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj = obj_malloc(pool, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; - /* - * record_obj updates handle's value to free_obj and it will - * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which - * breaks synchronization using pin_tag(e,g, zs_free) so - * let's keep the lock bit. - */ - free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); - unpin_tag(handle); obj_free(class->size, used_obj); } @@ -1789,6 +1767,11 @@ static void migrate_write_lock(struct zspage *zspage) write_lock(&zspage->lock); } +static void migrate_write_lock_nested(struct zspage *zspage) +{ + write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); +} + static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); @@ -1856,11 +1839,10 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; - int offset, pos; + int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - int ret = -EAGAIN; /* * We cannot support the _NO_COPY case here, because copy needs to @@ -1873,32 +1855,25 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); - zspage = get_zspage(page); - - /* Concurrent compactor cannot migrate any subpage in zspage */ - migrate_write_lock(zspage); pool = mapping->private_data; + + /* + * The pool migrate_lock protects the race between zpage migration + * and zs_free. + */ + write_lock(&pool->migrate_lock); + zspage = get_zspage(page); class = zspage_class(pool, zspage); - offset = get_first_obj_offset(page); + /* + * the class lock protects zpage alloc/free in the zspage. + */ spin_lock(&class->lock); - if (!get_zspage_inuse(zspage)) { - /* - * Set "offset" to end of the page so that every loops - * skips unnecessary object scanning. - */ - offset = PAGE_SIZE; - } + /* the migrate_write_lock protects zpage access via zs_map_object */ + migrate_write_lock(zspage); - pos = offset; + offset = get_first_obj_offset(page); s_addr = kmap_atomic(page); - while (pos < PAGE_SIZE) { - if (obj_allocated(page, s_addr + pos, &handle)) { - if (!trypin_tag(handle)) - goto unpin_objects; - } - pos += class->size; - } /* * Here, any user cannot access all objects in the zspage so let's move. @@ -1907,25 +1882,30 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, memcpy(d_addr, s_addr, PAGE_SIZE); kunmap_atomic(d_addr); - for (addr = s_addr + offset; addr < s_addr + pos; + for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { if (obj_allocated(page, addr, &handle)) { - BUG_ON(!testpin_tag(handle)); old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); new_obj = (unsigned long)location_to_obj(newpage, obj_idx); - new_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, new_obj); } } + kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); - get_page(newpage); - + /* + * Since we complete the data copy and set up new zspage structure, + * it's okay to release migration_lock. + */ + write_unlock(&pool->migrate_lock); + spin_unlock(&class->lock); dec_zspage_isolation(zspage); + migrate_write_unlock(zspage); + get_page(newpage); if (page_zone(newpage) != page_zone(page)) { dec_zone_page_state(page, NR_ZSPAGES); inc_zone_page_state(newpage, NR_ZSPAGES); @@ -1933,22 +1913,8 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, reset_page(page); put_page(page); - page = newpage; - - ret = MIGRATEPAGE_SUCCESS; -unpin_objects: - for (addr = s_addr + offset; addr < s_addr + pos; - addr += class->size) { - if (obj_allocated(page, addr, &handle)) { - BUG_ON(!testpin_tag(handle)); - unpin_tag(handle); - } - } - kunmap_atomic(s_addr); - spin_unlock(&class->lock); - migrate_write_unlock(zspage); - return ret; + return MIGRATEPAGE_SUCCESS; } static void zs_page_putback(struct page *page) @@ -2077,8 +2043,13 @@ static unsigned long __zs_compact(struct zs_pool *pool, struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; + /* protect the race between zpage migration and zs_free */ + write_lock(&pool->migrate_lock); + /* protect zpage allocation/free */ spin_lock(&class->lock); while ((src_zspage = isolate_zspage(class, true))) { + /* protect someone accessing the zspage(i.e., zs_map_object) */ + migrate_write_lock(src_zspage); if (!zs_can_compact(class)) break; @@ -2087,6 +2058,8 @@ static unsigned long __zs_compact(struct zs_pool *pool, cc.s_page = get_first_page(src_zspage); while ((dst_zspage = isolate_zspage(class, false))) { + migrate_write_lock_nested(dst_zspage); + cc.d_page = get_first_page(dst_zspage); /* * If there is no more space in dst_page, resched @@ -2096,6 +2069,10 @@ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + dst_zspage = NULL; + if (rwlock_is_contended(&pool->migrate_lock)) + break; } /* Stop if we couldn't find slot */ @@ -2103,19 +2080,28 @@ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + if (putback_zspage(class, src_zspage) == ZS_EMPTY) { + migrate_write_unlock(src_zspage); free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; - } + } else + migrate_write_unlock(src_zspage); spin_unlock(&class->lock); + write_unlock(&pool->migrate_lock); cond_resched(); + write_lock(&pool->migrate_lock); spin_lock(&class->lock); } - if (src_zspage) + if (src_zspage) { putback_zspage(class, src_zspage); + migrate_write_unlock(src_zspage); + } spin_unlock(&class->lock); + write_unlock(&pool->migrate_lock); return pages_freed; } @@ -2221,6 +2207,7 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); + rwlock_init(&pool->migrate_lock); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) -- cgit v1.2.3-58-ga151 From a37265995c867a4e413761d846cef0445b08d6d5 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 21 Jan 2022 22:14:17 -0800 Subject: zsmalloc: replace get_cpu_var with local_lock The usage of get_cpu_var() in zs_map_object() is problematic because it disables preemption and makes it impossible to acquire any sleeping lock on PREEMPT_RT such as a spinlock_t. Replace the get_cpu_var() usage with a local_lock_t which is embedded struct mapping_area. It ensures that the access the struct is synchronized against all users on the same CPU. [minchan: remove the bit_spin_lock part and change the title] Link: https://lkml.kernel.org/r/20211115185909.3949505-10-minchan@kernel.org Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Minchan Kim Tested-by: Sebastian Andrzej Siewior Cc: Peter Zijlstra (Intel) Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 1b26fa6b9fa8..9152fbde33b5 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -65,6 +65,7 @@ #include #include #include +#include #define ZSPAGE_MAGIC 0x58 @@ -276,6 +277,7 @@ struct zspage { }; struct mapping_area { + local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ @@ -451,7 +453,9 @@ MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area); +static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static __maybe_unused int is_first_page(struct page *page) { @@ -1269,7 +1273,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; - area = &get_cpu_var(zs_map_area); + local_lock(&zs_map_area.lock); + area = this_cpu_ptr(&zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ @@ -1320,7 +1325,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } - put_cpu_var(zs_map_area); + local_unlock(&zs_map_area.lock); migrate_read_unlock(zspage); } -- cgit v1.2.3-58-ga151 From 6dfbbae14a7b961f41d80a106e1ab60e86d061c5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 21 Jan 2022 22:14:20 -0800 Subject: fs: proc: store PDE()->data into inode->i_private PDE_DATA(inode) is introduced to get user private data and hide the layout of struct proc_dir_entry. The inode->i_private is used to do the same thing as well. Save a copy of user private data to inode-> i_private when proc inode is allocated. This means the user also can get their private data by inode->i_private. Introduce pde_data() to wrap inode->i_private so that we can remove PDE_DATA() from fs/proc/generic.c and make PTE_DATE() as a wrapper of pde_data(). It will be easier if we decide to remove PDE_DATE() in the future. Link: https://lkml.kernel.org/r/20211124081956.87711-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Christian Brauner Cc: Alexey Dobriyan Cc: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 6 ------ fs/proc/inode.c | 1 + fs/proc/internal.h | 5 ----- include/linux/proc_fs.h | 13 ++++++++++++- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5b78739e60e4..f2132407e133 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -791,12 +791,6 @@ void proc_remove(struct proc_dir_entry *de) } EXPORT_SYMBOL(proc_remove); -void *PDE_DATA(const struct inode *inode) -{ - return __PDE_DATA(inode); -} -EXPORT_SYMBOL(PDE_DATA); - /* * Pull a user buffer into memory and pass it to the file's write handler if * one is supplied. The ->write() method is permitted to modify the diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 599eb724ff2d..f84355c5a36d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -650,6 +650,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) return NULL; } + inode->i_private = de->data; inode->i_ino = de->low_ino; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); PROC_I(inode)->pde = de; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 03415f3fb3a8..06a80f78433d 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -115,11 +115,6 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode) return PROC_I(inode)->pde; } -static inline void *__PDE_DATA(const struct inode *inode) -{ - return PDE(inode)->data; -} - static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 01b9268451a8..b6e7005cc1b2 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -110,7 +110,18 @@ extern struct proc_dir_entry *proc_create_data(const char *, umode_t, struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops); extern void proc_set_size(struct proc_dir_entry *, loff_t); extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t); -extern void *PDE_DATA(const struct inode *); + +/* + * Obtain the private data passed by user through proc_create_data() or + * related. + */ +static inline void *pde_data(const struct inode *inode) +{ + return inode->i_private; +} + +#define PDE_DATA(i) pde_data(i) + extern void *proc_get_parent_data(const struct inode *); extern void proc_remove(struct proc_dir_entry *); extern void remove_proc_entry(const char *, struct proc_dir_entry *); -- cgit v1.2.3-58-ga151 From 359745d78351c6f5442435f81549f0207ece28aa Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 21 Jan 2022 22:14:23 -0800 Subject: proc: remove PDE_DATA() completely Remove PDE_DATA() completely and replace it with pde_data(). [akpm@linux-foundation.org: fix naming clash in drivers/nubus/proc.c] [akpm@linux-foundation.org: now fix it properly] Link: https://lkml.kernel.org/r/20211124081956.87711-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Christian Brauner Cc: Alexey Dobriyan Cc: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/srm_env.c | 4 +-- arch/arm/kernel/atags_proc.c | 2 +- arch/ia64/kernel/salinfo.c | 10 +++--- arch/powerpc/kernel/proc_powerpc.c | 4 +-- arch/sh/mm/alignment.c | 2 +- arch/xtensa/platforms/iss/simdisk.c | 4 +-- drivers/acpi/proc.c | 2 +- drivers/hwmon/dell-smm-hwmon.c | 4 +-- drivers/net/bonding/bond_procfs.c | 8 ++--- drivers/net/wireless/cisco/airo.c | 22 ++++++------- drivers/net/wireless/intersil/hostap/hostap_ap.c | 16 +++++----- .../net/wireless/intersil/hostap/hostap_download.c | 2 +- drivers/net/wireless/intersil/hostap/hostap_proc.c | 24 +++++++-------- drivers/net/wireless/ray_cs.c | 2 +- drivers/nubus/proc.c | 36 +++++++++++----------- drivers/parisc/led.c | 4 +-- drivers/pci/proc.c | 10 +++--- drivers/platform/x86/thinkpad_acpi.c | 4 +-- drivers/platform/x86/toshiba_acpi.c | 16 +++++----- drivers/pnp/isapnp/proc.c | 2 +- drivers/pnp/pnpbios/proc.c | 4 +-- drivers/scsi/scsi_proc.c | 4 +-- drivers/usb/gadget/function/rndis.c | 4 +-- drivers/zorro/proc.c | 2 +- fs/afs/proc.c | 6 ++-- fs/ext4/mballoc.c | 14 ++++----- fs/jbd2/journal.c | 2 +- fs/proc/proc_net.c | 8 ++--- include/linux/proc_fs.h | 4 +-- include/linux/seq_file.h | 2 +- ipc/util.c | 2 +- kernel/irq/proc.c | 8 ++--- kernel/resource.c | 4 +-- net/atm/proc.c | 4 +-- net/bluetooth/af_bluetooth.c | 8 ++--- net/can/bcm.c | 2 +- net/can/proc.c | 2 +- net/core/neighbour.c | 6 ++-- net/core/pktgen.c | 6 ++-- net/ipv4/netfilter/ipt_CLUSTERIP.c | 6 ++-- net/ipv4/raw.c | 8 ++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 6 ++-- net/netfilter/x_tables.c | 10 +++--- net/netfilter/xt_hashlimit.c | 18 +++++------ net/netfilter/xt_recent.c | 4 +-- net/sunrpc/auth_gss/svcauth_gss.c | 4 +-- net/sunrpc/cache.c | 24 +++++++-------- net/sunrpc/stats.c | 2 +- sound/core/info.c | 4 +-- 50 files changed, 178 insertions(+), 180 deletions(-) diff --git a/arch/alpha/kernel/srm_env.c b/arch/alpha/kernel/srm_env.c index 528d2be58182..217b4dca51dd 100644 --- a/arch/alpha/kernel/srm_env.c +++ b/arch/alpha/kernel/srm_env.c @@ -83,14 +83,14 @@ static int srm_env_proc_show(struct seq_file *m, void *v) static int srm_env_proc_open(struct inode *inode, struct file *file) { - return single_open(file, srm_env_proc_show, PDE_DATA(inode)); + return single_open(file, srm_env_proc_show, pde_data(inode)); } static ssize_t srm_env_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { int res; - unsigned long id = (unsigned long)PDE_DATA(file_inode(file)); + unsigned long id = (unsigned long)pde_data(file_inode(file)); char *buf = (char *) __get_free_page(GFP_USER); unsigned long ret1, ret2; diff --git a/arch/arm/kernel/atags_proc.c b/arch/arm/kernel/atags_proc.c index 3c2faf2bd124..3ec2afe78423 100644 --- a/arch/arm/kernel/atags_proc.c +++ b/arch/arm/kernel/atags_proc.c @@ -13,7 +13,7 @@ struct buffer { static ssize_t atags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct buffer *b = PDE_DATA(file_inode(file)); + struct buffer *b = pde_data(file_inode(file)); return simple_read_from_buffer(buf, count, ppos, b->data, b->size); } diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index a25ab9b37953..bd3ba276e69c 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -282,7 +282,7 @@ salinfo_event_open(struct inode *inode, struct file *file) static ssize_t salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { - struct salinfo_data *data = PDE_DATA(file_inode(file)); + struct salinfo_data *data = pde_data(file_inode(file)); char cmd[32]; size_t size; int i, n, cpu = -1; @@ -340,7 +340,7 @@ static const struct proc_ops salinfo_event_proc_ops = { static int salinfo_log_open(struct inode *inode, struct file *file) { - struct salinfo_data *data = PDE_DATA(inode); + struct salinfo_data *data = pde_data(inode); if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -365,7 +365,7 @@ salinfo_log_open(struct inode *inode, struct file *file) static int salinfo_log_release(struct inode *inode, struct file *file) { - struct salinfo_data *data = PDE_DATA(inode); + struct salinfo_data *data = pde_data(inode); if (data->state == STATE_NO_DATA) { vfree(data->log_buffer); @@ -433,7 +433,7 @@ retry: static ssize_t salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { - struct salinfo_data *data = PDE_DATA(file_inode(file)); + struct salinfo_data *data = pde_data(file_inode(file)); u8 *buf; u64 bufsize; @@ -494,7 +494,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu) static ssize_t salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - struct salinfo_data *data = PDE_DATA(file_inode(file)); + struct salinfo_data *data = pde_data(file_inode(file)); char cmd[32]; size_t size; u32 offset; diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c index 877817471e3c..6a029f2378e1 100644 --- a/arch/powerpc/kernel/proc_powerpc.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -25,7 +25,7 @@ static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes loff_t *ppos) { return simple_read_from_buffer(buf, nbytes, ppos, - PDE_DATA(file_inode(file)), PAGE_SIZE); + pde_data(file_inode(file)), PAGE_SIZE); } static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) @@ -34,7 +34,7 @@ static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) return -EINVAL; remap_pfn_range(vma, vma->vm_start, - __pa(PDE_DATA(file_inode(file))) >> PAGE_SHIFT, + __pa(pde_data(file_inode(file))) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); return 0; } diff --git a/arch/sh/mm/alignment.c b/arch/sh/mm/alignment.c index d802801ceb93..3a76a766f423 100644 --- a/arch/sh/mm/alignment.c +++ b/arch/sh/mm/alignment.c @@ -140,7 +140,7 @@ static int alignment_proc_open(struct inode *inode, struct file *file) static ssize_t alignment_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - int *data = PDE_DATA(file_inode(file)); + int *data = pde_data(file_inode(file)); char mode; if (count > 0) { diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 07b642c1916a..8eb6ad1a3a1d 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -208,7 +208,7 @@ static int simdisk_detach(struct simdisk *dev) static ssize_t proc_read_simdisk(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct simdisk *dev = PDE_DATA(file_inode(file)); + struct simdisk *dev = pde_data(file_inode(file)); const char *s = dev->filename; if (s) { ssize_t n = simple_read_from_buffer(buf, size, ppos, @@ -225,7 +225,7 @@ static ssize_t proc_write_simdisk(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *tmp = memdup_user_nul(buf, count); - struct simdisk *dev = PDE_DATA(file_inode(file)); + struct simdisk *dev = pde_data(file_inode(file)); int err; if (IS_ERR(tmp)) diff --git a/drivers/acpi/proc.c b/drivers/acpi/proc.c index 0cca7991f186..4322f2da6d10 100644 --- a/drivers/acpi/proc.c +++ b/drivers/acpi/proc.c @@ -127,7 +127,7 @@ static int acpi_system_wakeup_device_open_fs(struct inode *inode, struct file *file) { return single_open(file, acpi_system_wakeup_device_seq_show, - PDE_DATA(inode)); + pde_data(inode)); } static const struct proc_ops acpi_system_wakeup_device_proc_ops = { diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index d401f9acf450..9949eeb79378 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -451,7 +451,7 @@ static int i8k_get_power_status(void) static long i8k_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { - struct dell_smm_data *data = PDE_DATA(file_inode(fp)); + struct dell_smm_data *data = pde_data(file_inode(fp)); int __user *argp = (int __user *)arg; int speed, err; int val = 0; @@ -585,7 +585,7 @@ static int i8k_proc_show(struct seq_file *seq, void *offset) static int i8k_open_fs(struct inode *inode, struct file *file) { - return single_open(file, i8k_proc_show, PDE_DATA(inode)); + return single_open(file, i8k_proc_show, pde_data(inode)); } static const struct proc_ops i8k_proc_ops = { diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 2ec11af5f0cc..46b150e6289e 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -11,7 +11,7 @@ static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; loff_t off = 0; @@ -30,7 +30,7 @@ static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; bool found = false; @@ -57,7 +57,7 @@ static void bond_info_seq_stop(struct seq_file *seq, void *v) static void bond_info_show_master(struct seq_file *seq) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); const struct bond_opt_value *optval; struct slave *curr, *primary; int i; @@ -175,7 +175,7 @@ static void bond_info_show_master(struct seq_file *seq) static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index 45594f003ef7..452d08545d31 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -4672,7 +4672,7 @@ static ssize_t proc_write(struct file *file, static int proc_status_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *apriv = dev->ml_priv; CapabilityRid cap_rid; StatusRid status_rid; @@ -4756,7 +4756,7 @@ static int proc_stats_rid_open(struct inode *inode, u16 rid) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *apriv = dev->ml_priv; StatsRid stats; int i, j; @@ -4819,7 +4819,7 @@ static inline int sniffing_mode(struct airo_info *ai) static void proc_config_on_close(struct inode *inode, struct file *file) { struct proc_data *data = file->private_data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; char *line; @@ -5030,7 +5030,7 @@ static const char *get_rmode(__le16 mode) static int proc_config_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i; __le16 mode; @@ -5120,7 +5120,7 @@ static int proc_config_open(struct inode *inode, struct file *file) static void proc_SSID_on_close(struct inode *inode, struct file *file) { struct proc_data *data = file->private_data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; SsidRid SSID_rid; int i; @@ -5156,7 +5156,7 @@ static void proc_SSID_on_close(struct inode *inode, struct file *file) static void proc_APList_on_close(struct inode *inode, struct file *file) { struct proc_data *data = file->private_data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; APListRid *APList_rid = &ai->APList; int i; @@ -5280,7 +5280,7 @@ static int set_wep_tx_idx(struct airo_info *ai, u16 index, int perm, int lock) static void proc_wepkey_on_close(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i, rc; char key[16]; @@ -5331,7 +5331,7 @@ static void proc_wepkey_on_close(struct inode *inode, struct file *file) static int proc_wepkey_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; char *ptr; WepKeyRid wkr; @@ -5379,7 +5379,7 @@ static int proc_wepkey_open(struct inode *inode, struct file *file) static int proc_SSID_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i; char *ptr; @@ -5423,7 +5423,7 @@ static int proc_SSID_open(struct inode *inode, struct file *file) static int proc_APList_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i; char *ptr; @@ -5462,7 +5462,7 @@ static int proc_APList_open(struct inode *inode, struct file *file) static int proc_BSSList_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; char *ptr; BSSListRid BSSList_rid; diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c index 8bcc1cdcb75b..462ccc7d7d1a 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_ap.c +++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c @@ -69,7 +69,7 @@ static void prism2_send_mgmt(struct net_device *dev, #if !defined(PRISM2_NO_PROCFS_DEBUG) && defined(CONFIG_PROC_FS) static int ap_debug_proc_show(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); seq_printf(m, "BridgedUnicastFrames=%u\n", ap->bridged_unicast); seq_printf(m, "BridgedMulticastFrames=%u\n", ap->bridged_multicast); @@ -320,7 +320,7 @@ void hostap_deauth_all_stas(struct net_device *dev, struct ap_data *ap, static int ap_control_proc_show(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); char *policy_txt; struct mac_entry *entry; @@ -352,20 +352,20 @@ static int ap_control_proc_show(struct seq_file *m, void *v) static void *ap_control_proc_start(struct seq_file *m, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_lock_bh(&ap->mac_restrictions.lock); return seq_list_start_head(&ap->mac_restrictions.mac_list, *_pos); } static void *ap_control_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); return seq_list_next(v, &ap->mac_restrictions.mac_list, _pos); } static void ap_control_proc_stop(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_unlock_bh(&ap->mac_restrictions.lock); } @@ -554,20 +554,20 @@ static int prism2_ap_proc_show(struct seq_file *m, void *v) static void *prism2_ap_proc_start(struct seq_file *m, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_lock_bh(&ap->sta_table_lock); return seq_list_start_head(&ap->sta_list, *_pos); } static void *prism2_ap_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); return seq_list_next(v, &ap->sta_list, _pos); } static void prism2_ap_proc_stop(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_unlock_bh(&ap->sta_table_lock); } diff --git a/drivers/net/wireless/intersil/hostap/hostap_download.c b/drivers/net/wireless/intersil/hostap/hostap_download.c index 7c6a5a6d1d45..3672291ced5c 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_download.c +++ b/drivers/net/wireless/intersil/hostap/hostap_download.c @@ -227,7 +227,7 @@ static int prism2_download_aux_dump_proc_open(struct inode *inode, struct file * sizeof(struct prism2_download_aux_dump)); if (ret == 0) { struct seq_file *m = file->private_data; - m->private = PDE_DATA(inode); + m->private = pde_data(inode); } return ret; } diff --git a/drivers/net/wireless/intersil/hostap/hostap_proc.c b/drivers/net/wireless/intersil/hostap/hostap_proc.c index 51c847d98755..61f68786056f 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_proc.c +++ b/drivers/net/wireless/intersil/hostap/hostap_proc.c @@ -97,20 +97,20 @@ static int prism2_wds_proc_show(struct seq_file *m, void *v) static void *prism2_wds_proc_start(struct seq_file *m, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); read_lock_bh(&local->iface_lock); return seq_list_start(&local->hostap_interfaces, *_pos); } static void *prism2_wds_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); return seq_list_next(v, &local->hostap_interfaces, _pos); } static void prism2_wds_proc_stop(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); read_unlock_bh(&local->iface_lock); } @@ -123,7 +123,7 @@ static const struct seq_operations prism2_wds_proc_seqops = { static int prism2_bss_list_proc_show(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); struct list_head *ptr = v; struct hostap_bss_info *bss; @@ -151,21 +151,21 @@ static int prism2_bss_list_proc_show(struct seq_file *m, void *v) static void *prism2_bss_list_proc_start(struct seq_file *m, loff_t *_pos) __acquires(&local->lock) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_lock_bh(&local->lock); return seq_list_start_head(&local->bss_list, *_pos); } static void *prism2_bss_list_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); return seq_list_next(v, &local->bss_list, _pos); } static void prism2_bss_list_proc_stop(struct seq_file *m, void *v) __releases(&local->lock) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_unlock_bh(&local->lock); } @@ -198,7 +198,7 @@ static int prism2_crypt_proc_show(struct seq_file *m, void *v) static ssize_t prism2_pda_proc_read(struct file *file, char __user *buf, size_t count, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(file)); + local_info_t *local = pde_data(file_inode(file)); size_t off; if (local->pda == NULL || *_pos >= PRISM2_PDA_SIZE) @@ -272,7 +272,7 @@ static int prism2_io_debug_proc_read(char *page, char **start, off_t off, #ifndef PRISM2_NO_STATION_MODES static int prism2_scan_results_proc_show(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); unsigned long entry; int i, len; struct hfa384x_hostscan_result *scanres; @@ -322,7 +322,7 @@ static int prism2_scan_results_proc_show(struct seq_file *m, void *v) static void *prism2_scan_results_proc_start(struct seq_file *m, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_lock_bh(&local->lock); /* We have a header (pos 0) + N results to show (pos 1...N) */ @@ -333,7 +333,7 @@ static void *prism2_scan_results_proc_start(struct seq_file *m, loff_t *_pos) static void *prism2_scan_results_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); ++*_pos; if (*_pos > local->last_scan_results_count) @@ -343,7 +343,7 @@ static void *prism2_scan_results_proc_next(struct seq_file *m, void *v, loff_t * static void prism2_scan_results_proc_stop(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_unlock_bh(&local->lock); } diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index e3a3dc3e45b4..2987ad9271f6 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -2746,7 +2746,7 @@ static ssize_t int_proc_write(struct file *file, const char __user *buffer, nr = nr * 10 + c; p++; } while (--len); - *(int *)PDE_DATA(file_inode(file)) = nr; + *(int *)pde_data(file_inode(file)) = nr; return count; } diff --git a/drivers/nubus/proc.c b/drivers/nubus/proc.c index 88e1f9a0faaf..1fd667852271 100644 --- a/drivers/nubus/proc.c +++ b/drivers/nubus/proc.c @@ -93,30 +93,30 @@ struct nubus_proc_pde_data { static struct nubus_proc_pde_data * nubus_proc_alloc_pde_data(unsigned char *ptr, unsigned int size) { - struct nubus_proc_pde_data *pde_data; + struct nubus_proc_pde_data *pded; - pde_data = kmalloc(sizeof(*pde_data), GFP_KERNEL); - if (!pde_data) + pded = kmalloc(sizeof(*pded), GFP_KERNEL); + if (!pded) return NULL; - pde_data->res_ptr = ptr; - pde_data->res_size = size; - return pde_data; + pded->res_ptr = ptr; + pded->res_size = size; + return pded; } static int nubus_proc_rsrc_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct nubus_proc_pde_data *pde_data; + struct nubus_proc_pde_data *pded; - pde_data = PDE_DATA(inode); - if (!pde_data) + pded = pde_data(inode); + if (!pded) return 0; - if (pde_data->res_size > m->size) + if (pded->res_size > m->size) return -EFBIG; - if (pde_data->res_size) { + if (pded->res_size) { int lanes = (int)proc_get_parent_data(inode); struct nubus_dirent ent; @@ -124,11 +124,11 @@ static int nubus_proc_rsrc_show(struct seq_file *m, void *v) return 0; ent.mask = lanes; - ent.base = pde_data->res_ptr; + ent.base = pded->res_ptr; ent.data = 0; - nubus_seq_write_rsrc_mem(m, &ent, pde_data->res_size); + nubus_seq_write_rsrc_mem(m, &ent, pded->res_size); } else { - unsigned int data = (unsigned int)pde_data->res_ptr; + unsigned int data = (unsigned int)pded->res_ptr; seq_putc(m, data >> 16); seq_putc(m, data >> 8); @@ -142,18 +142,18 @@ void nubus_proc_add_rsrc_mem(struct proc_dir_entry *procdir, unsigned int size) { char name[9]; - struct nubus_proc_pde_data *pde_data; + struct nubus_proc_pde_data *pded; if (!procdir) return; snprintf(name, sizeof(name), "%x", ent->type); if (size) - pde_data = nubus_proc_alloc_pde_data(nubus_dirptr(ent), size); + pded = nubus_proc_alloc_pde_data(nubus_dirptr(ent), size); else - pde_data = NULL; + pded = NULL; proc_create_single_data(name, S_IFREG | 0444, procdir, - nubus_proc_rsrc_show, pde_data); + nubus_proc_rsrc_show, pded); } void nubus_proc_add_rsrc(struct proc_dir_entry *procdir, diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index cf91cb024be3..1e4a5663d011 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -168,14 +168,14 @@ static int led_proc_show(struct seq_file *m, void *v) static int led_proc_open(struct inode *inode, struct file *file) { - return single_open(file, led_proc_show, PDE_DATA(inode)); + return single_open(file, led_proc_show, pde_data(inode)); } static ssize_t led_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - void *data = PDE_DATA(file_inode(file)); + void *data = pde_data(file_inode(file)); char *cur, lbuf[32]; int d; diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index cb18f8a13ab6..9c7edec64f7e 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -21,14 +21,14 @@ static int proc_initialized; /* = 0 */ static loff_t proc_bus_pci_lseek(struct file *file, loff_t off, int whence) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); return fixed_size_llseek(file, off, whence, dev->cfg_size); } static ssize_t proc_bus_pci_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); unsigned int pos = *ppos; unsigned int cnt, size; @@ -114,7 +114,7 @@ static ssize_t proc_bus_pci_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { struct inode *ino = file_inode(file); - struct pci_dev *dev = PDE_DATA(ino); + struct pci_dev *dev = pde_data(ino); int pos = *ppos; int size = dev->cfg_size; int cnt, ret; @@ -196,7 +196,7 @@ struct pci_filp_private { static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); #ifdef HAVE_PCI_MMAP struct pci_filp_private *fpriv = file->private_data; #endif /* HAVE_PCI_MMAP */ @@ -244,7 +244,7 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, #ifdef HAVE_PCI_MMAP static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); struct pci_filp_private *fpriv = file->private_data; int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 82fa6148216c..098180fb1cfc 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -880,14 +880,14 @@ static int dispatch_proc_show(struct seq_file *m, void *v) static int dispatch_proc_open(struct inode *inode, struct file *file) { - return single_open(file, dispatch_proc_show, PDE_DATA(inode)); + return single_open(file, dispatch_proc_show, pde_data(inode)); } static ssize_t dispatch_proc_write(struct file *file, const char __user *userbuf, size_t count, loff_t *pos) { - struct ibm_struct *ibm = PDE_DATA(file_inode(file)); + struct ibm_struct *ibm = pde_data(file_inode(file)); char *kernbuf; int ret; diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 352508d30467..f113dec98e21 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -1368,7 +1368,7 @@ static int lcd_proc_show(struct seq_file *m, void *v) static int lcd_proc_open(struct inode *inode, struct file *file) { - return single_open(file, lcd_proc_show, PDE_DATA(inode)); + return single_open(file, lcd_proc_show, pde_data(inode)); } static int set_lcd_brightness(struct toshiba_acpi_dev *dev, int value) @@ -1404,7 +1404,7 @@ static int set_lcd_status(struct backlight_device *bd) static ssize_t lcd_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char cmd[42]; size_t len; int levels; @@ -1469,13 +1469,13 @@ static int video_proc_show(struct seq_file *m, void *v) static int video_proc_open(struct inode *inode, struct file *file) { - return single_open(file, video_proc_show, PDE_DATA(inode)); + return single_open(file, video_proc_show, pde_data(inode)); } static ssize_t video_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char *buffer; char *cmd; int lcd_out = -1, crt_out = -1, tv_out = -1; @@ -1580,13 +1580,13 @@ static int fan_proc_show(struct seq_file *m, void *v) static int fan_proc_open(struct inode *inode, struct file *file) { - return single_open(file, fan_proc_show, PDE_DATA(inode)); + return single_open(file, fan_proc_show, pde_data(inode)); } static ssize_t fan_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char cmd[42]; size_t len; int value; @@ -1628,13 +1628,13 @@ static int keys_proc_show(struct seq_file *m, void *v) static int keys_proc_open(struct inode *inode, struct file *file) { - return single_open(file, keys_proc_show, PDE_DATA(inode)); + return single_open(file, keys_proc_show, pde_data(inode)); } static ssize_t keys_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char cmd[42]; size_t len; int value; diff --git a/drivers/pnp/isapnp/proc.c b/drivers/pnp/isapnp/proc.c index 1ae458c02656..55ae72a2818b 100644 --- a/drivers/pnp/isapnp/proc.c +++ b/drivers/pnp/isapnp/proc.c @@ -22,7 +22,7 @@ static loff_t isapnp_proc_bus_lseek(struct file *file, loff_t off, int whence) static ssize_t isapnp_proc_bus_read(struct file *file, char __user * buf, size_t nbytes, loff_t * ppos) { - struct pnp_dev *dev = PDE_DATA(file_inode(file)); + struct pnp_dev *dev = pde_data(file_inode(file)); int pos = *ppos; int cnt, size = 256; diff --git a/drivers/pnp/pnpbios/proc.c b/drivers/pnp/pnpbios/proc.c index a806830e3a40..0f0d819b157f 100644 --- a/drivers/pnp/pnpbios/proc.c +++ b/drivers/pnp/pnpbios/proc.c @@ -173,13 +173,13 @@ static int pnpbios_proc_show(struct seq_file *m, void *v) static int pnpbios_proc_open(struct inode *inode, struct file *file) { - return single_open(file, pnpbios_proc_show, PDE_DATA(inode)); + return single_open(file, pnpbios_proc_show, pde_data(inode)); } static ssize_t pnpbios_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - void *data = PDE_DATA(file_inode(file)); + void *data = pde_data(file_inode(file)); struct pnp_bios_node *node; int boot = (long)data >> 8; u8 nodenum = (long)data; diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c index d6982d355739..95aee1ad1383 100644 --- a/drivers/scsi/scsi_proc.c +++ b/drivers/scsi/scsi_proc.c @@ -49,7 +49,7 @@ static DEFINE_MUTEX(global_host_template_mutex); static ssize_t proc_scsi_host_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct Scsi_Host *shost = PDE_DATA(file_inode(file)); + struct Scsi_Host *shost = pde_data(file_inode(file)); ssize_t ret = -ENOMEM; char *page; @@ -79,7 +79,7 @@ static int proc_scsi_show(struct seq_file *m, void *v) static int proc_scsi_host_open(struct inode *inode, struct file *file) { - return single_open_size(file, proc_scsi_show, PDE_DATA(inode), + return single_open_size(file, proc_scsi_show, pde_data(inode), 4 * PAGE_SIZE); } diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c index 64de9f1b874c..431d5a7d737e 100644 --- a/drivers/usb/gadget/function/rndis.c +++ b/drivers/usb/gadget/function/rndis.c @@ -1117,7 +1117,7 @@ static int rndis_proc_show(struct seq_file *m, void *v) static ssize_t rndis_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - rndis_params *p = PDE_DATA(file_inode(file)); + rndis_params *p = pde_data(file_inode(file)); u32 speed = 0; int i, fl_speed = 0; @@ -1161,7 +1161,7 @@ static ssize_t rndis_proc_write(struct file *file, const char __user *buffer, static int rndis_proc_open(struct inode *inode, struct file *file) { - return single_open(file, rndis_proc_show, PDE_DATA(inode)); + return single_open(file, rndis_proc_show, pde_data(inode)); } static const struct proc_ops rndis_proc_ops = { diff --git a/drivers/zorro/proc.c b/drivers/zorro/proc.c index 1c9ae08225d8..f916bf60b312 100644 --- a/drivers/zorro/proc.c +++ b/drivers/zorro/proc.c @@ -30,7 +30,7 @@ proc_bus_zorro_lseek(struct file *file, loff_t off, int whence) static ssize_t proc_bus_zorro_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct zorro_dev *z = PDE_DATA(file_inode(file)); + struct zorro_dev *z = pde_data(file_inode(file)); struct ConfigDev cd; loff_t pos = *ppos; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 065a28bfa3f1..e1b863449296 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -227,7 +227,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) __acquires(cell->proc_lock) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); rcu_read_lock(); return seq_hlist_start_head_rcu(&cell->proc_volumes, *_pos); @@ -236,7 +236,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); return seq_hlist_next_rcu(v, &cell->proc_volumes, _pos); } @@ -322,7 +322,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) { struct afs_vl_seq_net_private *priv = m->private; struct afs_vlserver_list *vllist; - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cf2fd9fc7d98..9f86dd947032 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2834,7 +2834,7 @@ out: static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) @@ -2845,7 +2845,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; ++*pos; @@ -2857,7 +2857,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err, buddy_loaded = 0; @@ -2985,7 +2985,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) __acquires(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; read_lock(&EXT4_SB(sb)->s_mb_rb_lock); @@ -2998,7 +2998,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; ++*pos; @@ -3010,7 +3010,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; @@ -3058,7 +3058,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) __releases(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0b86a4365b66..f13d548e4a7f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1212,7 +1212,7 @@ static const struct seq_operations jbd2_seq_info_ops = { static int jbd2_seq_info_open(struct inode *inode, struct file *file) { - journal_t *journal = PDE_DATA(inode); + journal_t *journal = pde_data(inode); struct jbd2_stats_proc_session *s; int rc, size; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 39b823ab2564..e1cfeda397f3 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -138,7 +138,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. * @write: The write method with which to 'modify' the file. - * @data: Data for retrieval by PDE_DATA(). + * @data: Data for retrieval by pde_data(). * * Create a network namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a @@ -153,7 +153,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling - * PDE_DATA() on the file inode. The network namespace must be accessed by + * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, @@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single); * @parent: The parent directory in which to create. * @show: The seqfile show method with which to read the file. * @write: The write method with which to 'modify' the file. - * @data: Data for retrieval by PDE_DATA(). + * @data: Data for retrieval by pde_data(). * * Create a network-namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a @@ -245,7 +245,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single); * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling - * PDE_DATA() on the file inode. The network namespace must be accessed by + * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_single_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index b6e7005cc1b2..81d6e4ec2294 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -120,8 +120,6 @@ static inline void *pde_data(const struct inode *inode) return inode->i_private; } -#define PDE_DATA(i) pde_data(i) - extern void *proc_get_parent_data(const struct inode *); extern void proc_remove(struct proc_dir_entry *); extern void remove_proc_entry(const char *, struct proc_dir_entry *); @@ -202,7 +200,7 @@ proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {} static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {} -static inline void *PDE_DATA(const struct inode *inode) {BUG(); return NULL;} +static inline void *pde_data(const struct inode *inode) {BUG(); return NULL;} static inline void *proc_get_parent_data(const struct inode *inode) { BUG(); return NULL; } static inline void proc_remove(struct proc_dir_entry *de) {} diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 72dbb44a4573..88cc16444b43 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -209,7 +209,7 @@ static const struct file_operations __name ## _fops = { \ #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ - return single_open(file, __name ## _show, PDE_DATA(inode)); \ + return single_open(file, __name ## _show, pde_data(inode)); \ } \ \ static const struct proc_ops __name ## _proc_ops = { \ diff --git a/ipc/util.c b/ipc/util.c index fa2d86ef3fb8..a2208d0f26b2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -894,7 +894,7 @@ static int sysvipc_proc_open(struct inode *inode, struct file *file) if (!iter) return -ENOMEM; - iter->iface = PDE_DATA(inode); + iter->iface = pde_data(inode); iter->ns = get_ipc_ns(current->nsproxy->ipc_ns); iter->pid_ns = get_pid_ns(task_active_pid_ns(current)); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ee595ec09778..623b8136e9af 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -137,7 +137,7 @@ static inline int irq_select_affinity_usr(unsigned int irq) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); + unsigned int irq = (int)(long)pde_data(file_inode(file)); cpumask_var_t new_value; int err; @@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file, static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_proc_show, pde_data(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_list_proc_show, pde_data(inode)); } static const struct proc_ops irq_affinity_proc_ops = { @@ -265,7 +265,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, PDE_DATA(inode)); + return single_open(file, default_affinity_show, pde_data(inode)); } static const struct proc_ops default_affinity_proc_ops = { diff --git a/kernel/resource.c b/kernel/resource.c index 5ad3eba619ba..9c08d6e9eef2 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -99,7 +99,7 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = PDE_DATA(file_inode(m->file)); + struct resource *p = pde_data(file_inode(m->file)); loff_t l = 0; read_lock(&resource_lock); for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) @@ -115,7 +115,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct resource *root = PDE_DATA(file_inode(m->file)); + struct resource *root = pde_data(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; diff --git a/net/atm/proc.c b/net/atm/proc.c index 4369ffa3302a..9bf736290e48 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -108,7 +108,7 @@ out: static inline void *vcc_walk(struct seq_file *seq, loff_t l) { struct vcc_state *state = seq->private; - int family = (uintptr_t)(PDE_DATA(file_inode(seq->file))); + int family = (uintptr_t)(pde_data(file_inode(seq->file))); return __vcc_walk(&state->sk, family, &state->bucket, l) ? state : NULL; @@ -324,7 +324,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf, page = get_zeroed_page(GFP_KERNEL); if (!page) return -ENOMEM; - dev = PDE_DATA(file_inode(file)); + dev = pde_data(file_inode(file)); if (!dev->ops->proc_read) length = -EINVAL; else { diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1661979b6a6e..ee319779781e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -611,7 +611,7 @@ EXPORT_SYMBOL(bt_sock_wait_ready); static void *bt_seq_start(struct seq_file *seq, loff_t *pos) __acquires(seq->private->l->lock) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_lock(&l->lock); return seq_hlist_start_head(&l->head, *pos); @@ -619,7 +619,7 @@ static void *bt_seq_start(struct seq_file *seq, loff_t *pos) static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); return seq_hlist_next(v, &l->head, pos); } @@ -627,14 +627,14 @@ static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void bt_seq_stop(struct seq_file *seq, void *v) __releases(seq->private->l->lock) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_unlock(&l->lock); } static int bt_seq_show(struct seq_file *seq, void *v) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk RefCnt Rmem Wmem User Inode Parent"); diff --git a/net/can/bcm.c b/net/can/bcm.c index bc88d901a1c0..95d209b52e6a 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -193,7 +193,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; struct net *net = m->private; - struct sock *sk = (struct sock *)PDE_DATA(m->file->f_inode); + struct sock *sk = (struct sock *)pde_data(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; diff --git a/net/can/proc.c b/net/can/proc.c index b3099f0a3cb8..bbce97825f13 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -305,7 +305,7 @@ static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, static int can_rcvlist_proc_show(struct seq_file *m, void *v) { /* double cast to prevent GCC warning */ - int idx = (int)(long)PDE_DATA(m->file->f_inode); + int idx = (int)(long)pde_data(m->file->f_inode); struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 213cb7b26b7a..6c2016f7f3d1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3364,7 +3364,7 @@ EXPORT_SYMBOL(neigh_seq_stop); static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) @@ -3381,7 +3381,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { @@ -3401,7 +3401,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v) static int neigh_stat_seq_show(struct seq_file *seq, void *v) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 560a5e712dc3..84b62cd7bc57 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -546,7 +546,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, static int pgctrl_open(struct inode *inode, struct file *file) { - return single_open(file, pgctrl_show, PDE_DATA(inode)); + return single_open(file, pgctrl_show, pde_data(inode)); } static const struct proc_ops pktgen_proc_ops = { @@ -1811,7 +1811,7 @@ static ssize_t pktgen_if_write(struct file *file, static int pktgen_if_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_if_show, PDE_DATA(inode)); + return single_open(file, pktgen_if_show, pde_data(inode)); } static const struct proc_ops pktgen_if_proc_ops = { @@ -1948,7 +1948,7 @@ out: static int pktgen_thread_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_thread_show, PDE_DATA(inode)); + return single_open(file, pktgen_thread_show, pde_data(inode)); } static const struct proc_ops pktgen_thread_proc_ops = { diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index b518f20c9a24..f8e176c77d1c 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -776,7 +776,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - struct clusterip_config *c = PDE_DATA(inode); + struct clusterip_config *c = pde_data(inode); sf->private = c; @@ -788,7 +788,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) static int clusterip_proc_release(struct inode *inode, struct file *file) { - struct clusterip_config *c = PDE_DATA(inode); + struct clusterip_config *c = pde_data(inode); int ret; ret = seq_release(inode, file); @@ -802,7 +802,7 @@ static int clusterip_proc_release(struct inode *inode, struct file *file) static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { - struct clusterip_config *c = PDE_DATA(file_inode(file)); + struct clusterip_config *c = pde_data(file_inode(file)); #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; unsigned long nodenum; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a53f256bf9d3..9eb5fc247868 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -971,7 +971,7 @@ struct proto raw_prot = { static struct sock *raw_get_first(struct seq_file *seq) { struct sock *sk; - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; @@ -987,7 +987,7 @@ found: static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); do { @@ -1016,7 +1016,7 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) void *raw_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&h->lock) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); read_lock(&h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -1039,7 +1039,7 @@ EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) __releases(&h->lock) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); read_unlock(&h->lock); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b3f34e366b27..b53476e78c84 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3002,7 +3002,7 @@ static unsigned short seq_file_family(const struct seq_file *seq) #endif /* Iterated from proc fs */ - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); return afinfo->family; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 464590ea922e..090360939401 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2960,7 +2960,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); for (state->bucket = start; state->bucket <= afinfo->udp_table->mask; ++state->bucket) { @@ -2993,7 +2993,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); do { sk = sk_next(sk); @@ -3050,7 +3050,7 @@ void udp_seq_stop(struct seq_file *seq, void *v) if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); if (state->bucket <= afinfo->udp_table->mask) spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 25524e393349..54a489f16b17 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1517,7 +1517,7 @@ EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { - u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; @@ -1529,7 +1529,7 @@ static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; @@ -1540,7 +1540,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void xt_table_seq_stop(struct seq_file *seq, void *v) { - u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u_int8_t af = (unsigned long)pde_data(file_inode(seq->file)); mutex_unlock(&xt[af].mutex); } @@ -1584,7 +1584,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC, [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, }; - uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); + uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; if (ppos != NULL) @@ -1633,7 +1633,7 @@ static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, static void xt_mttg_seq_stop(struct seq_file *seq, void *v) { - uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); + uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; switch (trav->class) { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9c5cfd74a0ee..0859b8f76764 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -1052,7 +1052,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { static void *dl_seq_start(struct seq_file *s, loff_t *pos) __acquires(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket; spin_lock_bh(&htable->lock); @@ -1069,7 +1069,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos) static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; *pos = ++(*bucket); @@ -1083,7 +1083,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) static void dl_seq_stop(struct seq_file *s, void *v) __releases(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; if (!IS_ERR(bucket)) @@ -1125,7 +1125,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1140,7 +1140,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1155,7 +1155,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1169,7 +1169,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_show_v2(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = (unsigned int *)v; struct dsthash_ent *ent; @@ -1183,7 +1183,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v) static int dl_seq_show_v1(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; @@ -1197,7 +1197,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v) static int dl_seq_show(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 0446307516cd..7ddb9a78e3fc 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -551,7 +551,7 @@ static int recent_seq_open(struct inode *inode, struct file *file) if (st == NULL) return -ENOMEM; - st->table = PDE_DATA(inode); + st->table = pde_data(inode); return 0; } @@ -559,7 +559,7 @@ static ssize_t recent_mt_proc_write(struct file *file, const char __user *input, size_t size, loff_t *loff) { - struct recent_table *t = PDE_DATA(file_inode(file)); + struct recent_table *t = pde_data(file_inode(file)); struct recent_entry *e; char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; const char *c = buf; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index b87565b64928..c2ba9d4cd2c7 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1433,7 +1433,7 @@ static bool use_gss_proxy(struct net *net) static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct net *net = PDE_DATA(file_inode(file)); + struct net *net = pde_data(file_inode(file)); char tbuf[20]; unsigned long i; int res; @@ -1461,7 +1461,7 @@ static ssize_t write_gssp(struct file *file, const char __user *buf, static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct net *net = PDE_DATA(file_inode(file)); + struct net *net = pde_data(file_inode(file)); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 59641803472c..bb1177395b99 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1536,7 +1536,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf, static ssize_t cache_read_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_read(filp, buf, count, ppos, cd); } @@ -1544,14 +1544,14 @@ static ssize_t cache_read_procfs(struct file *filp, char __user *buf, static ssize_t cache_write_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_write(filp, buf, count, ppos, cd); } static __poll_t cache_poll_procfs(struct file *filp, poll_table *wait) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_poll(filp, wait, cd); } @@ -1560,21 +1560,21 @@ static long cache_ioctl_procfs(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_ioctl(inode, filp, cmd, arg, cd); } static int cache_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_open(inode, filp, cd); } static int cache_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_release(inode, filp, cd); } @@ -1591,14 +1591,14 @@ static const struct proc_ops cache_channel_proc_ops = { static int content_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return content_open(inode, filp, cd); } static int content_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return content_release(inode, filp, cd); } @@ -1612,14 +1612,14 @@ static const struct proc_ops content_proc_ops = { static int open_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return open_flush(inode, filp, cd); } static int release_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return release_flush(inode, filp, cd); } @@ -1627,7 +1627,7 @@ static int release_flush_procfs(struct inode *inode, struct file *filp) static ssize_t read_flush_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return read_flush(filp, buf, count, ppos, cd); } @@ -1636,7 +1636,7 @@ static ssize_t write_flush_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return write_flush(filp, buf, count, ppos, cd); } diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index c964b48eaaba..52908f9e6eab 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -66,7 +66,7 @@ static int rpc_proc_show(struct seq_file *seq, void *v) { static int rpc_proc_open(struct inode *inode, struct file *file) { - return single_open(file, rpc_proc_show, PDE_DATA(inode)); + return single_open(file, rpc_proc_show, pde_data(inode)); } static const struct proc_ops rpc_proc_ops = { diff --git a/sound/core/info.c b/sound/core/info.c index a451b24199c3..782fba87cc04 100644 --- a/sound/core/info.c +++ b/sound/core/info.c @@ -234,7 +234,7 @@ static int snd_info_entry_mmap(struct file *file, struct vm_area_struct *vma) static int snd_info_entry_open(struct inode *inode, struct file *file) { - struct snd_info_entry *entry = PDE_DATA(inode); + struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int mode, err; @@ -365,7 +365,7 @@ static int snd_info_seq_show(struct seq_file *seq, void *p) static int snd_info_text_entry_open(struct inode *inode, struct file *file) { - struct snd_info_entry *entry = PDE_DATA(inode); + struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int err; -- cgit v1.2.3-58-ga151 From 2dba5eb1c73b6ba2988ced07250edeac0f8cbf5a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 21 Jan 2022 22:14:27 -0800 Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() Currently, enabling CONFIG_STACKDEPOT means its stack_table will be allocated from memblock, even if stack depot ends up not actually used. The default size of stack_table is 4MB on 32-bit, 8MB on 64-bit. This is fine for use-cases such as KASAN which is also a config option and has overhead on its own. But it's an issue for functionality that has to be actually enabled on boot (page_owner) or depends on hardware (GPU drivers) and thus the memory might be wasted. This was raised as an issue [1] when attempting to add stackdepot support for SLUB's debug object tracking functionality. It's common to build kernels with CONFIG_SLUB_DEBUG and enable slub_debug on boot only when needed, or create only specific kmem caches with debugging for testing purposes. It would thus be more efficient if stackdepot's table was allocated only when actually going to be used. This patch thus makes the allocation (and whole stack_depot_init() call) optional: - Add a CONFIG_STACKDEPOT_ALWAYS_INIT flag to keep using the current well-defined point of allocation as part of mem_init(). Make CONFIG_KASAN select this flag. - Other users have to call stack_depot_init() as part of their own init when it's determined that stack depot will actually be used. This may depend on both config and runtime conditions. Convert current users which are page_owner and several in the DRM subsystem. Same will be done for SLUB later. - Because the init might now be called after the boot-time memblock allocation has given all memory to the buddy allocator, change stack_depot_init() to allocate stack_table with kvmalloc() when memblock is no longer available. Also handle allocation failure by disabling stackdepot (could have theoretically happened even with memblock allocation previously), and don't unnecessarily align the memblock allocation to its own size anymore. [1] https://lore.kernel.org/all/CAMuHMdW=eoVzM1Re5FVoEN87nKfiLmM2+Ah7eNu2KXEhCvbZyA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211013073005.11351-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Dmitry Vyukov Reviewed-by: Marco Elver # stackdepot Cc: Marco Elver Cc: Vijayanand Jitta Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Oliver Glitta Cc: Imran Khan From: Colin Ian King Subject: lib/stackdepot: fix spelling mistake and grammar in pr_err message There is a spelling mistake of the work allocation so fix this and re-phrase the message to make it easier to read. Link: https://lkml.kernel.org/r/20211015104159.11282-1-colin.king@canonical.com Signed-off-by: Colin Ian King Cc: Vlastimil Babka From: Vlastimil Babka Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup On FLATMEM, we call page_ext_init_flatmem_late() just before kmem_cache_init() which means stack_depot_init() (called by page owner init) will not recognize properly it should use kvmalloc() and not memblock_alloc(). memblock_alloc() will also not issue a warning and return a block memory that can be invalid and cause kernel page fault when saving stacks, as reported by the kernel test robot [1]. Fix this by moving page_ext_init_flatmem_late() below kmem_cache_init() so that slab_is_available() is true during stack_depot_init(). SPARSEMEM doesn't have this issue, as it doesn't do page_ext_init_flatmem_late(), but a different page_ext_init() even later in the boot process. Thanks to Mike Rapoport for pointing out the FLATMEM init ordering issue. While at it, also actually resolve a checkpatch warning in stack_depot_init() from DRM CI, which was supposed to be in the original patch already. [1] https://lore.kernel.org/all/20211014085450.GC18719@xsang-OptiPlex-9020/ Link: https://lkml.kernel.org/r/6abd9213-19a9-6d58-cedc-2414386d2d81@suse.cz Signed-off-by: Vlastimil Babka Reported-by: kernel test robot Cc: Mike Rapoport Cc: Stephen Rothwell From: Vlastimil Babka Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup3 Due to cd06ab2fd48f ("drm/locking: add backtrace for locking contended locks without backoff") landing recently to -next adding a new stack depot user in drivers/gpu/drm/drm_modeset_lock.c we need to add an appropriate call to stack_depot_init() there as well. Link: https://lkml.kernel.org/r/2a692365-cfa1-64f2-34e0-8aa5674dce5e@suse.cz Signed-off-by: Vlastimil Babka Cc: Jani Nikula Cc: Naresh Kamboju Cc: Marco Elver Cc: Vijayanand Jitta Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Geert Uytterhoeven Cc: Oliver Glitta Cc: Imran Khan Cc: Stephen Rothwell From: Vlastimil Babka Subject: lib/stackdepot: allow optional init and stack_table allocation by kvmalloc() - fixup4 Due to 4e66934eaadc ("lib: add reference counting tracking infrastructure") landing recently to net-next adding a new stack depot user in lib/ref_tracker.c we need to add an appropriate call to stack_depot_init() there as well. Link: https://lkml.kernel.org/r/45c1b738-1a2f-5b5f-2f6d-86fab206d01c@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Eric Dumazet Cc: Jiri Slab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_dp_mst_topology.c | 1 + drivers/gpu/drm/drm_mm.c | 4 ++++ drivers/gpu/drm/drm_modeset_lock.c | 9 +++++++++ drivers/gpu/drm/i915/intel_runtime_pm.c | 3 +++ include/linux/ref_tracker.h | 2 ++ include/linux/stackdepot.h | 25 ++++++++++++++++--------- init/main.c | 9 ++++++--- lib/Kconfig | 4 ++++ lib/Kconfig.kasan | 2 +- lib/stackdepot.c | 33 ++++++++++++++++++++++++++++----- mm/page_owner.c | 2 ++ 11 files changed, 76 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c index f3d79eda94bb..8b3822142fed 100644 --- a/drivers/gpu/drm/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/drm_dp_mst_topology.c @@ -5511,6 +5511,7 @@ int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr, mutex_init(&mgr->probe_lock); #if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS) mutex_init(&mgr->topology_ref_history_lock); + stack_depot_init(); #endif INIT_LIST_HEAD(&mgr->tx_msg_downq); INIT_LIST_HEAD(&mgr->destroy_port_list); diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index 7d1c578388d3..8257f9d4f619 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -980,6 +980,10 @@ void drm_mm_init(struct drm_mm *mm, u64 start, u64 size) add_hole(&mm->head_node); mm->scan_active = 0; + +#ifdef CONFIG_DRM_DEBUG_MM + stack_depot_init(); +#endif } EXPORT_SYMBOL(drm_mm_init); diff --git a/drivers/gpu/drm/drm_modeset_lock.c b/drivers/gpu/drm/drm_modeset_lock.c index c97323365675..918065982db4 100644 --- a/drivers/gpu/drm/drm_modeset_lock.c +++ b/drivers/gpu/drm/drm_modeset_lock.c @@ -107,6 +107,11 @@ static void __drm_stack_depot_print(depot_stack_handle_t stack_depot) kfree(buf); } + +static void __drm_stack_depot_init(void) +{ + stack_depot_init(); +} #else /* CONFIG_DRM_DEBUG_MODESET_LOCK */ static depot_stack_handle_t __drm_stack_depot_save(void) { @@ -115,6 +120,9 @@ static depot_stack_handle_t __drm_stack_depot_save(void) static void __drm_stack_depot_print(depot_stack_handle_t stack_depot) { } +static void __drm_stack_depot_init(void) +{ +} #endif /* CONFIG_DRM_DEBUG_MODESET_LOCK */ /** @@ -359,6 +367,7 @@ void drm_modeset_lock_init(struct drm_modeset_lock *lock) { ww_mutex_init(&lock->mutex, &crtc_ww_class); INIT_LIST_HEAD(&lock->head); + __drm_stack_depot_init(); } EXPORT_SYMBOL(drm_modeset_lock_init); diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index 22dab36afcb6..53f1ccb78849 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -68,6 +68,9 @@ static noinline depot_stack_handle_t __save_depot_stack(void) static void init_intel_runtime_pm_wakeref(struct intel_runtime_pm *rpm) { spin_lock_init(&rpm->debug.lock); + + if (rpm->available) + stack_depot_init(); } static noinline depot_stack_handle_t diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h index c11c9db5825c..60f3453be23e 100644 --- a/include/linux/ref_tracker.h +++ b/include/linux/ref_tracker.h @@ -4,6 +4,7 @@ #include #include #include +#include struct ref_tracker; @@ -26,6 +27,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, spin_lock_init(&dir->lock); dir->quarantine_avail = quarantine_count; refcount_set(&dir->untracked, 1); + stack_depot_init(); } void ref_tracker_dir_exit(struct ref_tracker_dir *dir); diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index c34b55a6e554..17f992fe6355 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -19,6 +19,22 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags, bool can_alloc); +/* + * Every user of stack depot has to call this during its own init when it's + * decided that it will be calling stack_depot_save() later. + * + * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot + * enabled as part of mm_init(), for subsystems where it's known at compile time + * that stack depot will be used. + */ +int stack_depot_init(void); + +#ifdef CONFIG_STACKDEPOT_ALWAYS_INIT +static inline int stack_depot_early_init(void) { return stack_depot_init(); } +#else +static inline int stack_depot_early_init(void) { return 0; } +#endif + depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t gfp_flags); @@ -30,13 +46,4 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, void stack_depot_print(depot_stack_handle_t stack); -#ifdef CONFIG_STACKDEPOT -int stack_depot_init(void); -#else -static inline int stack_depot_init(void) -{ - return 0; -} -#endif /* CONFIG_STACKDEPOT */ - #endif diff --git a/init/main.c b/init/main.c index bb984ed79de0..65fa2e41a9c0 100644 --- a/init/main.c +++ b/init/main.c @@ -834,12 +834,15 @@ static void __init mm_init(void) init_mem_debugging_and_hardening(); kfence_alloc_pool(); report_meminit(); - stack_depot_init(); + stack_depot_early_init(); mem_init(); mem_init_print_info(); - /* page_owner must be initialized after buddy is ready */ - page_ext_init_flatmem_late(); kmem_cache_init(); + /* + * page_owner must be initialized after buddy is ready, and also after + * slab is ready so that stack_depot_init() works properly + */ + page_ext_init_flatmem_late(); kmemleak_init(); pgtable_init(); debug_objects_mem_init(); diff --git a/lib/Kconfig b/lib/Kconfig index c20b68ad2bc3..51c368a50b16 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -673,6 +673,10 @@ config STACKDEPOT bool select STACKTRACE +config STACKDEPOT_ALWAYS_INIT + bool + select STACKDEPOT + config STACK_HASH_ORDER int "stack depot hash size (12 => 4KB, 20 => 1024KB)" range 12 20 diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index cdc842d090db..879757b6dd14 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -38,7 +38,7 @@ menuconfig KASAN CC_HAS_WORKING_NOSANITIZE_ADDRESS) || \ HAVE_ARCH_KASAN_HW_TAGS depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB) - select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT help Enables KASAN (KernelAddressSANitizer) - runtime memory debugger, designed to find out-of-bounds accesses and use-after-free bugs. diff --git a/lib/stackdepot.c b/lib/stackdepot.c index b437ae79aca1..00ccb106f1a8 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -161,18 +162,40 @@ static int __init is_stack_depot_disabled(char *str) } early_param("stack_depot_disable", is_stack_depot_disabled); -int __init stack_depot_init(void) +/* + * __ref because of memblock_alloc(), which will not be actually called after + * the __init code is gone, because at that point slab_is_available() is true + */ +__ref int stack_depot_init(void) { - if (!stack_depot_disable) { + static DEFINE_MUTEX(stack_depot_init_mutex); + + mutex_lock(&stack_depot_init_mutex); + if (!stack_depot_disable && !stack_table) { size_t size = (STACK_HASH_SIZE * sizeof(struct stack_record *)); int i; - stack_table = memblock_alloc(size, size); - for (i = 0; i < STACK_HASH_SIZE; i++) - stack_table[i] = NULL; + if (slab_is_available()) { + pr_info("Stack Depot allocating hash table with kvmalloc\n"); + stack_table = kvmalloc(size, GFP_KERNEL); + } else { + pr_info("Stack Depot allocating hash table with memblock_alloc\n"); + stack_table = memblock_alloc(size, SMP_CACHE_BYTES); + } + if (stack_table) { + for (i = 0; i < STACK_HASH_SIZE; i++) + stack_table[i] = NULL; + } else { + pr_err("Stack Depot hash table allocation failed, disabling\n"); + stack_depot_disable = true; + mutex_unlock(&stack_depot_init_mutex); + return -ENOMEM; + } } + mutex_unlock(&stack_depot_init_mutex); return 0; } +EXPORT_SYMBOL_GPL(stack_depot_init); /* Calculate hash for a stack */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) diff --git a/mm/page_owner.c b/mm/page_owner.c index 5eea061bb1e5..99e360df9465 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -80,6 +80,8 @@ static __init void init_page_owner(void) if (!page_owner_enabled) return; + stack_depot_init(); + register_dummy_stack(); register_failure_stack(); register_early_stack(); -- cgit v1.2.3-58-ga151 From e940066089490efde86abc519593be84362f4e53 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 21 Jan 2022 22:14:31 -0800 Subject: lib/stackdepot: always do filter_irq_stacks() in stack_depot_save() The non-interrupt portion of interrupt stack traces before interrupt entry is usually arbitrary. Therefore, saving stack traces of interrupts (that include entries before interrupt entry) to stack depot leads to unbounded stackdepot growth. As such, use of filter_irq_stacks() is a requirement to ensure stackdepot can efficiently deduplicate interrupt stacks. Looking through all current users of stack_depot_save(), none (except KASAN) pass the stack trace through filter_irq_stacks() before passing it on to stack_depot_save(). Rather than adding filter_irq_stacks() to all current users of stack_depot_save(), it became clear that stack_depot_save() should simply do filter_irq_stacks(). Link: https://lkml.kernel.org/r/20211130095727.2378739-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Alexander Potapenko Acked-by: Vlastimil Babka Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vijayanand Jitta Cc: "Gustavo A. R. Silva" Cc: Imran Khan Cc: Chris Wilson Cc: Jani Nikula Cc: Mika Kuoppala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/stackdepot.c | 13 +++++++++++++ mm/kasan/common.c | 1 - 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 00ccb106f1a8..bf5ba9af0500 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -328,6 +328,9 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * (allocates using GFP flags of @alloc_flags). If @can_alloc is %false, avoids * any allocations and will fail if no space is left to store the stack trace. * + * If the stack trace in @entries is from an interrupt, only the portion up to + * interrupt entry is saved. + * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently * this is the case from contexts where neither %GFP_ATOMIC nor @@ -346,6 +349,16 @@ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned long flags; u32 hash; + /* + * If this stack trace is from an interrupt, including anything before + * interrupt entry usually leads to unbounded stackdepot growth. + * + * Because use of filter_irq_stacks() is a requirement to ensure + * stackdepot can efficiently deduplicate interrupt stacks, always + * filter_irq_stacks() to simplify all callers' use of stackdepot. + */ + nr_entries = filter_irq_stacks(entries, nr_entries); + if (unlikely(nr_entries == 0) || stack_depot_disable) goto fast_exit; diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 7c06db78a76c..92196562687b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -36,7 +36,6 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - nr_entries = filter_irq_stacks(entries, nr_entries); return __stack_depot_save(entries, nr_entries, flags, can_alloc); } -- cgit v1.2.3-58-ga151 From 0a4ee518185e902758191d968600399f3bc2be31 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:34 -0800 Subject: mm: remove cleancache Patch series "remove Xen tmem leftovers". Since the removal of the Xen tmem driver in 2019, the cleancache hooks are entirely unused, as are large parts of frontswap. This series against linux-next (with the folio changes included) removes cleancaches, and cuts down frontswap to the bits actually used by zswap. This patch (of 13): The cleancache subsystem is unused since the removal of Xen tmem driver in commit 814bbf49dcd0 ("xen: remove tmem driver"). [akpm@linux-foundation.org: remove now-unreachable code] Link: https://lkml.kernel.org/r/20211224062246.1258487-1-hch@lst.de Link: https://lkml.kernel.org/r/20211224062246.1258487-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Acked-by: Geert Uytterhoeven Cc: Konrad Rzeszutek Wilk Cc: Hugh Dickins Cc: Seth Jennings Cc: Dan Streetman Cc: Vitaly Wool Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/cleancache.rst | 296 -------------------------------- Documentation/vm/frontswap.rst | 12 +- Documentation/vm/index.rst | 1 - MAINTAINERS | 7 - arch/arm/configs/bcm2835_defconfig | 1 - arch/arm/configs/qcom_defconfig | 1 - arch/m68k/configs/amiga_defconfig | 1 - arch/m68k/configs/apollo_defconfig | 1 - arch/m68k/configs/atari_defconfig | 1 - arch/m68k/configs/bvme6000_defconfig | 1 - arch/m68k/configs/hp300_defconfig | 1 - arch/m68k/configs/mac_defconfig | 1 - arch/m68k/configs/multi_defconfig | 1 - arch/m68k/configs/mvme147_defconfig | 1 - arch/m68k/configs/mvme16x_defconfig | 1 - arch/m68k/configs/q40_defconfig | 1 - arch/m68k/configs/sun3_defconfig | 1 - arch/m68k/configs/sun3x_defconfig | 1 - arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - block/bdev.c | 5 - fs/btrfs/extent_io.c | 10 -- fs/btrfs/super.c | 2 - fs/ext4/readpage.c | 6 - fs/ext4/super.c | 3 - fs/f2fs/data.c | 13 -- fs/mpage.c | 7 - fs/ntfs3/ntfs_fs.h | 1 - fs/ocfs2/super.c | 2 - fs/super.c | 3 - include/linux/cleancache.h | 124 -------------- include/linux/fs.h | 5 - mm/Kconfig | 22 --- mm/Makefile | 1 - mm/cleancache.c | 315 ----------------------------------- mm/filemap.c | 11 -- mm/truncate.c | 15 +- 37 files changed, 4 insertions(+), 873 deletions(-) delete mode 100644 Documentation/vm/cleancache.rst delete mode 100644 include/linux/cleancache.h delete mode 100644 mm/cleancache.c diff --git a/Documentation/vm/cleancache.rst b/Documentation/vm/cleancache.rst deleted file mode 100644 index 68cba9131c31..000000000000 --- a/Documentation/vm/cleancache.rst +++ /dev/null @@ -1,296 +0,0 @@ -.. _cleancache: - -========== -Cleancache -========== - -Motivation -========== - -Cleancache is a new optional feature provided by the VFS layer that -potentially dramatically increases page cache effectiveness for -many workloads in many environments at a negligible cost. - -Cleancache can be thought of as a page-granularity victim cache for clean -pages that the kernel's pageframe replacement algorithm (PFRA) would like -to keep around, but can't since there isn't enough memory. So when the -PFRA "evicts" a page, it first attempts to use cleancache code to -put the data contained in that page into "transcendent memory", memory -that is not directly accessible or addressable by the kernel and is -of unknown and possibly time-varying size. - -Later, when a cleancache-enabled filesystem wishes to access a page -in a file on disk, it first checks cleancache to see if it already -contains it; if it does, the page of data is copied into the kernel -and a disk access is avoided. - -Transcendent memory "drivers" for cleancache are currently implemented -in Xen (using hypervisor memory) and zcache (using in-kernel compressed -memory) and other implementations are in development. - -:ref:`FAQs ` are included below. - -Implementation Overview -======================= - -A cleancache "backend" that provides transcendent memory registers itself -to the kernel's cleancache "frontend" by calling cleancache_register_ops, -passing a pointer to a cleancache_ops structure with funcs set appropriately. -The functions provided must conform to certain semantics as follows: - -Most important, cleancache is "ephemeral". Pages which are copied into -cleancache have an indefinite lifetime which is completely unknowable -by the kernel and so may or may not still be in cleancache at any later time. -Thus, as its name implies, cleancache is not suitable for dirty pages. -Cleancache has complete discretion over what pages to preserve and what -pages to discard and when. - -Mounting a cleancache-enabled filesystem should call "init_fs" to obtain a -pool id which, if positive, must be saved in the filesystem's superblock; -a negative return value indicates failure. A "put_page" will copy a -(presumably about-to-be-evicted) page into cleancache and associate it with -the pool id, a file key, and a page index into the file. (The combination -of a pool id, a file key, and an index is sometimes called a "handle".) -A "get_page" will copy the page, if found, from cleancache into kernel memory. -An "invalidate_page" will ensure the page no longer is present in cleancache; -an "invalidate_inode" will invalidate all pages associated with the specified -file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate -all pages in all files specified by the given pool id and also surrender -the pool id. - -An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache -to treat the pool as shared using a 128-bit UUID as a key. On systems -that may run multiple kernels (such as hard partitioned or virtualized -systems) that may share a clustered filesystem, and where cleancache -may be shared among those kernels, calls to init_shared_fs that specify the -same UUID will receive the same pool id, thus allowing the pages to -be shared. Note that any security requirements must be imposed outside -of the kernel (e.g. by "tools" that control cleancache). Or a -cleancache implementation can simply disable shared_init by always -returning a negative value. - -If a get_page is successful on a non-shared pool, the page is invalidated -(thus making cleancache an "exclusive" cache). On a shared pool, the page -is NOT invalidated on a successful get_page so that it remains accessible to -other sharers. The kernel is responsible for ensuring coherency between -cleancache (shared or not), the page cache, and the filesystem, using -cleancache invalidate operations as required. - -Note that cleancache must enforce put-put-get coherency and get-get -coherency. For the former, if two puts are made to the same handle but -with different data, say AAA by the first put and BBB by the second, a -subsequent get can never return the stale data (AAA). For get-get coherency, -if a get for a given handle fails, subsequent gets for that handle will -never succeed unless preceded by a successful put with that handle. - -Last, cleancache provides no SMP serialization guarantees; if two -different Linux threads are simultaneously putting and invalidating a page -with the same handle, the results are indeterminate. Callers must -lock the page to ensure serial behavior. - -Cleancache Performance Metrics -============================== - -If properly configured, monitoring of cleancache is done via debugfs in -the `/sys/kernel/debug/cleancache` directory. The effectiveness of cleancache -can be measured (across all filesystems) with: - -``succ_gets`` - number of gets that were successful - -``failed_gets`` - number of gets that failed - -``puts`` - number of puts attempted (all "succeed") - -``invalidates`` - number of invalidates attempted - -A backend implementation may provide additional metrics. - -.. _faq: - -FAQ -=== - -* Where's the value? (Andrew Morton) - -Cleancache provides a significant performance benefit to many workloads -in many environments with negligible overhead by improving the -effectiveness of the pagecache. Clean pagecache pages are -saved in transcendent memory (RAM that is otherwise not directly -addressable to the kernel); fetching those pages later avoids "refaults" -and thus disk reads. - -Cleancache (and its sister code "frontswap") provide interfaces for -this transcendent memory (aka "tmem"), which conceptually lies between -fast kernel-directly-addressable RAM and slower DMA/asynchronous devices. -Disallowing direct kernel or userland reads/writes to tmem -is ideal when data is transformed to a different form and size (such -as with compression) or secretly moved (as might be useful for write- -balancing for some RAM-like devices). Evicted page-cache pages (and -swap pages) are a great use for this kind of slower-than-RAM-but-much- -faster-than-disk transcendent memory, and the cleancache (and frontswap) -"page-object-oriented" specification provides a nice way to read and -write -- and indirectly "name" -- the pages. - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to -do it well with no kernel change have essentially failed (except in some -well-publicized special-case workloads). Cleancache -- and frontswap -- -with a fairly small impact on the kernel, provide a huge amount -of flexibility for more dynamic, flexible RAM multiplexing. -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "self-ballooning"), page cache pages -are the first to go, and cleancache allows those pages to be -saved and reclaimed if overall host system memory conditions allow. - -And the identical interface used for cleancache can be used in -physical systems as well. The zcache driver acts as a memory-hungry -device that stores pages of data in a compressed state. And -the proposed "RAMster" driver shares RAM across multiple physical -systems. - -* Why does cleancache have its sticky fingers so deep inside the - filesystems and VFS? (Andrew Morton and Christoph Hellwig) - -The core hooks for cleancache in VFS are in most cases a single line -and the minimum set are placed precisely where needed to maintain -coherency (via cleancache_invalidate operations) between cleancache, -the page cache, and disk. All hooks compile into nothingness if -cleancache is config'ed off and turn into a function-pointer- -compare-to-NULL if config'ed on but no backend claims the ops -functions, or to a compare-struct-element-to-negative if a -backend claims the ops functions but a filesystem doesn't enable -cleancache. - -Some filesystems are built entirely on top of VFS and the hooks -in VFS are sufficient, so don't require an "init_fs" hook; the -initial implementation of cleancache didn't provide this hook. -But for some filesystems (such as btrfs), the VFS hooks are -incomplete and one or more hooks in fs-specific code are required. -And for some other filesystems, such as tmpfs, cleancache may -be counterproductive. So it seemed prudent to require a filesystem -to "opt in" to use cleancache, which requires adding a hook in -each filesystem. Not all filesystems are supported by cleancache -only because they haven't been tested. The existing set should -be sufficient to validate the concept, the opt-in approach means -that untested filesystems are not affected, and the hooks in the -existing filesystems should make it very easy to add more -filesystems in the future. - -The total impact of the hooks to existing fs and mm files is only -about 40 lines added (not counting comments and blank lines). - -* Why not make cleancache asynchronous and batched so it can more - easily interface with real devices with DMA instead of copying each - individual page? (Minchan Kim) - -The one-page-at-a-time copy semantics simplifies the implementation -on both the frontend and backend and also allows the backend to -do fancy things on-the-fly like page compression and -page deduplication. And since the data is "gone" (copied into/out -of the pageframe) before the cleancache get/put call returns, -a great deal of race conditions and potential coherency issues -are avoided. While the interface seems odd for a "real device" -or for real kernel-addressable RAM, it makes perfect sense for -transcendent memory. - -* Why is non-shared cleancache "exclusive"? And where is the - page "invalidated" after a "get"? (Minchan Kim) - -The main reason is to free up space in transcendent memory and -to avoid unnecessary cleancache_invalidate calls. If you want inclusive, -the page can be "put" immediately following the "get". If -put-after-get for inclusive becomes common, the interface could -be easily extended to add a "get_no_invalidate" call. - -The invalidate is done by the cleancache backend implementation. - -* What's the performance impact? - -Performance analysis has been presented at OLS'09 and LCA'10. -Briefly, performance gains can be significant on most workloads, -especially when memory pressure is high (e.g. when RAM is -overcommitted in a virtual workload); and because the hooks are -invoked primarily in place of or in addition to a disk read/write, -overhead is negligible even in worst case workloads. Basically -cleancache replaces I/O with memory-copy-CPU-overhead; on older -single-core systems with slow memory-copy speeds, cleancache -has little value, but in newer multicore machines, especially -consolidated/virtualized machines, it has great value. - -* How do I add cleancache support for filesystem X? (Boaz Harrash) - -Filesystems that are well-behaved and conform to certain -restrictions can utilize cleancache simply by making a call to -cleancache_init_fs at mount time. Unusual, misbehaving, or -poorly layered filesystems must either add additional hooks -and/or undergo extensive additional testing... or should just -not enable the optional cleancache. - -Some points for a filesystem to consider: - - - The FS should be block-device-based (e.g. a ram-based FS such - as tmpfs should not enable cleancache) - - To ensure coherency/correctness, the FS must ensure that all - file removal or truncation operations either go through VFS or - add hooks to do the equivalent cleancache "invalidate" operations - - To ensure coherency/correctness, either inode numbers must - be unique across the lifetime of the on-disk file OR the - FS must provide an "encode_fh" function. - - The FS must call the VFS superblock alloc and deactivate routines - or add hooks to do the equivalent cleancache calls done there. - - To maximize performance, all pages fetched from the FS should - go through the do_mpag_readpage routine or the FS should add - hooks to do the equivalent (cf. btrfs) - - Currently, the FS blocksize must be the same as PAGESIZE. This - is not an architectural restriction, but no backends currently - support anything different. - - A clustered FS should invoke the "shared_init_fs" cleancache - hook to get best performance for some backends. - -* Why not use the KVA of the inode as the key? (Christoph Hellwig) - -If cleancache would use the inode virtual address instead of -inode/filehandle, the pool id could be eliminated. But, this -won't work because cleancache retains pagecache data pages -persistently even when the inode has been pruned from the -inode unused list, and only invalidates the data page if the file -gets removed/truncated. So if cleancache used the inode kva, -there would be potential coherency issues if/when the inode -kva is reused for a different file. Alternately, if cleancache -invalidated the pages when the inode kva was freed, much of the value -of cleancache would be lost because the cache of pages in cleanache -is potentially much larger than the kernel pagecache and is most -useful if the pages survive inode cache removal. - -* Why is a global variable required? - -The cleancache_enabled flag is checked in all of the frequently-used -cleancache hooks. The alternative is a function call to check a static -variable. Since cleancache is enabled dynamically at runtime, systems -that don't enable cleancache would suffer thousands (possibly -tens-of-thousands) of unnecessary function calls per second. So the -global variable allows cleancache to be enabled by default at compile -time, but have insignificant performance impact when cleancache remains -disabled at runtime. - -* Does cleanache work with KVM? - -The memory model of KVM is sufficiently different that a cleancache -backend may have less value for KVM. This remains to be tested, -especially in an overcommitted system. - -* Does cleancache work in userspace? It sounds useful for - memory hungry caches like web browsers. (Jamie Lokier) - -No plans yet, though we agree it sounds useful, at least for -apps that bypass the page cache (e.g. O_DIRECT). - -Last updated: Dan Magenheimer, April 13 2011 diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst index 1979f430c1c5..e2e5ab3e375e 100644 --- a/Documentation/vm/frontswap.rst +++ b/Documentation/vm/frontswap.rst @@ -8,12 +8,6 @@ Frontswap provides a "transcendent memory" interface for swap pages. In some environments, dramatic performance savings may be obtained because swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. -(Note, frontswap -- and :ref:`cleancache` (merged at 3.0) -- are the "frontends" -and the only necessary changes to the core kernel for transcendent memory; -all other supporting code -- the "backends" -- is implemented as drivers. -See the LWN.net article `Transcendent memory in a nutshell`_ -for a detailed overview of frontswap and related kernel parts) - .. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ Frontswap is so named because it can be thought of as the opposite of @@ -87,11 +81,9 @@ This interface is ideal when data is transformed to a different form and size (such as with compression) or secretly moved (as might be useful for write-balancing for some RAM-like devices). Swap pages (and evicted page-cache pages) are a great use for this kind of slower-than-RAM- -but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and -cleancache) interface to transcendent memory provides a nice way to read -and write -- and indirectly "name" -- the pages. +but-much-faster-than-disk "pseudo-RAM device". -Frontswap -- and cleancache -- with a fairly small impact on the kernel, +Frontswap with a fairly small impact on the kernel, provides a huge amount of flexibility for more dynamic, flexible RAM utilization in various system configurations: diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index 932440805453..44365c4574a3 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -15,7 +15,6 @@ algorithms. If you are looking for advice on simply allocating memory, see the active_mm arch_pgtable_helpers balance - cleancache damon/index free_page_reporting frontswap diff --git a/MAINTAINERS b/MAINTAINERS index 6c08cbe953fe..45f8750f6e30 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4705,13 +4705,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/cla F: include/linux/cfi.h F: kernel/cfi.c -CLEANCACHE API -M: Konrad Rzeszutek Wilk -L: linux-kernel@vger.kernel.org -S: Maintained -F: include/linux/cleancache.h -F: mm/cleancache.c - CLK API M: Russell King L: linux-clk@vger.kernel.org diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig index 383c632eba7b..a9ed79b7f871 100644 --- a/arch/arm/configs/bcm2835_defconfig +++ b/arch/arm/configs/bcm2835_defconfig @@ -31,7 +31,6 @@ CONFIG_ARCH_BCM2835=y CONFIG_PREEMPT_VOLUNTARY=y CONFIG_AEABI=y CONFIG_KSM=y -CONFIG_CLEANCACHE=y CONFIG_CMA=y CONFIG_SECCOMP=y CONFIG_KEXEC=y diff --git a/arch/arm/configs/qcom_defconfig b/arch/arm/configs/qcom_defconfig index 0daa9c0d298e..9981566f2096 100644 --- a/arch/arm/configs/qcom_defconfig +++ b/arch/arm/configs/qcom_defconfig @@ -27,7 +27,6 @@ CONFIG_PCIE_QCOM=y CONFIG_SMP=y CONFIG_PREEMPT=y CONFIG_HIGHMEM=y -CONFIG_CLEANCACHE=y CONFIG_ARM_APPENDED_DTB=y CONFIG_ARM_ATAG_DTB_COMPAT=y CONFIG_CPU_IDLE=y diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index a4b6c7108465..bc9952f8be66 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -45,7 +45,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 2db721965520..a77269c6e5ba 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -41,7 +41,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index c266a704eecd..7a74efa6b9a1 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -48,7 +48,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index f644f08dd6ed..a5323bf2eb33 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -38,7 +38,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index e4924650b687..5e80aa0869d5 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -40,7 +40,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 24113871ea76..e84326a3f62d 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -39,7 +39,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 6a7e4be70eea..337552f43339 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -59,7 +59,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 1d223247aff0..7b688f7d272a 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -37,7 +37,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 961f789f96c9..7c2cb31d63dd 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -38,7 +38,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index ff4b5e469390..ca43897af26d 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -39,7 +39,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 5f228621d0cc..e3d515f37144 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -35,7 +35,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index a600cb9e68c2..d601606c969b 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -35,7 +35,6 @@ CONFIG_IOSCHED_BFQ=m CONFIG_BINFMT_AOUT=m CONFIG_BINFMT_MISC=m # CONFIG_COMPACTION is not set -CONFIG_CLEANCACHE=y CONFIG_ZPOOL=m CONFIG_NET=y CONFIG_PACKET=y diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 354e51dcb3e2..7fe8975b49ec 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -96,7 +96,6 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_CMA_DEBUG=y CONFIG_CMA_DEBUGFS=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8dee6c3782f3..466780c465f5 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -91,7 +91,6 @@ CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y CONFIG_KSM=y CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_CMA_SYSFS=y CONFIG_CMA_AREAS=7 diff --git a/block/bdev.c b/block/bdev.c index 8bf93a19041b..102837a37051 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include "../fs/internal.h" @@ -88,10 +87,6 @@ void invalidate_bdev(struct block_device *bdev) lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d6d48ecf823c..409bad3928db 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include "misc.h" #include "extent_io.h" @@ -3578,15 +3577,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, goto out; } - if (!PageUptodate(page)) { - if (cleancache_get_page(page) == 0) { - BUG_ON(blocksize != PAGE_SIZE); - unlock_extent(tree, start, end); - unlock_page(page); - goto out; - } - } - if (page->index == last_byte >> PAGE_SHIFT) { size_t zero_offset = offset_in_page(last_byte); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0ec09fe01be6..4d947ba32da9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -1374,7 +1373,6 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - cleancache_init_fs(sb); sb->s_flags |= SB_ACTIVE; return 0; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3db923403505..4cd62f1d848c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -43,7 +43,6 @@ #include #include #include -#include #include "ext4.h" @@ -350,11 +349,6 @@ int ext4_mpage_readpages(struct inode *inode, } else if (fully_mapped) { SetPageMappedToDisk(page); } - if (fully_mapped && blocks_per_page == 1 && - !PageUptodate(page) && cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } /* * This page will go to BIO. Do we need to send this diff --git a/fs/ext4/super.c b/fs/ext4/super.c index db9fe4843529..eee0d9ebfa6c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -3149,8 +3148,6 @@ done: EXT4_BLOCKS_PER_GROUP(sb), EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt, sbi->s_mount_opt2); - - cleancache_init_fs(sb); return err; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0a1d236212f8..8c417864c66a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -2035,12 +2034,6 @@ got_it: block_nr = map->m_pblk + block_in_file - map->m_lblk; SetPageMappedToDisk(page); - if (!PageUptodate(page) && (!PageSwapCache(page) && - !cleancache_get_page(page))) { - SetPageUptodate(page); - goto confused; - } - if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, DATA_GENERIC_ENHANCE_READ)) { ret = -EFSCORRUPTED; @@ -2096,12 +2089,6 @@ submit_and_realloc: ClearPageError(page); *last_block_in_bio = block_nr; goto out; -confused: - if (bio) { - __submit_bio(F2FS_I_SB(inode), bio, DATA); - bio = NULL; - } - unlock_page(page); out: *bio_ret = bio; return ret; diff --git a/fs/mpage.c b/fs/mpage.c index 334e7d09aa65..87f5cfef6caa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "internal.h" /* @@ -284,12 +283,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) SetPageMappedToDisk(page); } - if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) && - cleancache_get_page(page) == 0) { - SetPageUptodate(page); - goto confused; - } - /* * This page will go to BIO. Do we need to send this BIO off first? */ diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 8aaec7e0804e..fb825059d488 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -11,7 +11,6 @@ #include #include -#include #include #include #include diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1286b88b6fa1..2772dec9dcea 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #define CREATE_TRACE_POINTS @@ -2283,7 +2282,6 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs(sb); osb->ocfs2_wq = alloc_ordered_workqueue("ocfs2_wq", WQ_MEM_RECLAIM); if (!osb->ocfs2_wq) { diff --git a/fs/super.c b/fs/super.c index a6405d44d4ca..7af820ba5ad5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -260,7 +259,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; - s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; @@ -330,7 +328,6 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - cleancache_invalidate_fs(s); unregister_shrinker(&s->s_shrink); fs->kill_sb(s); diff --git a/include/linux/cleancache.h b/include/linux/cleancache.h deleted file mode 100644 index 5f5730c1d324..000000000000 --- a/include/linux/cleancache.h +++ /dev/null @@ -1,124 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_CLEANCACHE_H -#define _LINUX_CLEANCACHE_H - -#include -#include -#include - -#define CLEANCACHE_NO_POOL -1 -#define CLEANCACHE_NO_BACKEND -2 -#define CLEANCACHE_NO_BACKEND_SHARED -3 - -#define CLEANCACHE_KEY_MAX 6 - -/* - * cleancache requires every file with a page in cleancache to have a - * unique key unless/until the file is removed/truncated. For some - * filesystems, the inode number is unique, but for "modern" filesystems - * an exportable filehandle is required (see exportfs.h) - */ -struct cleancache_filekey { - union { - ino_t ino; - __u32 fh[CLEANCACHE_KEY_MAX]; - u32 key[CLEANCACHE_KEY_MAX]; - } u; -}; - -struct cleancache_ops { - int (*init_fs)(size_t); - int (*init_shared_fs)(uuid_t *uuid, size_t); - int (*get_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*put_page)(int, struct cleancache_filekey, - pgoff_t, struct page *); - void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t); - void (*invalidate_inode)(int, struct cleancache_filekey); - void (*invalidate_fs)(int); -}; - -extern int cleancache_register_ops(const struct cleancache_ops *ops); -extern void __cleancache_init_fs(struct super_block *); -extern void __cleancache_init_shared_fs(struct super_block *); -extern int __cleancache_get_page(struct page *); -extern void __cleancache_put_page(struct page *); -extern void __cleancache_invalidate_page(struct address_space *, struct page *); -extern void __cleancache_invalidate_inode(struct address_space *); -extern void __cleancache_invalidate_fs(struct super_block *); - -#ifdef CONFIG_CLEANCACHE -#define cleancache_enabled (1) -static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) -{ - return mapping->host->i_sb->cleancache_poolid >= 0; -} -static inline bool cleancache_fs_enabled(struct page *page) -{ - return cleancache_fs_enabled_mapping(page->mapping); -} -#else -#define cleancache_enabled (0) -#define cleancache_fs_enabled(_page) (0) -#define cleancache_fs_enabled_mapping(_page) (0) -#endif - -/* - * The shim layer provided by these inline functions allows the compiler - * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE - * is disabled, to a single global variable check if CONFIG_CLEANCACHE - * is enabled but no cleancache "backend" has dynamically enabled it, - * and, for the most frequent cleancache ops, to a single global variable - * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled - * and a cleancache backend has dynamically enabled cleancache, but the - * filesystem referenced by that cleancache op has not enabled cleancache. - * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially - * no measurable performance impact. - */ - -static inline void cleancache_init_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_fs(sb); -} - -static inline void cleancache_init_shared_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_init_shared_fs(sb); -} - -static inline int cleancache_get_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - return __cleancache_get_page(page); - return -1; -} - -static inline void cleancache_put_page(struct page *page) -{ - if (cleancache_enabled && cleancache_fs_enabled(page)) - __cleancache_put_page(page); -} - -static inline void cleancache_invalidate_page(struct address_space *mapping, - struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_invalidate_page(mapping, page); -} - -static inline void cleancache_invalidate_inode(struct address_space *mapping) -{ - if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) - __cleancache_invalidate_inode(mapping); -} - -static inline void cleancache_invalidate_fs(struct super_block *sb) -{ - if (cleancache_enabled) - __cleancache_invalidate_fs(sb); -} - -#endif /* _LINUX_CLEANCACHE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 9617dea24978..f3daaea16554 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1535,11 +1535,6 @@ struct super_block { const struct dentry_operations *s_d_op; /* default d_op for dentries */ - /* - * Saved pool identifier for cleancache (-1 means none) - */ - int cleancache_poolid; - struct shrinker s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ diff --git a/mm/Kconfig b/mm/Kconfig index a99bd499ef51..430240289b02 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -444,28 +444,6 @@ config USE_PERCPU_NUMA_NODE_ID config HAVE_SETUP_PER_CPU_AREA bool -config CLEANCACHE - bool "Enable cleancache driver to cache clean pages if tmem is present" - help - Cleancache can be thought of as a page-granularity victim cache - for clean pages that the kernel's pageframe replacement algorithm - (PFRA) would like to keep around, but can't since there isn't enough - memory. So when the PFRA "evicts" a page, it first attempts to use - cleancache code to put the data contained in that page into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. And when a cleancache-enabled - filesystem wishes to access a page in a file on disk, it first - checks cleancache to see if it already contains it; if it does, - the page is copied into the kernel and a disk access is avoided. - When a transcendent memory driver is available (such as zcache or - Xen transcendent memory), a significant I/O reduction - may be achieved. When none is available, all cleancache calls - are reduced to a single pointer-compare-against-NULL resulting - in a negligible performance hit. - - If unsure, say Y to enable cleancache - config FRONTSWAP bool "Enable frontswap to cache swap pages if tmem is present" depends on SWAP diff --git a/mm/Makefile b/mm/Makefile index 588d3113f3b0..70d4309c9ce3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -104,7 +104,6 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o -obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o diff --git a/mm/cleancache.c b/mm/cleancache.c deleted file mode 100644 index db7eee9c0886..000000000000 --- a/mm/cleancache.c +++ /dev/null @@ -1,315 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Cleancache frontend - * - * This code provides the generic "frontend" layer to call a matching - * "backend" driver implementation of cleancache. See - * Documentation/vm/cleancache.rst for more information. - * - * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - */ - -#include -#include -#include -#include -#include -#include - -/* - * cleancache_ops is set by cleancache_register_ops to contain the pointers - * to the cleancache "backend" implementation functions. - */ -static const struct cleancache_ops *cleancache_ops __read_mostly; - -/* - * Counters available via /sys/kernel/debug/cleancache (if debugfs is - * properly configured. These are for information only so are not protected - * against increment races. - */ -static u64 cleancache_succ_gets; -static u64 cleancache_failed_gets; -static u64 cleancache_puts; -static u64 cleancache_invalidates; - -static void cleancache_register_ops_sb(struct super_block *sb, void *unused) -{ - switch (sb->cleancache_poolid) { - case CLEANCACHE_NO_BACKEND: - __cleancache_init_fs(sb); - break; - case CLEANCACHE_NO_BACKEND_SHARED: - __cleancache_init_shared_fs(sb); - break; - } -} - -/* - * Register operations for cleancache. Returns 0 on success. - */ -int cleancache_register_ops(const struct cleancache_ops *ops) -{ - if (cmpxchg(&cleancache_ops, NULL, ops)) - return -EBUSY; - - /* - * A cleancache backend can be built as a module and hence loaded after - * a cleancache enabled filesystem has called cleancache_init_fs. To - * handle such a scenario, here we call ->init_fs or ->init_shared_fs - * for each active super block. To differentiate between local and - * shared filesystems, we temporarily initialize sb->cleancache_poolid - * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED - * respectively in case there is no backend registered at the time - * cleancache_init_fs or cleancache_init_shared_fs is called. - * - * Since filesystems can be mounted concurrently with cleancache - * backend registration, we have to be careful to guarantee that all - * cleancache enabled filesystems that has been mounted by the time - * cleancache_register_ops is called has got and all mounted later will - * get cleancache_poolid. This is assured by the following statements - * tied together: - * - * a) iterate_supers skips only those super blocks that has started - * ->kill_sb - * - * b) if iterate_supers encounters a super block that has not finished - * ->mount yet, it waits until it is finished - * - * c) cleancache_init_fs is called from ->mount and - * cleancache_invalidate_fs is called from ->kill_sb - * - * d) we call iterate_supers after cleancache_ops has been set - * - * From a) it follows that if iterate_supers skips a super block, then - * either the super block is already dead, in which case we do not need - * to bother initializing cleancache for it, or it was mounted after we - * initiated iterate_supers. In the latter case, it must have seen - * cleancache_ops set according to d) and initialized cleancache from - * ->mount by itself according to c). This proves that we call - * ->init_fs at least once for each active super block. - * - * From b) and c) it follows that if iterate_supers encounters a super - * block that has already started ->init_fs, it will wait until ->mount - * and hence ->init_fs has finished, then check cleancache_poolid, see - * that it has already been set and therefore do nothing. This proves - * that we call ->init_fs no more than once for each super block. - * - * Combined together, the last two paragraphs prove the function - * correctness. - * - * Note that various cleancache callbacks may proceed before this - * function is called or even concurrently with it, but since - * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop - * until the corresponding ->init_fs has been actually called and - * cleancache_ops has been set. - */ - iterate_supers(cleancache_register_ops_sb, NULL); - return 0; -} -EXPORT_SYMBOL(cleancache_register_ops); - -/* Called by a cleancache-enabled filesystem at time of mount */ -void __cleancache_init_fs(struct super_block *sb) -{ - int pool_id = CLEANCACHE_NO_BACKEND; - - if (cleancache_ops) { - pool_id = cleancache_ops->init_fs(PAGE_SIZE); - if (pool_id < 0) - pool_id = CLEANCACHE_NO_POOL; - } - sb->cleancache_poolid = pool_id; -} -EXPORT_SYMBOL(__cleancache_init_fs); - -/* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(struct super_block *sb) -{ - int pool_id = CLEANCACHE_NO_BACKEND_SHARED; - - if (cleancache_ops) { - pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE); - if (pool_id < 0) - pool_id = CLEANCACHE_NO_POOL; - } - sb->cleancache_poolid = pool_id; -} -EXPORT_SYMBOL(__cleancache_init_shared_fs); - -/* - * If the filesystem uses exportable filehandles, use the filehandle as - * the key, else use the inode number. - */ -static int cleancache_get_key(struct inode *inode, - struct cleancache_filekey *key) -{ - int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *); - int len = 0, maxlen = CLEANCACHE_KEY_MAX; - struct super_block *sb = inode->i_sb; - - key->u.ino = inode->i_ino; - if (sb->s_export_op != NULL) { - fhfn = sb->s_export_op->encode_fh; - if (fhfn) { - len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL); - if (len <= FILEID_ROOT || len == FILEID_INVALID) - return -1; - if (maxlen > CLEANCACHE_KEY_MAX) - return -1; - } - } - return 0; -} - -/* - * "Get" data from cleancache associated with the poolid/inode/index - * that were specified when the data was put to cleanache and, if - * successful, use it to fill the specified page with data and return 0. - * The pageframe is unchanged and returns -1 if the get fails. - * Page must be locked by caller. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -int __cleancache_get_page(struct page *page) -{ - int ret = -1; - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) { - cleancache_failed_gets++; - goto out; - } - - VM_BUG_ON_PAGE(!PageLocked(page), page); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id < 0) - goto out; - - if (cleancache_get_key(page->mapping->host, &key) < 0) - goto out; - - ret = cleancache_ops->get_page(pool_id, key, page->index, page); - if (ret == 0) - cleancache_succ_gets++; - else - cleancache_failed_gets++; -out: - return ret; -} -EXPORT_SYMBOL(__cleancache_get_page); - -/* - * "Put" data from a page to cleancache and associate it with the - * (previously-obtained per-filesystem) poolid and the page's, - * inode and page index. Page must be locked. Note that a put_page - * always "succeeds", though a subsequent get_page may succeed or fail. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -void __cleancache_put_page(struct page *page) -{ - int pool_id; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) { - cleancache_puts++; - return; - } - - VM_BUG_ON_PAGE(!PageLocked(page), page); - pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (pool_id >= 0 && - cleancache_get_key(page->mapping->host, &key) >= 0) { - cleancache_ops->put_page(pool_id, key, page->index, page); - cleancache_puts++; - } -} -EXPORT_SYMBOL(__cleancache_put_page); - -/* - * Invalidate any data from cleancache associated with the poolid and the - * page's inode and page index so that a subsequent "get" will fail. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -void __cleancache_invalidate_page(struct address_space *mapping, - struct page *page) -{ - /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) - return; - - if (pool_id >= 0) { - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (cleancache_get_key(mapping->host, &key) >= 0) { - cleancache_ops->invalidate_page(pool_id, - key, page->index); - cleancache_invalidates++; - } - } -} -EXPORT_SYMBOL(__cleancache_invalidate_page); - -/* - * Invalidate all data from cleancache associated with the poolid and the - * mappings's inode so that all subsequent gets to this poolid/inode - * will fail. - * - * The function has two checks before any action is taken - whether - * a backend is registered and whether the sb->cleancache_poolid - * is correct. - */ -void __cleancache_invalidate_inode(struct address_space *mapping) -{ - int pool_id = mapping->host->i_sb->cleancache_poolid; - struct cleancache_filekey key = { .u.key = { 0 } }; - - if (!cleancache_ops) - return; - - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) - cleancache_ops->invalidate_inode(pool_id, key); -} -EXPORT_SYMBOL(__cleancache_invalidate_inode); - -/* - * Called by any cleancache-enabled filesystem at time of unmount; - * note that pool_id is surrendered and may be returned by a subsequent - * cleancache_init_fs or cleancache_init_shared_fs. - */ -void __cleancache_invalidate_fs(struct super_block *sb) -{ - int pool_id; - - pool_id = sb->cleancache_poolid; - sb->cleancache_poolid = CLEANCACHE_NO_POOL; - - if (cleancache_ops && pool_id >= 0) - cleancache_ops->invalidate_fs(pool_id); -} -EXPORT_SYMBOL(__cleancache_invalidate_fs); - -static int __init init_cleancache(void) -{ -#ifdef CONFIG_DEBUG_FS - struct dentry *root = debugfs_create_dir("cleancache", NULL); - - debugfs_create_u64("succ_gets", 0444, root, &cleancache_succ_gets); - debugfs_create_u64("failed_gets", 0444, root, &cleancache_failed_gets); - debugfs_create_u64("puts", 0444, root, &cleancache_puts); - debugfs_create_u64("invalidates", 0444, root, &cleancache_invalidates); -#endif - return 0; -} -module_init(init_cleancache) diff --git a/mm/filemap.c b/mm/filemap.c index 60866ae711e2..b50910ac2c88 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -151,16 +150,6 @@ static void filemap_unaccount_folio(struct address_space *mapping, { long nr; - /* - * if we're uptodate, flush out into the cleancache, otherwise - * invalidate any existing cleancache entries. We can't leave - * stale data around in the cleancache once our page is gone - */ - if (folio_test_uptodate(folio) && folio_test_mappedtodisk(folio)) - cleancache_put_page(&folio->page); - else - cleancache_invalidate_page(mapping, &folio->page); - VM_BUG_ON_FOLIO(folio_mapped(folio), folio); if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) { int mapcount; diff --git a/mm/truncate.c b/mm/truncate.c index 5e243d7269c0..9dbf0b75da5d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -22,7 +22,6 @@ #include /* grr. try_to_release_page, do_invalidatepage */ #include -#include #include #include "internal.h" @@ -264,7 +263,6 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) */ folio_zero_range(folio, offset, length); - cleancache_invalidate_page(folio->mapping, &folio->page); if (folio_has_private(folio)) do_invalidatepage(&folio->page, offset, length); if (!folio_test_large(folio)) @@ -351,7 +349,7 @@ void truncate_inode_pages_range(struct address_space *mapping, bool same_folio; if (mapping_empty(mapping)) - goto out; + return; /* * 'start' and 'end' always covers the range of pages to be fully @@ -442,9 +440,6 @@ void truncate_inode_pages_range(struct address_space *mapping, folio_batch_release(&fbatch); index++; } - -out: - cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -498,10 +493,6 @@ void truncate_inode_pages_final(struct address_space *mapping) xa_unlock_irq(&mapping->i_pages); } - /* - * Cleancache needs notification even if there are no pages or shadow - * entries. - */ truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); @@ -661,7 +652,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int did_range_unmap = 0; if (mapping_empty(mapping)) - goto out; + return 0; folio_batch_init(&fbatch); index = start; @@ -725,8 +716,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (dax_mapping(mapping)) { unmap_mapping_pages(mapping, start, end - start + 1, false); } -out: - cleancache_invalidate_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); -- cgit v1.2.3-58-ga151 From 3d6035f136009f9cae380022754cba31f32570c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:38 -0800 Subject: frontswap: remove frontswap_writethrough frontswap_writethrough is never called, so remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/frontswap.rst | 6 ------ include/linux/frontswap.h | 1 - mm/frontswap.c | 23 +---------------------- 3 files changed, 1 insertion(+), 29 deletions(-) diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst index e2e5ab3e375e..2ab660651d04 100644 --- a/Documentation/vm/frontswap.rst +++ b/Documentation/vm/frontswap.rst @@ -39,12 +39,6 @@ a disk write and, if the data is later read back, a disk read are avoided. If a store returns failure, transcendent memory has rejected the data, and the page can be written to swap as usual. -If a backend chooses, frontswap can be configured as a "writethrough -cache" by calling frontswap_writethrough(). In this mode, the reduction -in swap device writes is lost (and also a non-trivial performance advantage) -in order to allow the backend to arbitrarily "reclaim" space used to -store frontswap pages to more completely manage its memory usage. - Note that if a page is stored and the page already exists in transcendent memory (a "duplicate" store), either the store succeeds and the data is overwritten, or the store fails AND the page is invalidated. This ensures stale data may diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index b07d88c92bb2..4a03fda41572 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -26,7 +26,6 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); -extern void frontswap_writethrough(bool); #define FRONTSWAP_HAS_EXCLUSIVE_GETS extern void frontswap_tmem_exclusive_gets(bool); diff --git a/mm/frontswap.c b/mm/frontswap.c index 6bed12260dea..51a662a83955 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -32,16 +32,6 @@ static struct frontswap_ops *frontswap_ops __read_mostly; #define for_each_frontswap_ops(ops) \ for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) -/* - * If enabled, frontswap_store will return failure even on success. As - * a result, the swap subsystem will always write the page to swap, in - * effect converting frontswap into a writethrough cache. In this mode, - * there is no direct reduction in swap writes, but a frontswap backend - * can unilaterally "reclaim" any pages in use with no data loss, thus - * providing increases control over maximum memory usage due to frontswap. - */ -static bool frontswap_writethrough_enabled __read_mostly; - /* * If enabled, the underlying tmem implementation is capable of doing * exclusive gets, so frontswap_load, on a successful tmem_get must @@ -170,15 +160,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) } EXPORT_SYMBOL(frontswap_register_ops); -/* - * Enable/disable frontswap writethrough (see above). - */ -void frontswap_writethrough(bool enable) -{ - frontswap_writethrough_enabled = enable; -} -EXPORT_SYMBOL(frontswap_writethrough); - /* * Enable/disable frontswap exclusive gets (see above). */ @@ -283,9 +264,7 @@ int __frontswap_store(struct page *page) } else { inc_frontswap_failed_stores(); } - if (frontswap_writethrough_enabled) - /* report failure so swap also writes to swap device */ - ret = -1; + return ret; } EXPORT_SYMBOL(__frontswap_store); -- cgit v1.2.3-58-ga151 From 71024cb4a0bfe7767aec7a128d0a1a13a37b7fcd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:41 -0800 Subject: frontswap: remove frontswap_tmem_exclusive_gets frontswap_tmem_exclusive_gets is never called, so remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 2 -- mm/frontswap.c | 23 +---------------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 4a03fda41572..83a56392cc7f 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -26,8 +26,6 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); -#define FRONTSWAP_HAS_EXCLUSIVE_GETS -extern void frontswap_tmem_exclusive_gets(bool); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); extern void __frontswap_init(unsigned type, unsigned long *map); diff --git a/mm/frontswap.c b/mm/frontswap.c index 51a662a83955..dba7f087ee86 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -32,13 +32,6 @@ static struct frontswap_ops *frontswap_ops __read_mostly; #define for_each_frontswap_ops(ops) \ for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) -/* - * If enabled, the underlying tmem implementation is capable of doing - * exclusive gets, so frontswap_load, on a successful tmem_get must - * mark the page as no longer in frontswap AND mark it dirty. - */ -static bool frontswap_tmem_exclusive_gets_enabled __read_mostly; - #ifdef CONFIG_DEBUG_FS /* * Counters available via /sys/kernel/debug/frontswap (if debugfs is @@ -160,15 +153,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) } EXPORT_SYMBOL(frontswap_register_ops); -/* - * Enable/disable frontswap exclusive gets (see above). - */ -void frontswap_tmem_exclusive_gets(bool enable) -{ - frontswap_tmem_exclusive_gets_enabled = enable; -} -EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); - /* * Called when a swap device is swapon'd. */ @@ -296,13 +280,8 @@ int __frontswap_load(struct page *page) if (!ret) /* successful load */ break; } - if (ret == 0) { + if (ret == 0) inc_frontswap_loads(); - if (frontswap_tmem_exclusive_gets_enabled) { - SetPageDirty(page); - __frontswap_clear(sis, offset); - } - } return ret; } EXPORT_SYMBOL(__frontswap_load); -- cgit v1.2.3-58-ga151 From 0b364446d734da76e421dbfb09e5268270cefaf0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:44 -0800 Subject: frontswap: remove frontswap_shrink frontswap_shrink is never called, so remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/frontswap.rst | 13 ------- include/linux/frontswap.h | 1 - mm/frontswap.c | 83 ------------------------------------------ 3 files changed, 97 deletions(-) diff --git a/Documentation/vm/frontswap.rst b/Documentation/vm/frontswap.rst index 2ab660651d04..feecc5e24477 100644 --- a/Documentation/vm/frontswap.rst +++ b/Documentation/vm/frontswap.rst @@ -255,19 +255,6 @@ the old data and ensure that it is no longer accessible. Since the swap subsystem then writes the new data to the read swap device, this is the correct course of action to ensure coherency. -* What is frontswap_shrink for? - -When the (non-frontswap) swap subsystem swaps out a page to a real -swap device, that page is only taking up low-value pre-allocated disk -space. But if frontswap has placed a page in transcendent memory, that -page may be taking up valuable real estate. The frontswap_shrink -routine allows code outside of the swap subsystem to force pages out -of the memory managed by frontswap and back into kernel-addressable memory. -For example, in RAMster, a "suction driver" thread will attempt -to "repatriate" pages sent to a remote machine back to the local machine; -this is driven using the frontswap_shrink mechanism when memory pressure -subsides. - * Why does the frontswap patch create the new include file swapfile.h? The frontswap code depends on some swap-subsystem-internal data diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 83a56392cc7f..d268d7bb6513 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -24,7 +24,6 @@ struct frontswap_ops { }; extern void frontswap_register_ops(struct frontswap_ops *ops); -extern void frontswap_shrink(unsigned long); extern unsigned long frontswap_curr_pages(void); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); diff --git a/mm/frontswap.c b/mm/frontswap.c index dba7f087ee86..a77ebba6101b 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -341,89 +341,6 @@ static unsigned long __frontswap_curr_pages(void) return totalpages; } -static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, - int *swapid) -{ - int ret = -EINVAL; - struct swap_info_struct *si = NULL; - int si_frontswap_pages; - unsigned long total_pages_to_unuse = total; - unsigned long pages = 0, pages_to_unuse = 0; - - assert_spin_locked(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) { - si_frontswap_pages = atomic_read(&si->frontswap_pages); - if (total_pages_to_unuse < si_frontswap_pages) { - pages = pages_to_unuse = total_pages_to_unuse; - } else { - pages = si_frontswap_pages; - pages_to_unuse = 0; /* unuse all */ - } - /* ensure there is enough RAM to fetch pages from frontswap */ - if (security_vm_enough_memory_mm(current->mm, pages)) { - ret = -ENOMEM; - continue; - } - vm_unacct_memory(pages); - *unused = pages_to_unuse; - *swapid = si->type; - ret = 0; - break; - } - - return ret; -} - -/* - * Used to check if it's necessary and feasible to unuse pages. - * Return 1 when nothing to do, 0 when need to shrink pages, - * error code when there is an error. - */ -static int __frontswap_shrink(unsigned long target_pages, - unsigned long *pages_to_unuse, - int *type) -{ - unsigned long total_pages = 0, total_pages_to_unuse; - - assert_spin_locked(&swap_lock); - - total_pages = __frontswap_curr_pages(); - if (total_pages <= target_pages) { - /* Nothing to do */ - *pages_to_unuse = 0; - return 1; - } - total_pages_to_unuse = total_pages - target_pages; - return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); -} - -/* - * Frontswap, like a true swap device, may unnecessarily retain pages - * under certain circumstances; "shrink" frontswap is essentially a - * "partial swapoff" and works by calling try_to_unuse to attempt to - * unuse enough frontswap pages to attempt to -- subject to memory - * constraints -- reduce the number of pages in frontswap to the - * number given in the parameter target_pages. - */ -void frontswap_shrink(unsigned long target_pages) -{ - unsigned long pages_to_unuse = 0; - int type, ret; - - /* - * we don't want to hold swap_lock while doing a very - * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_active_head each time - */ - spin_lock(&swap_lock); - ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); - spin_unlock(&swap_lock); - if (ret == 0) - try_to_unuse(type, true, pages_to_unuse); - return; -} -EXPORT_SYMBOL(frontswap_shrink); - /* * Count and return the number of frontswap pages across all * swap devices. This is exported so that backend drivers can -- cgit v1.2.3-58-ga151 From 3e8e1af63d7a831f576477c25d9b89049bd2d53d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:47 -0800 Subject: frontswap: remove frontswap_curr_pages frontswap_curr_pages is never called, so remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 1 - mm/frontswap.c | 28 ---------------------------- 2 files changed, 29 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index d268d7bb6513..5205c2977b20 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -24,7 +24,6 @@ struct frontswap_ops { }; extern void frontswap_register_ops(struct frontswap_ops *ops); -extern unsigned long frontswap_curr_pages(void); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); extern void __frontswap_init(unsigned type, unsigned long *map); diff --git a/mm/frontswap.c b/mm/frontswap.c index a77ebba6101b..af8f68d0e5cc 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -330,34 +330,6 @@ void __frontswap_invalidate_area(unsigned type) } EXPORT_SYMBOL(__frontswap_invalidate_area); -static unsigned long __frontswap_curr_pages(void) -{ - unsigned long totalpages = 0; - struct swap_info_struct *si = NULL; - - assert_spin_locked(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) - totalpages += atomic_read(&si->frontswap_pages); - return totalpages; -} - -/* - * Count and return the number of frontswap pages across all - * swap devices. This is exported so that backend drivers can - * determine current usage without reading debugfs. - */ -unsigned long frontswap_curr_pages(void) -{ - unsigned long totalpages = 0; - - spin_lock(&swap_lock); - totalpages = __frontswap_curr_pages(); - spin_unlock(&swap_lock); - - return totalpages; -} -EXPORT_SYMBOL(frontswap_curr_pages); - static int __init init_frontswap(void) { #ifdef CONFIG_DEBUG_FS -- cgit v1.2.3-58-ga151 From 1cf53c894d15dd4b73397a56fa055d76d3db66b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:51 -0800 Subject: frontswap: simplify frontswap_init Just use IS_ENABLED() and remove the __frontswap_init indirection. Also remove the unused export. Link: https://lkml.kernel.org/r/20211224062246.1258487-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 9 +-------- mm/frontswap.c | 3 +-- mm/swapfile.c | 3 ++- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 5205c2977b20..73d7beb44f2b 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -26,7 +26,7 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); -extern void __frontswap_init(unsigned type, unsigned long *map); +extern void frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); extern int __frontswap_load(struct page *page); extern void __frontswap_invalidate_page(unsigned, pgoff_t); @@ -107,11 +107,4 @@ static inline void frontswap_invalidate_area(unsigned type) __frontswap_invalidate_area(type); } -static inline void frontswap_init(unsigned type, unsigned long *map) -{ -#ifdef CONFIG_FRONTSWAP - __frontswap_init(type, map); -#endif -} - #endif /* _LINUX_FRONTSWAP_H */ diff --git a/mm/frontswap.c b/mm/frontswap.c index af8f68d0e5cc..132d6ad6d70b 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -156,7 +156,7 @@ EXPORT_SYMBOL(frontswap_register_ops); /* * Called when a swap device is swapon'd. */ -void __frontswap_init(unsigned type, unsigned long *map) +void frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; struct frontswap_ops *ops; @@ -179,7 +179,6 @@ void __frontswap_init(unsigned type, unsigned long *map) for_each_frontswap_ops(ops) ops->init(type); } -EXPORT_SYMBOL(__frontswap_init); bool __frontswap_test(struct swap_info_struct *sis, pgoff_t offset) diff --git a/mm/swapfile.c b/mm/swapfile.c index caa9f81a0d15..df5930ccd93d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2463,7 +2463,8 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, struct swap_cluster_info *cluster_info, unsigned long *frontswap_map) { - frontswap_init(p->type, frontswap_map); + if (IS_ENABLED(CONFIG_FRONTSWAP)) + frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); -- cgit v1.2.3-58-ga151 From 360be5daa33fe74fc472195242ae2a2337c4320b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:54 -0800 Subject: frontswap: remove the frontswap exports None of the frontswap API is called from modular code. Link: https://lkml.kernel.org/r/20211224062246.1258487-8-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/frontswap.c b/mm/frontswap.c index 132d6ad6d70b..42d554da53bb 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -151,7 +151,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) } } } -EXPORT_SYMBOL(frontswap_register_ops); /* * Called when a swap device is swapon'd. @@ -187,7 +186,6 @@ bool __frontswap_test(struct swap_info_struct *sis, return test_bit(offset, sis->frontswap_map); return false; } -EXPORT_SYMBOL(__frontswap_test); static inline void __frontswap_set(struct swap_info_struct *sis, pgoff_t offset) @@ -250,7 +248,6 @@ int __frontswap_store(struct page *page) return ret; } -EXPORT_SYMBOL(__frontswap_store); /* * "Get" data from frontswap associated with swaptype and offset that were @@ -283,7 +280,6 @@ int __frontswap_load(struct page *page) inc_frontswap_loads(); return ret; } -EXPORT_SYMBOL(__frontswap_load); /* * Invalidate any data from frontswap associated with the specified swaptype @@ -305,7 +301,6 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) __frontswap_clear(sis, offset); inc_frontswap_invalidates(); } -EXPORT_SYMBOL(__frontswap_invalidate_page); /* * Invalidate all data from frontswap associated with all offsets for the @@ -327,7 +322,6 @@ void __frontswap_invalidate_area(unsigned type) atomic_set(&sis->frontswap_pages, 0); bitmap_zero(sis->frontswap_map, sis->max); } -EXPORT_SYMBOL(__frontswap_invalidate_area); static int __init init_frontswap(void) { -- cgit v1.2.3-58-ga151 From 10a9c496789fe2098bfc018650fc77b23ba08a54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:14:57 -0800 Subject: mm: simplify try_to_unuse Remove the unused frontswap and pages_to_unuse arguments, and mark the function static now that the caller in frontswap is gone. [akpm@linux-foundation.org: fix shmem_unuse() stub, per Matthew] Link: https://lkml.kernel.org/r/20211224062246.1258487-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 7 ---- include/linux/shmem_fs.h | 3 +- include/linux/swapfile.h | 1 - mm/shmem.c | 33 ++++--------------- mm/swapfile.c | 83 +++++++++++++---------------------------------- 5 files changed, 30 insertions(+), 97 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 73d7beb44f2b..a9817d4fa74c 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -7,13 +7,6 @@ #include #include -/* - * Return code to denote that requested number of - * frontswap pages are unused(moved to page cache). - * Used in shmem_unuse and try_to_unuse. - */ -#define FRONTSWAP_PAGES_UNUSED 2 - struct frontswap_ops { void (*init)(unsigned); /* this swap type was just swapon'ed */ int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 166158b6e917..e65b80ed09e7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -83,8 +83,7 @@ extern void shmem_unlock_mapping(struct address_space *mapping); extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); -extern int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse); +int shmem_unuse(unsigned int type); extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, pgoff_t index); diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index e06febf62978..809cd01ef2c5 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -9,7 +9,6 @@ extern spinlock_t swap_lock; extern struct plist_head swap_active_head; extern struct swap_info_struct *swap_info[]; -extern int try_to_unuse(unsigned int, bool, unsigned long); extern unsigned long generic_max_swapfile_size(void); extern unsigned long max_swapfile_size(void); diff --git a/mm/shmem.c b/mm/shmem.c index 66909efd0a1b..a09b29ec2b45 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include @@ -1152,7 +1151,7 @@ static void shmem_evict_inode(struct inode *inode) static int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices, - unsigned int type, bool frontswap) + unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; @@ -1173,9 +1172,6 @@ static int shmem_find_swap_entries(struct address_space *mapping, entry = radix_to_swp_entry(page); if (swp_type(entry) != type) continue; - if (frontswap && - !frontswap_test(swap_info[type], swp_offset(entry))) - continue; indices[ret] = xas.xa_index; entries[ret] = page; @@ -1228,26 +1224,20 @@ static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, /* * If swap found in inode, free it and move page from swapcache to filecache. */ -static int shmem_unuse_inode(struct inode *inode, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; - bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); int ret = 0; pagevec_init(&pvec); do { unsigned int nr_entries = PAGEVEC_SIZE; - if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) - nr_entries = *fs_pages_to_unuse; - pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, - pvec.pages, indices, - type, frontswap); + pvec.pages, indices, type); if (pvec.nr == 0) { ret = 0; break; @@ -1257,14 +1247,6 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type, if (ret < 0) break; - if (frontswap_partial) { - *fs_pages_to_unuse -= ret; - if (*fs_pages_to_unuse == 0) { - ret = FRONTSWAP_PAGES_UNUSED; - break; - } - } - start = indices[pvec.nr - 1]; } while (true); @@ -1276,8 +1258,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type, * device 'type' back into memory, so the swap device can be * unused. */ -int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) +int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; @@ -1300,8 +1281,7 @@ int shmem_unuse(unsigned int type, bool frontswap, atomic_inc(&info->stop_eviction); mutex_unlock(&shmem_swaplist_mutex); - error = shmem_unuse_inode(&info->vfs_inode, type, frontswap, - fs_pages_to_unuse); + error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); mutex_lock(&shmem_swaplist_mutex); @@ -4015,8 +3995,7 @@ int __init shmem_init(void) return 0; } -int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) +int shmem_unuse(unsigned int type) { return 0; } diff --git a/mm/swapfile.c b/mm/swapfile.c index df5930ccd93d..82342c77791b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1923,8 +1923,7 @@ out: static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { struct page *page; swp_entry_t entry; @@ -1945,9 +1944,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; offset = swp_offset(entry); - if (frontswap && !frontswap_test(si, offset)) - continue; - pte_unmap(pte); swap_map = &si->swap_map[offset]; page = lookup_swap_cache(entry, vma, addr); @@ -1979,11 +1975,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, try_to_free_swap(page); unlock_page(page); put_page(page); - - if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { - ret = FRONTSWAP_PAGES_UNUSED; - goto out; - } try_next: pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1996,8 +1987,7 @@ out: static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { pmd_t *pmd; unsigned long next; @@ -2009,8 +1999,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - ret = unuse_pte_range(vma, pmd, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_pte_range(vma, pmd, addr, next, type); if (ret) return ret; } while (pmd++, addr = next, addr != end); @@ -2019,8 +2008,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { pud_t *pud; unsigned long next; @@ -2031,8 +2019,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - ret = unuse_pmd_range(vma, pud, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_pmd_range(vma, pud, addr, next, type); if (ret) return ret; } while (pud++, addr = next, addr != end); @@ -2041,8 +2028,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) + unsigned int type) { p4d_t *p4d; unsigned long next; @@ -2053,16 +2039,14 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; - ret = unuse_pud_range(vma, p4d, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_pud_range(vma, p4d, addr, next, type); if (ret) return ret; } while (p4d++, addr = next, addr != end); return 0; } -static int unuse_vma(struct vm_area_struct *vma, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int unuse_vma(struct vm_area_struct *vma, unsigned int type) { pgd_t *pgd; unsigned long addr, end, next; @@ -2076,16 +2060,14 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned int type, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_p4d_range(vma, pgd, addr, next, type, - frontswap, fs_pages_to_unuse); + ret = unuse_p4d_range(vma, pgd, addr, next, type); if (ret) return ret; } while (pgd++, addr = next, addr != end); return 0; } -static int unuse_mm(struct mm_struct *mm, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; @@ -2093,8 +2075,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type, mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma) { - ret = unuse_vma(vma, type, frontswap, - fs_pages_to_unuse); + ret = unuse_vma(vma, type); if (ret) break; } @@ -2110,7 +2091,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type, * if there are no inuse entries after prev till end of the map. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev, bool frontswap) + unsigned int prev) { unsigned int i; unsigned char count; @@ -2124,8 +2105,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, for (i = prev + 1; i < si->max; i++) { count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) - if (!frontswap || frontswap_test(si, i)) - break; + break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); } @@ -2136,12 +2116,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, return i; } -/* - * If the boolean frontswap is true, only unuse pages_to_unuse pages; - * pages_to_unuse==0 means all pages; ignored if frontswap is false - */ -int try_to_unuse(unsigned int type, bool frontswap, - unsigned long pages_to_unuse) +static int try_to_unuse(unsigned int type) { struct mm_struct *prev_mm; struct mm_struct *mm; @@ -2155,13 +2130,10 @@ int try_to_unuse(unsigned int type, bool frontswap, if (!READ_ONCE(si->inuse_pages)) return 0; - if (!frontswap) - pages_to_unuse = 0; - retry: - retval = shmem_unuse(type, frontswap, &pages_to_unuse); + retval = shmem_unuse(type); if (retval) - goto out; + return retval; prev_mm = &init_mm; mmget(prev_mm); @@ -2178,11 +2150,10 @@ retry: spin_unlock(&mmlist_lock); mmput(prev_mm); prev_mm = mm; - retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); - + retval = unuse_mm(mm, type); if (retval) { mmput(prev_mm); - goto out; + return retval; } /* @@ -2199,7 +2170,7 @@ retry: i = 0; while (READ_ONCE(si->inuse_pages) && !signal_pending(current) && - (i = find_next_to_unuse(si, i, frontswap)) != 0) { + (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); page = find_get_page(swap_address_space(entry), i); @@ -2217,14 +2188,6 @@ retry: try_to_free_swap(page); unlock_page(page); put_page(page); - - /* - * For frontswap, we just need to unuse pages_to_unuse, if - * it was specified. Need not check frontswap again here as - * we already zeroed out pages_to_unuse if not frontswap. - */ - if (pages_to_unuse && --pages_to_unuse == 0) - goto out; } /* @@ -2242,10 +2205,10 @@ retry: if (READ_ONCE(si->inuse_pages)) { if (!signal_pending(current)) goto retry; - retval = -EINTR; + return -EINTR; } -out: - return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; + + return 0; } /* @@ -2577,7 +2540,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) disable_swap_slots_cache_lock(); set_current_oom_origin(); - err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ + err = try_to_unuse(p->type); clear_current_oom_origin(); if (err) { -- cgit v1.2.3-58-ga151 From bd9cd521496ba8d537d8f46f4167bf4221aba9a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:01 -0800 Subject: frontswap: remove frontswap_test frontswap_test is unused now, remove it. Link: https://lkml.kernel.org/r/20211224062246.1258487-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 11 ----------- mm/frontswap.c | 2 +- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index a9817d4fa74c..c5b2848d2240 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -18,7 +18,6 @@ struct frontswap_ops { extern void frontswap_register_ops(struct frontswap_ops *ops); -extern bool __frontswap_test(struct swap_info_struct *, pgoff_t); extern void frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); extern int __frontswap_load(struct page *page); @@ -33,11 +32,6 @@ static inline bool frontswap_enabled(void) return static_branch_unlikely(&frontswap_enabled_key); } -static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) -{ - return __frontswap_test(sis, offset); -} - static inline void frontswap_map_set(struct swap_info_struct *p, unsigned long *map) { @@ -56,11 +50,6 @@ static inline bool frontswap_enabled(void) return false; } -static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) -{ - return false; -} - static inline void frontswap_map_set(struct swap_info_struct *p, unsigned long *map) { diff --git a/mm/frontswap.c b/mm/frontswap.c index 42d554da53bb..f51159f0d75d 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -179,7 +179,7 @@ void frontswap_init(unsigned type, unsigned long *map) ops->init(type); } -bool __frontswap_test(struct swap_info_struct *sis, +static bool __frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { if (sis->frontswap_map) -- cgit v1.2.3-58-ga151 From f328c1d16e4c764992895ac9c9425cea861b2ca0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:04 -0800 Subject: frontswap: simplify frontswap_register_ops Given that frontswap_register_ops must be called from built-in code, there is no need to handle the case of swapfiles coming online before or during it, so delete the code that deals with that case. Link: https://lkml.kernel.org/r/20211224062246.1258487-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/mm/frontswap.c b/mm/frontswap.c index f51159f0d75d..35040fa4eba8 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -99,25 +99,6 @@ static inline void inc_frontswap_invalidates(void) { } */ void frontswap_register_ops(struct frontswap_ops *ops) { - DECLARE_BITMAP(a, MAX_SWAPFILES); - DECLARE_BITMAP(b, MAX_SWAPFILES); - struct swap_info_struct *si; - unsigned int i; - - bitmap_zero(a, MAX_SWAPFILES); - bitmap_zero(b, MAX_SWAPFILES); - - spin_lock(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) { - if (!WARN_ON(!si->frontswap_map)) - __set_bit(si->type, a); - } - spin_unlock(&swap_lock); - - /* the new ops needs to know the currently active swap devices */ - for_each_set_bit(i, a, MAX_SWAPFILES) - ops->init(i); - /* * Setting frontswap_ops must happen after the ops->init() calls * above; cmpxchg implies smp_mb() which will ensure the init is @@ -128,28 +109,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); static_branch_inc(&frontswap_enabled_key); - - spin_lock(&swap_lock); - plist_for_each_entry(si, &swap_active_head, list) { - if (si->frontswap_map) - __set_bit(si->type, b); - } - spin_unlock(&swap_lock); - - /* - * On the very unlikely chance that a swap device was added or - * removed between setting the "a" list bits and the ops init - * calls, we re-check and do init or invalidate for any changed - * bits. - */ - if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) { - for (i = 0; i < MAX_SWAPFILES; i++) { - if (!test_bit(i, a) && test_bit(i, b)) - ops->init(i); - else if (test_bit(i, a) && !test_bit(i, b)) - ops->invalidate_area(i); - } - } } /* -- cgit v1.2.3-58-ga151 From 633423a09cb5cfe61438283e1ce49c23cf4a0611 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:07 -0800 Subject: mm: mark swap_lock and swap_active_head static swap_lock and swap_active_head are only used in swapfile.c, so mark them static. Link: https://lkml.kernel.org/r/20211224062246.1258487-12-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapfile.h | 2 -- mm/swapfile.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 809cd01ef2c5..54078542134c 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -6,8 +6,6 @@ * these were static in swapfile.c but frontswap.c needs them and we don't * want to expose them to the dozens of source files that include swap.h */ -extern spinlock_t swap_lock; -extern struct plist_head swap_active_head; extern struct swap_info_struct *swap_info[]; extern unsigned long generic_max_swapfile_size(void); extern unsigned long max_swapfile_size(void); diff --git a/mm/swapfile.c b/mm/swapfile.c index 82342c77791b..bf0df7aa7158 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -49,7 +49,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -DEFINE_SPINLOCK(swap_lock); +static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; /* @@ -71,7 +71,7 @@ static const char Unused_offset[] = "Unused swap offset entry "; * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ -PLIST_HEAD(swap_active_head); +static PLIST_HEAD(swap_active_head); /* * all available (active, not full) swap_info_structs -- cgit v1.2.3-58-ga151 From 1da0d94a3ec8c5f3793b7be8538b55e60ebeefe3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:10 -0800 Subject: frontswap: remove support for multiple ops There is only a single instance of frontswap ops in the kernel, so simplify the frontswap code by removing support for multiple operations. Link: https://lkml.kernel.org/r/20211224062246.1258487-13-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/frontswap.h | 3 +-- mm/frontswap.c | 50 ++++++++++++----------------------------------- mm/zswap.c | 8 ++++++-- 3 files changed, 19 insertions(+), 42 deletions(-) diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index c5b2848d2240..a631bac12220 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -13,10 +13,9 @@ struct frontswap_ops { int (*load)(unsigned, pgoff_t, struct page *); /* load a page */ void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ - struct frontswap_ops *next; /* private pointer to next ops */ }; -extern void frontswap_register_ops(struct frontswap_ops *ops); +int frontswap_register_ops(const struct frontswap_ops *ops); extern void frontswap_init(unsigned type, unsigned long *map); extern int __frontswap_store(struct page *page); diff --git a/mm/frontswap.c b/mm/frontswap.c index 35040fa4eba8..6f69b044a8cc 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -27,10 +27,7 @@ DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key); * may be registered, but implementations can never deregister. This * is a simple singly-linked list of all registered implementations. */ -static struct frontswap_ops *frontswap_ops __read_mostly; - -#define for_each_frontswap_ops(ops) \ - for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) +static const struct frontswap_ops *frontswap_ops __read_mostly; #ifdef CONFIG_DEBUG_FS /* @@ -97,18 +94,14 @@ static inline void inc_frontswap_invalidates(void) { } /* * Register operations for frontswap */ -void frontswap_register_ops(struct frontswap_ops *ops) +int frontswap_register_ops(const struct frontswap_ops *ops) { - /* - * Setting frontswap_ops must happen after the ops->init() calls - * above; cmpxchg implies smp_mb() which will ensure the init is - * complete at this point. - */ - do { - ops->next = frontswap_ops; - } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); + if (frontswap_ops) + return -EINVAL; + frontswap_ops = ops; static_branch_inc(&frontswap_enabled_key); + return 0; } /* @@ -117,7 +110,6 @@ void frontswap_register_ops(struct frontswap_ops *ops) void frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; - struct frontswap_ops *ops; VM_BUG_ON(sis == NULL); @@ -133,9 +125,7 @@ void frontswap_init(unsigned type, unsigned long *map) * p->frontswap set to something valid to work properly. */ frontswap_map_set(sis, map); - - for_each_frontswap_ops(ops) - ops->init(type); + frontswap_ops->init(type); } static bool __frontswap_test(struct swap_info_struct *sis, @@ -174,7 +164,6 @@ int __frontswap_store(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(!PageLocked(page)); @@ -188,16 +177,10 @@ int __frontswap_store(struct page *page) */ if (__frontswap_test(sis, offset)) { __frontswap_clear(sis, offset); - for_each_frontswap_ops(ops) - ops->invalidate_page(type, offset); + frontswap_ops->invalidate_page(type, offset); } - /* Try to store in each implementation, until one succeeds. */ - for_each_frontswap_ops(ops) { - ret = ops->store(type, offset, page); - if (!ret) /* successful store */ - break; - } + ret = frontswap_ops->store(type, offset, page); if (ret == 0) { __frontswap_set(sis, offset); inc_frontswap_succ_stores(); @@ -220,7 +203,6 @@ int __frontswap_load(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(!PageLocked(page)); @@ -230,11 +212,7 @@ int __frontswap_load(struct page *page) return -1; /* Try loading from each implementation, until one succeeds. */ - for_each_frontswap_ops(ops) { - ret = ops->load(type, offset, page); - if (!ret) /* successful load */ - break; - } + ret = frontswap_ops->load(type, offset, page); if (ret == 0) inc_frontswap_loads(); return ret; @@ -247,7 +225,6 @@ int __frontswap_load(struct page *page) void __frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct swap_info_struct *sis = swap_info[type]; - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(sis == NULL); @@ -255,8 +232,7 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) if (!__frontswap_test(sis, offset)) return; - for_each_frontswap_ops(ops) - ops->invalidate_page(type, offset); + frontswap_ops->invalidate_page(type, offset); __frontswap_clear(sis, offset); inc_frontswap_invalidates(); } @@ -268,7 +244,6 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) void __frontswap_invalidate_area(unsigned type) { struct swap_info_struct *sis = swap_info[type]; - struct frontswap_ops *ops; VM_BUG_ON(!frontswap_ops); VM_BUG_ON(sis == NULL); @@ -276,8 +251,7 @@ void __frontswap_invalidate_area(unsigned type) if (sis->frontswap_map == NULL) return; - for_each_frontswap_ops(ops) - ops->invalidate_area(type); + frontswap_ops->invalidate_area(type); atomic_set(&sis->frontswap_pages, 0); bitmap_zero(sis->frontswap_map, sis->max); } diff --git a/mm/zswap.c b/mm/zswap.c index 7944e3e57e78..cdf6950fcb2e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1378,7 +1378,7 @@ static void zswap_frontswap_init(unsigned type) zswap_trees[type] = tree; } -static struct frontswap_ops zswap_frontswap_ops = { +static const struct frontswap_ops zswap_frontswap_ops = { .store = zswap_frontswap_store, .load = zswap_frontswap_load, .invalidate_page = zswap_frontswap_invalidate_page, @@ -1475,11 +1475,15 @@ static int __init init_zswap(void) if (!shrink_wq) goto fallback_fail; - frontswap_register_ops(&zswap_frontswap_ops); + ret = frontswap_register_ops(&zswap_frontswap_ops); + if (ret) + goto destroy_wq; if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); return 0; +destroy_wq: + destroy_workqueue(shrink_wq); fallback_fail: if (pool) zswap_pool_destroy(pool); -- cgit v1.2.3-58-ga151 From 6e61dde82e8bfe65e8ebbe43da45e615bc529236 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Jan 2022 22:15:14 -0800 Subject: mm: hide the FRONTSWAP Kconfig symbol Select FRONTSWAP from ZSWAP instead of prompting for it. Link: https://lkml.kernel.org/r/20211224062246.1258487-14-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Juergen Gross Cc: Dan Streetman Cc: Geert Uytterhoeven Cc: Hugh Dickins Cc: Konrad Rzeszutek Wilk Cc: Matthew Wilcox (Oracle) Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 430240289b02..3326ee3903f3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -445,20 +445,7 @@ config HAVE_SETUP_PER_CPU_AREA bool config FRONTSWAP - bool "Enable frontswap to cache swap pages if tmem is present" - depends on SWAP - help - Frontswap is so named because it can be thought of as the opposite - of a "backing" store for a swap device. The data is stored into - "transcendent memory", memory that is not directly accessible or - addressable by the kernel and is of unknown and possibly - time-varying size. When space in transcendent memory is available, - a significant swap I/O reduction may be achieved. When none is - available, all frontswap calls are reduced to a single pointer- - compare-against-NULL resulting in a negligible performance hit - and swap data is stored as normal on the matching swap device. - - If unsure, say Y to enable frontswap. + bool config CMA bool "Contiguous Memory Allocator" @@ -523,7 +510,8 @@ config MEM_SOFT_DIRTY config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" - depends on FRONTSWAP && CRYPTO=y + depends on SWAP && CRYPTO=y + select FRONTSWAP select ZPOOL help A lightweight compressed cache for swap pages. It takes -- cgit v1.2.3-58-ga151 From 10756dc5b02bff370ddd351d7744bc99ada659c2 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Jan 2022 04:24:02 +0300 Subject: usr/include/Makefile: add linux/nfc.h to the compile-test coverage As linux/nfc.h userspace compilation was finally fixed by commits 79b69a83705e ("nfc: uapi: use kernel size_t to fix user-space builds") and 7175f02c4e5f ("uapi: fix linux/nfc.h userspace compilation errors"), there is no need to keep the compile-test exception for it in usr/include/Makefile. Signed-off-by: Dmitry V. Levin Signed-off-by: Masahiro Yamada --- usr/include/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/usr/include/Makefile b/usr/include/Makefile index 94403806ea56..7be7468c177b 100644 --- a/usr/include/Makefile +++ b/usr/include/Makefile @@ -34,7 +34,6 @@ no-header-test += linux/hdlc/ioctl.h no-header-test += linux/ivtv.h no-header-test += linux/kexec.h no-header-test += linux/matroxfb.h -no-header-test += linux/nfc.h no-header-test += linux/omap3isp.h no-header-test += linux/omapfb.h no-header-test += linux/patchkey.h -- cgit v1.2.3-58-ga151 From e92e2634ef3a95376ad917452a476fccaff83fde Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 20 Jan 2022 14:31:00 +0900 Subject: Revert "Makefile: Do not quote value for CONFIG_CC_IMPLICIT_FALLTHROUGH" This reverts commit cd8c917a56f20f48748dd43d9ae3caff51d5b987. Commit 129ab0d2d9f3 ("kbuild: do not quote string values in include/config/auto.conf") provided the final solution. Now reverting the temporary workaround. Signed-off-by: Masahiro Yamada --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3f07f0f04475..c94559a97dca 100644 --- a/Makefile +++ b/Makefile @@ -778,7 +778,7 @@ stackp-flags-$(CONFIG_STACKPROTECTOR_STRONG) := -fstack-protector-strong KBUILD_CFLAGS += $(stackp-flags-y) KBUILD_CFLAGS-$(CONFIG_WERROR) += -Werror -KBUILD_CFLAGS += $(KBUILD_CFLAGS-y) $(CONFIG_CC_IMPLICIT_FALLTHROUGH:"%"=%) +KBUILD_CFLAGS += $(KBUILD_CFLAGS-y) $(CONFIG_CC_IMPLICIT_FALLTHROUGH) ifdef CONFIG_CC_IS_CLANG KBUILD_CPPFLAGS += -Qunused-arguments -- cgit v1.2.3-58-ga151 From ad29a2fb3c201ef066b0a9fe10a6e14dd0d59c48 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 21 Jan 2022 04:22:04 +0900 Subject: certs: Fix build error when CONFIG_MODULE_SIG_KEY is PKCS#11 URI When CONFIG_MODULE_SIG_KEY is PKCS#11 URL (pkcs11:*), signing_key.x509 fails to build: certs/Makefile:77: *** target pattern contains no '%'. Stop. Due to the typo, $(X509_DEP) contains a colon. Fix it. Fixes: b8c96a6b466c ("certs: simplify $(srctree)/ handling and remove config_filename macro") Signed-off-by: Masahiro Yamada --- certs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/certs/Makefile b/certs/Makefile index f7041c29a2e0..0c459cfd09df 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -68,7 +68,7 @@ $(obj)/x509.genkey: endif # CONFIG_MODULE_SIG_KEY # If CONFIG_MODULE_SIG_KEY isn't a PKCS#11 URI, depend on it -ifneq ($(filter-out pkcs11:%, %(CONFIG_MODULE_SIG_KEY)),) +ifneq ($(filter-out pkcs11:%, $(CONFIG_MODULE_SIG_KEY)),) X509_DEP := $(CONFIG_MODULE_SIG_KEY) endif -- cgit v1.2.3-58-ga151 From e6340b6526eeec5a00fe26a6ff515afe7d0affa4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 21 Jan 2022 04:22:05 +0900 Subject: certs: Fix build error when CONFIG_MODULE_SIG_KEY is empty Since b8c96a6b466c ("certs: simplify $(srctree)/ handling and remove config_filename macro"), when CONFIG_MODULE_SIG_KEY is empty, signing_key.x509 fails to build: CERT certs/signing_key.x509 Usage: extract-cert make[1]: *** [certs/Makefile:78: certs/signing_key.x509] Error 2 make: *** [Makefile:1831: certs] Error 2 Pass "" to the first argument of extract-cert to fix the build error. Link: https://lore.kernel.org/linux-kbuild/20220120094606.2skuyb26yjlnu66q@lion.mk-sys.cz/T/#u Fixes: b8c96a6b466c ("certs: simplify $(srctree)/ handling and remove config_filename macro") Reported-by: Michal Kubecek Signed-off-by: Masahiro Yamada Tested-by: Michal Kubecek --- certs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/certs/Makefile b/certs/Makefile index 0c459cfd09df..3ea7fe60823f 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -75,7 +75,7 @@ endif $(obj)/system_certificates.o: $(obj)/signing_key.x509 $(obj)/signing_key.x509: $(X509_DEP) $(obj)/extract-cert FORCE - $(call if_changed,extract_certs,$(if $(X509_DEP),$<,$(CONFIG_MODULE_SIG_KEY))) + $(call if_changed,extract_certs,$(if $(CONFIG_MODULE_SIG_KEY),$(if $(X509_DEP),$<,$(CONFIG_MODULE_SIG_KEY)),"")) endif # CONFIG_MODULE_SIG targets += signing_key.x509 -- cgit v1.2.3-58-ga151 From 9edcde68d653e1f8f895fbb69a0043c6a56ae35e Mon Sep 17 00:00:00 2001 From: Yao Jin Date: Fri, 21 Jan 2022 14:59:54 +0800 Subject: perf script: Fix printing 'phys_addr' failure issue Perf script was failed to print the phys_addr for SPE profiling. One 'dummy' event is added by SPE profiling but it doesn't have PHYS_ADDR attribute set, perf script then exits with error. Now referring to 'addr', use evsel__do_check_stype() to check the type. Before: # perf record -e arm_spe_0/branch_filter=0,ts_enable=1,pa_enable=1,load_filter=1,jitter=0,\ store_filter=0,min_latency=0,event_filter=2/ -p 4064384 -- sleep 3 # perf script -F pid,tid,addr,phys_addr Samples for 'dummy:u' event do not have PHYS_ADDR attribute set. Cannot print 'phys_addr' field. After: # perf record -e arm_spe_0/branch_filter=0,ts_enable=1,pa_enable=1,load_filter=1,jitter=0,\ store_filter=0,min_latency=0,event_filter=2/ -p 4064384 -- sleep 3 # perf script -F pid,tid,addr,phys_addr 4064384/4064384 ffff802f921be0d0 2f921be0d0 4064384/4064384 ffff802f921be0d0 2f921be0d0 Reviewed-by: German Gomez Signed-off-by: Yao Jin Cc: Alexander Shishkin Cc: Hanjun Guo Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20220121065954.2121900-1-liwei391@huawei.com Signed-off-by: Wei Li Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index ecd4f99a6c14..abae8184e171 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -515,7 +515,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) return -EINVAL; if (PRINT_FIELD(PHYS_ADDR) && - evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR)) + evsel__do_check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", PERF_OUTPUT_PHYS_ADDR, allow_user_set)) return -EINVAL; if (PRINT_FIELD(DATA_PAGE_SIZE) && -- cgit v1.2.3-58-ga151 From 1d1d9af254ffc3bc38c59484c50c600d1d0c96da Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 21 Jan 2022 20:58:09 -0800 Subject: perf python: Fix cpu_map__item() building MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Value should be built as an integer. Switch some uses of perf_cpu_map to use the library API. Fixes: 6d18804b963b78dc ("perf cpumap: Give CPUs their own type") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Andi Kleen Cc: André Almeida Cc: Andrew Morton Cc: Andy Shevchenko Cc: Darren Hart Cc: Davidlohr Bueso Cc: Dmitriy Vyukov Cc: Eric Dumazet Cc: German Gomez Cc: Ian Rogers Cc: James Clark Cc: Jin Yao Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miaoqian Lin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Cc: Shunsuke Nakamura Cc: Song Liu Cc: Stephane Eranian Cc: Stephen Brennan Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Thomas Richter Cc: Yury Norov Link: http://lore.kernel.org/lkml/20220122045811.3402706-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/python.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index f3e5131f183c..52d8995cfd73 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -638,17 +638,17 @@ static Py_ssize_t pyrf_cpu_map__length(PyObject *obj) { struct pyrf_cpu_map *pcpus = (void *)obj; - return pcpus->cpus->nr; + return perf_cpu_map__nr(pcpus->cpus); } static PyObject *pyrf_cpu_map__item(PyObject *obj, Py_ssize_t i) { struct pyrf_cpu_map *pcpus = (void *)obj; - if (i >= pcpus->cpus->nr) + if (i >= perf_cpu_map__nr(pcpus->cpus)) return NULL; - return Py_BuildValue("i", pcpus->cpus->map[i]); + return Py_BuildValue("i", perf_cpu_map__cpu(pcpus->cpus, i).cpu); } static PySequenceMethods pyrf_cpu_map__sequence_methods = { -- cgit v1.2.3-58-ga151 From 440286993960bea4aa09d912a5497d92d09ae54c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 21 Jan 2022 20:58:10 -0800 Subject: perf cpumap: Migrate to libperf cpumap api MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch from directly accessing the perf_cpu_map to using the appropriate libperf API when possible. Using the API simplifies the job of refactoring use of perf_cpu_map. Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alexey Bayduraev Cc: Andi Kleen Cc: Andrew Morton Cc: André Almeida Cc: Andy Shevchenko Cc: Darren Hart Cc: Davidlohr Bueso Cc: Dmitriy Vyukov Cc: Eric Dumazet Cc: German Gomez Cc: James Clark Cc: Jin Yao Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kan Liang Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Miaoqian Lin Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Riccardo Mancini Cc: Shunsuke Nakamura Cc: Song Liu Cc: Stephane Eranian Cc: Stephen Brennan Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Thomas Richter Cc: Yury Norov Link: http://lore.kernel.org/lkml/20220122045811.3402706-3-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/evsel.c | 4 +-- tools/perf/bench/epoll-ctl.c | 2 +- tools/perf/bench/epoll-wait.c | 2 +- tools/perf/bench/evlist-open-close.c | 4 +-- tools/perf/bench/futex-hash.c | 2 +- tools/perf/bench/futex-lock-pi.c | 2 +- tools/perf/bench/futex-requeue.c | 2 +- tools/perf/bench/futex-wake-parallel.c | 2 +- tools/perf/bench/futex-wake.c | 2 +- tools/perf/builtin-ftrace.c | 2 +- tools/perf/builtin-stat.c | 7 ++-- tools/perf/tests/bitmap.c | 4 +-- tools/perf/tests/event_update.c | 8 ++--- tools/perf/tests/mem2node.c | 9 +++--- tools/perf/tests/mmap-basic.c | 5 +-- tools/perf/tests/topology.c | 37 ++++++++++++---------- tools/perf/util/auxtrace.c | 2 +- tools/perf/util/counts.c | 2 +- tools/perf/util/cpumap.h | 2 +- tools/perf/util/cputopo.c | 4 +-- tools/perf/util/evlist-hybrid.c | 11 ++++--- tools/perf/util/evsel.c | 20 ++++++------ tools/perf/util/evsel.h | 3 +- tools/perf/util/mmap.c | 2 +- tools/perf/util/perf_api_probe.c | 4 +-- tools/perf/util/record.c | 6 ++-- .../util/scripting-engines/trace-event-python.c | 4 +-- tools/perf/util/session.c | 4 +-- tools/perf/util/svghelper.c | 4 +-- tools/perf/util/synthetic-events.c | 18 +++++------ tools/perf/util/top.c | 6 ++-- 31 files changed, 99 insertions(+), 87 deletions(-) diff --git a/tools/lib/perf/evsel.c b/tools/lib/perf/evsel.c index 7ea86a44eae5..210ea7c06ce8 100644 --- a/tools/lib/perf/evsel.c +++ b/tools/lib/perf/evsel.c @@ -141,7 +141,7 @@ int perf_evsel__open(struct perf_evsel *evsel, struct perf_cpu_map *cpus, } if (evsel->fd == NULL && - perf_evsel__alloc_fd(evsel, cpus->nr, threads->nr) < 0) + perf_evsel__alloc_fd(evsel, perf_cpu_map__nr(cpus), threads->nr) < 0) return -ENOMEM; perf_cpu_map__for_each_cpu(cpu, idx, cpus) { @@ -384,7 +384,7 @@ int perf_evsel__apply_filter(struct perf_evsel *evsel, const char *filter) { int err = 0, i; - for (i = 0; i < evsel->cpus->nr && !err; i++) + for (i = 0; i < perf_cpu_map__nr(evsel->cpus) && !err; i++) err = perf_evsel__run_ioctl(evsel, PERF_EVENT_IOC_SET_FILTER, (void *)filter, i); diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c index 1a17ec83d3c4..740ae764537e 100644 --- a/tools/perf/bench/epoll-ctl.c +++ b/tools/perf/bench/epoll-ctl.c @@ -333,7 +333,7 @@ int bench_epoll_ctl(int argc, const char **argv) /* default to the number of CPUs */ if (!nthreads) - nthreads = cpu->nr; + nthreads = perf_cpu_map__nr(cpu); worker = calloc(nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c index 0d1dd8879197..37de970c9743 100644 --- a/tools/perf/bench/epoll-wait.c +++ b/tools/perf/bench/epoll-wait.c @@ -452,7 +452,7 @@ int bench_epoll_wait(int argc, const char **argv) /* default to the number of CPUs and leave one for the writer pthread */ if (!nthreads) - nthreads = cpu->nr - 1; + nthreads = perf_cpu_map__nr(cpu) - 1; worker = calloc(nthreads, sizeof(*worker)); if (!worker) { diff --git a/tools/perf/bench/evlist-open-close.c b/tools/perf/bench/evlist-open-close.c index 482738e9bdad..de56601f69ee 100644 --- a/tools/perf/bench/evlist-open-close.c +++ b/tools/perf/bench/evlist-open-close.c @@ -71,7 +71,7 @@ static int evlist__count_evsel_fds(struct evlist *evlist) int cnt = 0; evlist__for_each_entry(evlist, evsel) - cnt += evsel->core.threads->nr * evsel->core.cpus->nr; + cnt += evsel->core.threads->nr * perf_cpu_map__nr(evsel->core.cpus); return cnt; } @@ -151,7 +151,7 @@ static int bench_evlist_open_close__run(char *evstr) init_stats(&time_stats); - printf(" Number of cpus:\t%d\n", evlist->core.cpus->nr); + printf(" Number of cpus:\t%d\n", perf_cpu_map__nr(evlist->core.cpus)); printf(" Number of threads:\t%d\n", evlist->core.threads->nr); printf(" Number of events:\t%d (%d fds)\n", evlist->core.nr_entries, evlist__count_evsel_fds(evlist)); diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index 9627b6ab8670..dbcecec4eeda 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -150,7 +150,7 @@ int bench_futex_hash(int argc, const char **argv) } if (!params.nthreads) /* default to the number of CPUs */ - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c index a512a320df74..6fc9a3d55c1f 100644 --- a/tools/perf/bench/futex-lock-pi.c +++ b/tools/perf/bench/futex-lock-pi.c @@ -173,7 +173,7 @@ int bench_futex_lock_pi(int argc, const char **argv) } if (!params.nthreads) - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c index aca47ce8b1e7..2f59d5d1c509 100644 --- a/tools/perf/bench/futex-requeue.c +++ b/tools/perf/bench/futex-requeue.c @@ -175,7 +175,7 @@ int bench_futex_requeue(int argc, const char **argv) } if (!params.nthreads) - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c index 888ee6037945..861deb934745 100644 --- a/tools/perf/bench/futex-wake-parallel.c +++ b/tools/perf/bench/futex-wake-parallel.c @@ -252,7 +252,7 @@ int bench_futex_wake_parallel(int argc, const char **argv) err(EXIT_FAILURE, "calloc"); if (!params.nthreads) - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); /* some sanity checks */ if (params.nwakes > params.nthreads || diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c index aa82db51c0ab..cfda48bef1d7 100644 --- a/tools/perf/bench/futex-wake.c +++ b/tools/perf/bench/futex-wake.c @@ -151,7 +151,7 @@ int bench_futex_wake(int argc, const char **argv) } if (!params.nthreads) - params.nthreads = cpu->nr; + params.nthreads = perf_cpu_map__nr(cpu); worker = calloc(params.nthreads, sizeof(*worker)); if (!worker) diff --git a/tools/perf/builtin-ftrace.c b/tools/perf/builtin-ftrace.c index 71452599f87d..dec24dc0e767 100644 --- a/tools/perf/builtin-ftrace.c +++ b/tools/perf/builtin-ftrace.c @@ -281,7 +281,7 @@ static int set_tracing_cpumask(struct perf_cpu_map *cpumap) int ret; int last_cpu; - last_cpu = perf_cpu_map__cpu(cpumap, cpumap->nr - 1).cpu; + last_cpu = perf_cpu_map__cpu(cpumap, perf_cpu_map__nr(cpumap) - 1).cpu; mask_size = last_cpu / 4 + 2; /* one more byte for EOS */ mask_size += last_cpu / 32; /* ',' is needed for every 32th cpus */ diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 934e992c966f..3f98689dd687 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -230,11 +230,12 @@ static bool cpus_map_matched(struct evsel *a, struct evsel *b) if (!a->core.cpus || !b->core.cpus) return false; - if (a->core.cpus->nr != b->core.cpus->nr) + if (perf_cpu_map__nr(a->core.cpus) != perf_cpu_map__nr(b->core.cpus)) return false; - for (int i = 0; i < a->core.cpus->nr; i++) { - if (a->core.cpus->map[i].cpu != b->core.cpus->map[i].cpu) + for (int i = 0; i < perf_cpu_map__nr(a->core.cpus); i++) { + if (perf_cpu_map__cpu(a->core.cpus, i).cpu != + perf_cpu_map__cpu(b->core.cpus, i).cpu) return false; } diff --git a/tools/perf/tests/bitmap.c b/tools/perf/tests/bitmap.c index 0bf399c49849..4965dd666956 100644 --- a/tools/perf/tests/bitmap.c +++ b/tools/perf/tests/bitmap.c @@ -17,8 +17,8 @@ static unsigned long *get_bitmap(const char *str, int nbits) bm = bitmap_zalloc(nbits); if (map && bm) { - for (i = 0; i < map->nr; i++) - set_bit(map->map[i].cpu, bm); + for (i = 0; i < perf_cpu_map__nr(map); i++) + set_bit(perf_cpu_map__cpu(map, i).cpu, bm); } if (map) diff --git a/tools/perf/tests/event_update.c b/tools/perf/tests/event_update.c index 16b6d6f47f38..78db4d704e76 100644 --- a/tools/perf/tests/event_update.c +++ b/tools/perf/tests/event_update.c @@ -75,10 +75,10 @@ static int process_event_cpus(struct perf_tool *tool __maybe_unused, TEST_ASSERT_VAL("wrong id", ev->id == 123); TEST_ASSERT_VAL("wrong type", ev->type == PERF_EVENT_UPDATE__CPUS); - TEST_ASSERT_VAL("wrong cpus", map->nr == 3); - TEST_ASSERT_VAL("wrong cpus", map->map[0].cpu == 1); - TEST_ASSERT_VAL("wrong cpus", map->map[1].cpu == 2); - TEST_ASSERT_VAL("wrong cpus", map->map[2].cpu == 3); + TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__nr(map) == 3); + TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__cpu(map, 0).cpu == 1); + TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__cpu(map, 1).cpu == 2); + TEST_ASSERT_VAL("wrong cpus", perf_cpu_map__cpu(map, 2).cpu == 3); perf_cpu_map__put(map); return 0; } diff --git a/tools/perf/tests/mem2node.c b/tools/perf/tests/mem2node.c index f4a4aba33f76..4c96829510c9 100644 --- a/tools/perf/tests/mem2node.c +++ b/tools/perf/tests/mem2node.c @@ -25,14 +25,15 @@ static unsigned long *get_bitmap(const char *str, int nbits) { struct perf_cpu_map *map = perf_cpu_map__new(str); unsigned long *bm = NULL; - int i; bm = bitmap_zalloc(nbits); if (map && bm) { - for (i = 0; i < map->nr; i++) { - set_bit(map->map[i].cpu, bm); - } + struct perf_cpu cpu; + int i; + + perf_cpu_map__for_each_cpu(cpu, i, map) + set_bit(cpu.cpu, bm); } if (map) diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index 0ad62914b4d7..c3c17600f29c 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -59,11 +59,12 @@ static int test__basic_mmap(struct test_suite *test __maybe_unused, int subtest } CPU_ZERO(&cpu_set); - CPU_SET(cpus->map[0].cpu, &cpu_set); + CPU_SET(perf_cpu_map__cpu(cpus, 0).cpu, &cpu_set); sched_setaffinity(0, sizeof(cpu_set), &cpu_set); if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) { pr_debug("sched_setaffinity() failed on CPU %d: %s ", - cpus->map[0].cpu, str_error_r(errno, sbuf, sizeof(sbuf))); + perf_cpu_map__cpu(cpus, 0).cpu, + str_error_r(errno, sbuf, sizeof(sbuf))); goto out_free_cpus; } diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index c4ef0c7002f1..ee1e3dcbc0bd 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -122,44 +122,48 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that CPU ID contains socket, die, core and CPU - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__cpu(perf_cpu_map__cpu(map, i), NULL); - TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", map->map[i].cpu == id.cpu.cpu); + TEST_ASSERT_VAL("Cpu map - CPU ID doesn't match", + perf_cpu_map__cpu(map, i).cpu == id.cpu.cpu); TEST_ASSERT_VAL("Cpu map - Core ID doesn't match", - session->header.env.cpu[map->map[i].cpu].core_id == id.core); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core); TEST_ASSERT_VAL("Cpu map - Socket ID doesn't match", - session->header.env.cpu[map->map[i].cpu].socket_id == id.socket); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + id.socket); TEST_ASSERT_VAL("Cpu map - Die ID doesn't match", - session->header.env.cpu[map->map[i].cpu].die_id == id.die); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); TEST_ASSERT_VAL("Cpu map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Cpu map - Thread is set", id.thread == -1); } // Test that core ID contains socket, die and core - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__core(perf_cpu_map__cpu(map, i), NULL); TEST_ASSERT_VAL("Core map - Core ID doesn't match", - session->header.env.cpu[map->map[i].cpu].core_id == id.core); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].core_id == id.core); TEST_ASSERT_VAL("Core map - Socket ID doesn't match", - session->header.env.cpu[map->map[i].cpu].socket_id == id.socket); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + id.socket); TEST_ASSERT_VAL("Core map - Die ID doesn't match", - session->header.env.cpu[map->map[i].cpu].die_id == id.die); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); TEST_ASSERT_VAL("Core map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Core map - Thread is set", id.thread == -1); } // Test that die ID contains socket and die - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__die(perf_cpu_map__cpu(map, i), NULL); TEST_ASSERT_VAL("Die map - Socket ID doesn't match", - session->header.env.cpu[map->map[i].cpu].socket_id == id.socket); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + id.socket); TEST_ASSERT_VAL("Die map - Die ID doesn't match", - session->header.env.cpu[map->map[i].cpu].die_id == id.die); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].die_id == id.die); TEST_ASSERT_VAL("Die map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Die map - Core is set", id.core == -1); @@ -168,10 +172,11 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that socket ID contains only socket - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__socket(perf_cpu_map__cpu(map, i), NULL); TEST_ASSERT_VAL("Socket map - Socket ID doesn't match", - session->header.env.cpu[map->map[i].cpu].socket_id == id.socket); + session->header.env.cpu[perf_cpu_map__cpu(map, i).cpu].socket_id == + id.socket); TEST_ASSERT_VAL("Socket map - Node ID is set", id.node == -1); TEST_ASSERT_VAL("Socket map - Die ID is set", id.die == -1); @@ -181,10 +186,10 @@ static int check_cpu_topology(char *path, struct perf_cpu_map *map) } // Test that node ID contains only node - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { id = aggr_cpu_id__node(perf_cpu_map__cpu(map, i), NULL); TEST_ASSERT_VAL("Node map - Node ID doesn't match", - cpu__get_node(map->map[i]) == id.node); + cpu__get_node(perf_cpu_map__cpu(map, i)) == id.node); TEST_ASSERT_VAL("Node map - Socket is set", id.socket == -1); TEST_ASSERT_VAL("Node map - Die ID is set", id.die == -1); TEST_ASSERT_VAL("Node map - Core is set", id.core == -1); diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 5632efc44738..825336304a37 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -174,7 +174,7 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, mp->idx = idx; if (per_cpu) { - mp->cpu = evlist->core.cpus->map[idx]; + mp->cpu = perf_cpu_map__cpu(evlist->core.cpus, idx); if (evlist->core.threads) mp->tid = perf_thread_map__pid(evlist->core.threads, 0); else diff --git a/tools/perf/util/counts.c b/tools/perf/util/counts.c index 2b81707b9dba..7a447d918458 100644 --- a/tools/perf/util/counts.c +++ b/tools/perf/util/counts.c @@ -61,7 +61,7 @@ int evsel__alloc_counts(struct evsel *evsel) struct perf_cpu_map *cpus = evsel__cpus(evsel); int nthreads = perf_thread_map__nr(evsel->core.threads); - evsel->counts = perf_counts__new(cpus ? cpus->nr : 1, nthreads); + evsel->counts = perf_counts__new(perf_cpu_map__nr(cpus), nthreads); return evsel->counts != NULL ? 0 : -ENOMEM; } diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 0d3c2006a15d..5c85fbd709b4 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -57,7 +57,7 @@ struct perf_cpu cpu__max_present_cpu(void); */ static inline bool cpu_map__is_dummy(struct perf_cpu_map *cpus) { - return cpus->nr == 1 && cpus->map[0].cpu == -1; + return perf_cpu_map__nr(cpus) == 1 && perf_cpu_map__cpu(cpus, 0).cpu == -1; } /** diff --git a/tools/perf/util/cputopo.c b/tools/perf/util/cputopo.c index e20b835a1194..d275d843c155 100644 --- a/tools/perf/util/cputopo.c +++ b/tools/perf/util/cputopo.c @@ -325,7 +325,7 @@ struct numa_topology *numa_topology__new(void) if (!node_map) goto out; - nr = (u32) node_map->nr; + nr = (u32) perf_cpu_map__nr(node_map); tp = zalloc(sizeof(*tp) + sizeof(tp->nodes[0])*nr); if (!tp) @@ -334,7 +334,7 @@ struct numa_topology *numa_topology__new(void) tp->nr = nr; for (i = 0; i < nr; i++) { - if (load_numa_node(&tp->nodes[i], node_map->map[i].cpu)) { + if (load_numa_node(&tp->nodes[i], perf_cpu_map__cpu(node_map, i).cpu)) { numa_topology__delete(tp); tp = NULL; break; diff --git a/tools/perf/util/evlist-hybrid.c b/tools/perf/util/evlist-hybrid.c index 7c554234b43d..7f234215147d 100644 --- a/tools/perf/util/evlist-hybrid.c +++ b/tools/perf/util/evlist-hybrid.c @@ -124,22 +124,23 @@ int evlist__fix_hybrid_cpus(struct evlist *evlist, const char *cpu_list) events_nr++; - if (matched_cpus->nr > 0 && (unmatched_cpus->nr > 0 || - matched_cpus->nr < cpus->nr || - matched_cpus->nr < pmu->cpus->nr)) { + if (perf_cpu_map__nr(matched_cpus) > 0 && + (perf_cpu_map__nr(unmatched_cpus) > 0 || + perf_cpu_map__nr(matched_cpus) < perf_cpu_map__nr(cpus) || + perf_cpu_map__nr(matched_cpus) < perf_cpu_map__nr(pmu->cpus))) { perf_cpu_map__put(evsel->core.cpus); perf_cpu_map__put(evsel->core.own_cpus); evsel->core.cpus = perf_cpu_map__get(matched_cpus); evsel->core.own_cpus = perf_cpu_map__get(matched_cpus); - if (unmatched_cpus->nr > 0) { + if (perf_cpu_map__nr(unmatched_cpus) > 0) { cpu_map__snprint(matched_cpus, buf1, sizeof(buf1)); pr_warning("WARNING: use %s in '%s' for '%s', skip other cpus in list.\n", buf1, pmu->name, evsel->name); } } - if (matched_cpus->nr == 0) { + if (perf_cpu_map__nr(matched_cpus) == 0) { evlist__remove(evlist, evsel); evsel__delete(evsel); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 2f6b18af49e5..fb0a2debf015 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1782,7 +1782,7 @@ static int __evsel__prepare_open(struct evsel *evsel, struct perf_cpu_map *cpus, nthreads = threads->nr; if (evsel->core.fd == NULL && - perf_evsel__alloc_fd(&evsel->core, cpus->nr, nthreads) < 0) + perf_evsel__alloc_fd(&evsel->core, perf_cpu_map__nr(cpus), nthreads) < 0) return -ENOMEM; evsel->open_flags = PERF_FLAG_FD_CLOEXEC; @@ -2020,9 +2020,10 @@ retry_open: test_attr__ready(); pr_debug2_peo("sys_perf_event_open: pid %d cpu %d group_fd %d flags %#lx", - pid, cpus->map[idx].cpu, group_fd, evsel->open_flags); + pid, perf_cpu_map__cpu(cpus, idx).cpu, group_fd, evsel->open_flags); - fd = sys_perf_event_open(&evsel->core.attr, pid, cpus->map[idx].cpu, + fd = sys_perf_event_open(&evsel->core.attr, pid, + perf_cpu_map__cpu(cpus, idx).cpu, group_fd, evsel->open_flags); FD(evsel, idx, thread) = fd; @@ -2038,7 +2039,8 @@ retry_open: bpf_counter__install_pe(evsel, idx, fd); if (unlikely(test_attr__enabled)) { - test_attr__open(&evsel->core.attr, pid, cpus->map[idx], + test_attr__open(&evsel->core.attr, pid, + perf_cpu_map__cpu(cpus, idx), fd, group_fd, evsel->open_flags); } @@ -2079,7 +2081,8 @@ try_fallback: if (evsel__precise_ip_fallback(evsel)) goto retry_open; - if (evsel__ignore_missing_thread(evsel, cpus->nr, idx, threads, thread, err)) { + if (evsel__ignore_missing_thread(evsel, perf_cpu_map__nr(cpus), + idx, threads, thread, err)) { /* We just removed 1 thread, so lower the upper nthreads limit. */ nthreads--; @@ -2119,7 +2122,7 @@ out_close: int evsel__open(struct evsel *evsel, struct perf_cpu_map *cpus, struct perf_thread_map *threads) { - return evsel__open_cpu(evsel, cpus, threads, 0, cpus ? cpus->nr : 1); + return evsel__open_cpu(evsel, cpus, threads, 0, perf_cpu_map__nr(cpus)); } void evsel__close(struct evsel *evsel) @@ -2131,8 +2134,7 @@ void evsel__close(struct evsel *evsel) int evsel__open_per_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, int cpu_map_idx) { if (cpu_map_idx == -1) - return evsel__open_cpu(evsel, cpus, NULL, 0, - cpus ? cpus->nr : 1); + return evsel__open_cpu(evsel, cpus, NULL, 0, perf_cpu_map__nr(cpus)); return evsel__open_cpu(evsel, cpus, NULL, cpu_map_idx, cpu_map_idx + 1); } @@ -2982,7 +2984,7 @@ int evsel__store_ids(struct evsel *evsel, struct evlist *evlist) struct perf_cpu_map *cpus = evsel->core.cpus; struct perf_thread_map *threads = evsel->core.threads; - if (perf_evsel__alloc_id(&evsel->core, cpus->nr, threads->nr)) + if (perf_evsel__alloc_id(&evsel->core, perf_cpu_map__nr(cpus), threads->nr)) return -ENOMEM; return store_evsel_ids(evsel, evlist); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 5720ceebffac..041b42d33bf5 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -11,6 +11,7 @@ #include #include "symbol_conf.h" #include +#include struct bpf_object; struct cgroup; @@ -191,7 +192,7 @@ static inline struct perf_cpu_map *evsel__cpus(struct evsel *evsel) static inline int evsel__nr_cpus(struct evsel *evsel) { - return evsel__cpus(evsel)->nr; + return perf_cpu_map__nr(evsel__cpus(evsel)); } void evsel__compute_deltas(struct evsel *evsel, int cpu, int thread, diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 12261ed8c15b..0e8ff8d1e206 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -250,7 +250,7 @@ static void build_node_mask(int node, struct mmap_cpu_mask *mask) nr_cpus = perf_cpu_map__nr(cpu_map); for (idx = 0; idx < nr_cpus; idx++) { - cpu = cpu_map->map[idx]; /* map c index to online cpu index */ + cpu = perf_cpu_map__cpu(cpu_map, idx); /* map c index to online cpu index */ if (cpu__get_node(cpu) == node) set_bit(cpu.cpu, mask->bits); } diff --git a/tools/perf/util/perf_api_probe.c b/tools/perf/util/perf_api_probe.c index 734d006d9a8c..c28dd50bd571 100644 --- a/tools/perf/util/perf_api_probe.c +++ b/tools/perf/util/perf_api_probe.c @@ -67,7 +67,7 @@ static bool perf_probe_api(setup_probe_fn_t fn) cpus = perf_cpu_map__new(NULL); if (!cpus) return false; - cpu = cpus->map[0]; + cpu = perf_cpu_map__cpu(cpus, 0); perf_cpu_map__put(cpus); do { @@ -144,7 +144,7 @@ bool perf_can_record_cpu_wide(void) if (!cpus) return false; - cpu = cpus->map[0]; + cpu = perf_cpu_map__cpu(cpus, 0); perf_cpu_map__put(cpus); fd = sys_perf_event_open(&attr, -1, cpu.cpu, -1, 0); diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 20461f174991..007a64681416 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -106,7 +106,7 @@ void evlist__config(struct evlist *evlist, struct record_opts *opts, struct call if (opts->group) evlist__set_leader(evlist); - if (evlist->core.cpus->map[0].cpu < 0) + if (perf_cpu_map__cpu(evlist->core.cpus, 0).cpu < 0) opts->no_inherit = true; use_comm_exec = perf_can_comm_exec(); @@ -248,11 +248,11 @@ bool evlist__can_select_event(struct evlist *evlist, const char *str) struct perf_cpu_map *cpus = perf_cpu_map__new(NULL); if (cpus) - cpu = cpus->map[0]; + cpu = perf_cpu_map__cpu(cpus, 0); perf_cpu_map__put(cpus); } else { - cpu = evlist->core.cpus->map[0]; + cpu = perf_cpu_map__cpu(evlist->core.cpus, 0); } while (1) { diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index f5ad0e62227a..e752e1f4a5f0 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -1607,8 +1607,8 @@ static void python_process_stat(struct perf_stat_config *config, } for (thread = 0; thread < threads->nr; thread++) { - for (cpu = 0; cpu < cpus->nr; cpu++) { - process_stat(counter, cpus->map[cpu], + for (cpu = 0; cpu < perf_cpu_map__nr(cpus); cpu++) { + process_stat(counter, perf_cpu_map__cpu(cpus, cpu), perf_thread_map__pid(threads, thread), tstamp, perf_counts(counter->counts, cpu, thread)); } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index f19348dddd55..2c0d30f08e78 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2537,8 +2537,8 @@ int perf_session__cpu_bitmap(struct perf_session *session, return -1; } - for (i = 0; i < map->nr; i++) { - struct perf_cpu cpu = map->map[i]; + for (i = 0; i < perf_cpu_map__nr(map); i++) { + struct perf_cpu cpu = perf_cpu_map__cpu(map, i); if (cpu.cpu >= nr_cpus) { pr_err("Requested CPU %d too large. " diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index 4c9f211249db..1e0c731fc539 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -734,8 +734,8 @@ static int str_to_bitmap(char *s, cpumask_t *b, int nr_cpus) if (!m) return -1; - for (i = 0; i < m->nr; i++) { - c = m->map[i]; + for (i = 0; i < perf_cpu_map__nr(m); i++) { + c = perf_cpu_map__cpu(m, i); if (c.cpu >= nr_cpus) { ret = -1; break; diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index c9ba8050cc2b..70f095624a0b 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1186,12 +1186,12 @@ int perf_event__synthesize_thread_map2(struct perf_tool *tool, static void synthesize_cpus(struct cpu_map_entries *cpus, struct perf_cpu_map *map) { - int i; + int i, map_nr = perf_cpu_map__nr(map); - cpus->nr = map->nr; + cpus->nr = map_nr; - for (i = 0; i < map->nr; i++) - cpus->cpu[i] = map->map[i].cpu; + for (i = 0; i < map_nr; i++) + cpus->cpu[i] = perf_cpu_map__cpu(map, i).cpu; } static void synthesize_mask(struct perf_record_record_cpu_map *mask, @@ -1202,13 +1202,13 @@ static void synthesize_mask(struct perf_record_record_cpu_map *mask, mask->nr = BITS_TO_LONGS(max); mask->long_size = sizeof(long); - for (i = 0; i < map->nr; i++) - set_bit(map->map[i].cpu, mask->mask); + for (i = 0; i < perf_cpu_map__nr(map); i++) + set_bit(perf_cpu_map__cpu(map, i).cpu, mask->mask); } static size_t cpus_size(struct perf_cpu_map *map) { - return sizeof(struct cpu_map_entries) + map->nr * sizeof(u16); + return sizeof(struct cpu_map_entries) + perf_cpu_map__nr(map) * sizeof(u16); } static size_t mask_size(struct perf_cpu_map *map, int *max) @@ -1217,9 +1217,9 @@ static size_t mask_size(struct perf_cpu_map *map, int *max) *max = 0; - for (i = 0; i < map->nr; i++) { + for (i = 0; i < perf_cpu_map__nr(map); i++) { /* bit position of the cpu is + 1 */ - int bit = map->map[i].cpu + 1; + int bit = perf_cpu_map__cpu(map, i).cpu + 1; if (bit > *max) *max = bit; diff --git a/tools/perf/util/top.c b/tools/perf/util/top.c index 27945eeb0cb5..c1ebfc5d2e0c 100644 --- a/tools/perf/util/top.c +++ b/tools/perf/util/top.c @@ -95,15 +95,15 @@ size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size) if (target->cpu_list) ret += SNPRINTF(bf + ret, size - ret, ", CPU%s: %s)", - top->evlist->core.cpus->nr > 1 ? "s" : "", + perf_cpu_map__nr(top->evlist->core.cpus) > 1 ? "s" : "", target->cpu_list); else { if (target->tid) ret += SNPRINTF(bf + ret, size - ret, ")"); else ret += SNPRINTF(bf + ret, size - ret, ", %d CPU%s)", - top->evlist->core.cpus->nr, - top->evlist->core.cpus->nr > 1 ? "s" : ""); + perf_cpu_map__nr(top->evlist->core.cpus), + perf_cpu_map__nr(top->evlist->core.cpus) > 1 ? "s" : ""); } perf_top__reset_sample_counters(top); -- cgit v1.2.3-58-ga151 From 24ead7c254b42c4ea252e57bf9928154dc7744e0 Mon Sep 17 00:00:00 2001 From: Lv Ruyi Date: Mon, 17 Jan 2022 08:37:30 +0000 Subject: perf cpumap: Remove duplicate include in cpumap.h Remove all but the first include of stdbool.h from cpumap.h. Reported-by: Zeal Robot Signed-off-by: Lv Ruyi Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: https://lore.kernel.org/r/20220117083730.863200-1-lv.ruyi@zte.com.cn Signed-off-by: CGEL ZTE Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 5c85fbd709b4..703ae6d3386e 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -4,7 +4,6 @@ #include #include -#include #include #include -- cgit v1.2.3-58-ga151 From 3606c0e1a1050d397ad759a62607e419fd8b0ccb Mon Sep 17 00:00:00 2001 From: German Gomez Date: Tue, 18 Jan 2022 14:40:54 +0000 Subject: perf evsel: Override attr->sample_period for non-libpfm4 events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A previous patch preventing "attr->sample_period" values from being overridden in pfm events changed a related behaviour in arm-spe. Before said patch: perf record -c 10000 -e arm_spe_0// -- sleep 1 Would yield an SPE event with period=10000. After the patch, the period in "-c 10000" was being ignored because the arm-spe code initializes sample_period to a non-zero value. This patch restores the previous behaviour for non-libpfm4 events. Fixes: ae5dcc8abe31 (“perf record: Prevent override of attr->sample_period for libpfm4 events”) Reported-by: Chase Conklin Signed-off-by: German Gomez Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: John Fastabend Cc: KP Singh Cc: Mark Rutland Cc: Martin KaFai Lau Cc: Namhyung Kim Cc: Song Liu Cc: Stephane Eranian Cc: Yonghong Song Cc: bpf@vger.kernel.org Cc: netdev@vger.kernel.org Link: http://lore.kernel.org/lkml/20220118144054.2541-1-german.gomez@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index fb0a2debf015..22d3267ce294 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1064,6 +1064,17 @@ void __weak arch_evsel__fixup_new_cycles(struct perf_event_attr *attr __maybe_un { } +static void evsel__set_default_freq_period(struct record_opts *opts, + struct perf_event_attr *attr) +{ + if (opts->freq) { + attr->freq = 1; + attr->sample_freq = opts->freq; + } else { + attr->sample_period = opts->default_interval; + } +} + /* * The enable_on_exec/disabled value strategy: * @@ -1130,14 +1141,12 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts, * We default some events to have a default interval. But keep * it a weak assumption overridable by the user. */ - if (!attr->sample_period) { - if (opts->freq) { - attr->freq = 1; - attr->sample_freq = opts->freq; - } else { - attr->sample_period = opts->default_interval; - } - } + if ((evsel->is_libpfm_event && !attr->sample_period) || + (!evsel->is_libpfm_event && (!attr->sample_period || + opts->user_freq != UINT_MAX || + opts->user_interval != ULLONG_MAX))) + evsel__set_default_freq_period(opts, attr); + /* * If attr->freq was set (here or earlier), ask for period * to be sampled. -- cgit v1.2.3-58-ga151 From 864bc8c905261f264c3ea357027cf555fe51c5a3 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 17 Jan 2022 23:10:13 +0800 Subject: perf parse-events: Support event alias in form foo-bar-baz Event aliasing for events whose name in the form foo-bar-baz is not supported, while foo-bar, foo_bar_baz, and other combinations are, i.e. two hyphens are not supported. The HiSilicon D06 platform has events in such form: $ ./perf list sdir-home-migrate List of pre-defined events (to be used in -e): uncore hha: sdir-home-migrate [Unit: hisi_sccl,hha] $ sudo ./perf stat -e sdir-home-migrate event syntax error: 'sdir-home-migrate' \___ parser error Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events To support, add an extra PMU event symbol type for "baz", and add a new rule in the bison file. Signed-off-by: John Garry Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Qi Liu Cc: Shaokun Zhang Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1642432215-234089-2-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 25 +++++++++++++++++++++++-- tools/perf/util/parse-events.h | 1 + tools/perf/util/parse-events.l | 2 ++ tools/perf/util/parse-events.y | 17 +++++++++++++++-- 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index acf20ce98ce9..879f606e07e6 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -2098,8 +2098,17 @@ static void perf_pmu__parse_init(void) pmu = NULL; while ((pmu = perf_pmu__scan(pmu)) != NULL) { list_for_each_entry(alias, &pmu->aliases, list) { - if (strchr(alias->name, '-')) + char *tmp = strchr(alias->name, '-'); + + if (tmp) { + char *tmp2 = NULL; + + tmp2 = strchr(tmp + 1, '-'); len++; + if (tmp2) + len++; + } + len++; } } @@ -2119,8 +2128,20 @@ static void perf_pmu__parse_init(void) list_for_each_entry(alias, &pmu->aliases, list) { struct perf_pmu_event_symbol *p = perf_pmu_events_list + len; char *tmp = strchr(alias->name, '-'); + char *tmp2 = NULL; - if (tmp != NULL) { + if (tmp) + tmp2 = strchr(tmp + 1, '-'); + if (tmp2) { + SET_SYMBOL(strndup(alias->name, tmp - alias->name), + PMU_EVENT_SYMBOL_PREFIX); + p++; + tmp++; + SET_SYMBOL(strndup(tmp, tmp2 - tmp), PMU_EVENT_SYMBOL_SUFFIX); + p++; + SET_SYMBOL(strdup(++tmp2), PMU_EVENT_SYMBOL_SUFFIX2); + len += 3; + } else if (tmp) { SET_SYMBOL(strndup(alias->name, tmp - alias->name), PMU_EVENT_SYMBOL_PREFIX); p++; diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index c7fc93f54577..a38b8b160e80 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -53,6 +53,7 @@ enum perf_pmu_event_symbol_type { PMU_EVENT_SYMBOL, /* normal style PMU event */ PMU_EVENT_SYMBOL_PREFIX, /* prefix of pre-suf style event */ PMU_EVENT_SYMBOL_SUFFIX, /* suffix of pre-suf style event */ + PMU_EVENT_SYMBOL_SUFFIX2, /* suffix of pre-suf2 style event */ }; struct perf_pmu_event_symbol { diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 4efe9872c667..5b6e4b5249cf 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -149,6 +149,8 @@ static int pmu_str_check(yyscan_t scanner, struct parse_events_state *parse_stat return PE_PMU_EVENT_PRE; case PMU_EVENT_SYMBOL_SUFFIX: return PE_PMU_EVENT_SUF; + case PMU_EVENT_SYMBOL_SUFFIX2: + return PE_PMU_EVENT_SUF2; case PMU_EVENT_SYMBOL: return parse_state->fake_pmu ? PE_PMU_EVENT_FAKE : PE_KERNEL_PMU_EVENT; diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 174158982fae..be8c51770051 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -69,7 +69,7 @@ static void inc_group_count(struct list_head *list, %token PE_NAME_CACHE_TYPE PE_NAME_CACHE_OP_RESULT %token PE_PREFIX_MEM PE_PREFIX_RAW PE_PREFIX_GROUP %token PE_ERROR -%token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE +%token PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_PMU_EVENT_SUF2 PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %token PE_ARRAY_ALL PE_ARRAY_RANGE %token PE_DRV_CFG_TERM %type PE_VALUE @@ -87,7 +87,7 @@ static void inc_group_count(struct list_head *list, %type PE_MODIFIER_EVENT %type PE_MODIFIER_BP %type PE_EVENT_NAME -%type PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE +%type PE_PMU_EVENT_PRE PE_PMU_EVENT_SUF PE_PMU_EVENT_SUF2 PE_KERNEL_PMU_EVENT PE_PMU_EVENT_FAKE %type PE_DRV_CFG_TERM %type event_pmu_name %destructor { free ($$); } @@ -372,6 +372,19 @@ PE_KERNEL_PMU_EVENT opt_pmu_config $$ = list; } | +PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF '-' PE_PMU_EVENT_SUF2 sep_dc +{ + struct list_head *list; + char pmu_name[128]; + snprintf(pmu_name, sizeof(pmu_name), "%s-%s-%s", $1, $3, $5); + free($1); + free($3); + free($5); + if (parse_events_multi_pmu_add(_parse_state, pmu_name, NULL, &list) < 0) + YYABORT; + $$ = list; +} +| PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc { struct list_head *list; -- cgit v1.2.3-58-ga151 From 34fa67e72085201ea94b5332eae316951331958f Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 17 Jan 2022 23:10:14 +0800 Subject: perf test: Add pmu-events test for aliases with hyphens Add a test for aliases with hyphens in the name to ensure that the pmu-events tables are as expects. There should be no reason why these sort of aliases would be treated differently, but no harm in checking. Signed-off-by: John Garry Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Qi Liu Cc: Shaokun Zhang Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1642432215-234089-3-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- .../pmu-events/arch/test/test_soc/cpu/uncore.json | 16 +++++++++++ tools/perf/tests/pmu-events.c | 32 ++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json b/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json index 73089c682f80..41bac1c6a008 100644 --- a/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json +++ b/tools/perf/pmu-events/arch/test/test_soc/cpu/uncore.json @@ -18,6 +18,22 @@ "Invert": "0", "EdgeDetect": "0" }, + { + "Unit": "CBO", + "EventCode": "0xE0", + "UMask": "0x00", + "EventName": "event-hyphen", + "BriefDescription": "UNC_CBO_HYPHEN", + "PublicDescription": "UNC_CBO_HYPHEN" + }, + { + "Unit": "CBO", + "EventCode": "0xC0", + "UMask": "0x00", + "EventName": "event-two-hyph", + "BriefDescription": "UNC_CBO_TWO_HYPH", + "PublicDescription": "UNC_CBO_TWO_HYPH" + }, { "EventCode": "0x7", "EventName": "uncore_hisi_l3c.rd_hit_cpipe", diff --git a/tools/perf/tests/pmu-events.c b/tools/perf/tests/pmu-events.c index df1c9a3cc05b..1c695fb5a79c 100644 --- a/tools/perf/tests/pmu-events.c +++ b/tools/perf/tests/pmu-events.c @@ -143,6 +143,34 @@ static const struct perf_pmu_test_event unc_cbo_xsnp_response_miss_eviction = { .matching_pmu = "uncore_cbox_0", }; +static const struct perf_pmu_test_event uncore_hyphen = { + .event = { + .name = "event-hyphen", + .event = "umask=0x00,event=0xe0", + .desc = "Unit: uncore_cbox UNC_CBO_HYPHEN", + .topic = "uncore", + .long_desc = "UNC_CBO_HYPHEN", + .pmu = "uncore_cbox", + }, + .alias_str = "umask=0,event=0xe0", + .alias_long_desc = "UNC_CBO_HYPHEN", + .matching_pmu = "uncore_cbox_0", +}; + +static const struct perf_pmu_test_event uncore_two_hyph = { + .event = { + .name = "event-two-hyph", + .event = "umask=0x00,event=0xc0", + .desc = "Unit: uncore_cbox UNC_CBO_TWO_HYPH", + .topic = "uncore", + .long_desc = "UNC_CBO_TWO_HYPH", + .pmu = "uncore_cbox", + }, + .alias_str = "umask=0,event=0xc0", + .alias_long_desc = "UNC_CBO_TWO_HYPH", + .matching_pmu = "uncore_cbox_0", +}; + static const struct perf_pmu_test_event uncore_hisi_l3c_rd_hit_cpipe = { .event = { .name = "uncore_hisi_l3c.rd_hit_cpipe", @@ -188,6 +216,8 @@ static const struct perf_pmu_test_event uncore_imc_cache_hits = { static const struct perf_pmu_test_event *uncore_events[] = { &uncore_hisi_ddrc_flux_wcmd, &unc_cbo_xsnp_response_miss_eviction, + &uncore_hyphen, + &uncore_two_hyph, &uncore_hisi_l3c_rd_hit_cpipe, &uncore_imc_free_running_cache_miss, &uncore_imc_cache_hits, @@ -654,6 +684,8 @@ static struct perf_pmu_test_pmu test_pmus[] = { }, .aliases = { &unc_cbo_xsnp_response_miss_eviction, + &uncore_hyphen, + &uncore_two_hyph, }, }, { -- cgit v1.2.3-58-ga151 From b4a7276c5e9a79c238a2fad4fb9498dd3558ad2e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 17 Jan 2022 23:10:15 +0800 Subject: perf test: Add parse-events test for aliases with hyphens Add a test which allows us to test parsing an event alias with hyphens. Since these events typically do not exist on most host systems, add the alias to the fake pmu. Function perf_pmu__test_parse_init() has terms added to match known test aliases. Signed-off-by: John Garry Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kajol Jain Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Qi Liu Cc: Shaokun Zhang Cc: linuxarm@huawei.com Link: https://lore.kernel.org/r/1642432215-234089-4-git-send-email-john.garry@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/parse-events.c | 49 +++++++++++++++++++++++++++++++++++++++++ tools/perf/util/parse-events.c | 42 +++++++++++++++++++++++++++-------- 2 files changed, 82 insertions(+), 9 deletions(-) diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index a508f1dbcb2a..e71efadb24f5 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -2069,6 +2069,31 @@ static int test_event(struct evlist_test *e) return ret; } +static int test_event_fake_pmu(const char *str) +{ + struct parse_events_error err; + struct evlist *evlist; + int ret; + + evlist = evlist__new(); + if (!evlist) + return -ENOMEM; + + parse_events_error__init(&err); + perf_pmu__test_parse_init(); + ret = __parse_events(evlist, str, &err, &perf_pmu__fake); + if (ret) { + pr_debug("failed to parse event '%s', err %d, str '%s'\n", + str, ret, err.str); + parse_events_error__print(&err, str); + } + + parse_events_error__exit(&err); + evlist__delete(evlist); + + return ret; +} + static int test_events(struct evlist_test *events, unsigned cnt) { int ret1, ret2 = 0; @@ -2276,6 +2301,26 @@ static int test_pmu_events_alias(char *event, char *alias) return test_event(&e); } +static int test_pmu_events_alias2(void) +{ + static const char events[][30] = { + "event-hyphen", + "event-two-hyph", + }; + unsigned long i; + int ret = 0; + + for (i = 0; i < ARRAY_SIZE(events); i++) { + ret = test_event_fake_pmu(&events[i][0]); + if (ret) { + pr_err("check_parse_fake %s failed\n", &events[i][0]); + break; + } + } + + return ret; +} + static int test__parse_events(struct test_suite *test __maybe_unused, int subtest __maybe_unused) { int ret1, ret2 = 0; @@ -2313,6 +2358,10 @@ do { \ return ret; } + ret1 = test_pmu_events_alias2(); + if (!ret2) + ret2 = ret1; + ret1 = test_terms(test__terms, ARRAY_SIZE(test__terms)); if (!ret2) ret2 = ret1; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 879f606e07e6..9739b05b999e 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1697,6 +1697,15 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, } } } + + if (parse_state->fake_pmu) { + if (!parse_events_add_pmu(parse_state, list, str, head, + true, true)) { + pr_debug("%s -> %s/%s/\n", str, "fake_pmu", str); + ok++; + } + } + out_err: if (ok) *listp = list; @@ -2168,23 +2177,38 @@ err: */ int perf_pmu__test_parse_init(void) { - struct perf_pmu_event_symbol *list; + struct perf_pmu_event_symbol *list, *tmp, symbols[] = { + {(char *)"read", PMU_EVENT_SYMBOL}, + {(char *)"event", PMU_EVENT_SYMBOL_PREFIX}, + {(char *)"two", PMU_EVENT_SYMBOL_SUFFIX}, + {(char *)"hyphen", PMU_EVENT_SYMBOL_SUFFIX}, + {(char *)"hyph", PMU_EVENT_SYMBOL_SUFFIX2}, + }; + unsigned long i, j; - list = malloc(sizeof(*list) * 1); + tmp = list = malloc(sizeof(*list) * ARRAY_SIZE(symbols)); if (!list) return -ENOMEM; - list->type = PMU_EVENT_SYMBOL; - list->symbol = strdup("read"); - - if (!list->symbol) { - free(list); - return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(symbols); i++, tmp++) { + tmp->type = symbols[i].type; + tmp->symbol = strdup(symbols[i].symbol); + if (!list->symbol) + goto err_free; } perf_pmu_events_list = list; - perf_pmu_events_list_num = 1; + perf_pmu_events_list_num = ARRAY_SIZE(symbols); + + qsort(perf_pmu_events_list, ARRAY_SIZE(symbols), + sizeof(struct perf_pmu_event_symbol), comp_pmu); return 0; + +err_free: + for (j = 0, tmp = list; j < i; j++, tmp++) + free(tmp->symbol); + free(list); + return -ENOMEM; } enum perf_pmu_event_symbol_type -- cgit v1.2.3-58-ga151 From f0ac5b85810a69104ee6bc939bcbaecfe4db9a3e Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Wed, 12 Jan 2022 08:01:09 +0000 Subject: perf tools: Remove redundant err variable Return value from perf_event__process_tracing_data() directly instead of taking this in another redundant variable. Reported-by: Zeal Robot Signed-off-by: Minghao Chi Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Link: http://lore.kernel.org/lkml/20220112080109.666800-1-chi.minghao@zte.com.cn Signed-off-by: CGEL ZTE Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-inject.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 409b721666cb..fbf43a454cba 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -535,12 +535,9 @@ static int perf_event__repipe_exit(struct perf_tool *tool, static int perf_event__repipe_tracing_data(struct perf_session *session, union perf_event *event) { - int err; - perf_event__repipe_synth(session->tool, event); - err = perf_event__process_tracing_data(session, event); - return err; + return perf_event__process_tracing_data(session, event); } static int dso__read_build_id(struct dso *dso) -- cgit v1.2.3-58-ga151 From 6b9b6413700e104934734b72a3be622a76923b98 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 22 Jan 2022 09:17:10 -0500 Subject: ftrace: Fix assuming build time sort works for s390 To speed up the boot process, as mcount_loc needs to be sorted for ftrace to work properly, sorting it at build time is more efficient than boot up and can save milliseconds of time. Unfortunately, this change broke s390 as it will modify the mcount_loc location after the sorting takes place and will put back the unsorted locations. Since the sorting is skipped at boot up if it is believed that it was sorted at run time, ftrace can crash as its algorithms are dependent on the list being sorted. Add a new config BUILDTIME_MCOUNT_SORT that is set when BUILDTIME_TABLE_SORT but not if S390 is set. Use this config to determine if sorting should take place at boot up. Link: https://lore.kernel.org/all/yt9dee51ctfn.fsf@linux.ibm.com/ Fixes: 72b3942a173c ("scripts: ftrace - move the sort-processing in ftrace_init") Reported-by: Sven Schnelle Tested-by: Heiko Carstens Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 9 ++++++++- kernel/trace/ftrace.c | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f468767bc287..752ed89a293b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -70,6 +70,13 @@ config HAVE_C_RECORDMCOUNT help C version of recordmcount available? +config BUILDTIME_MCOUNT_SORT + bool + default y + depends on BUILDTIME_TABLE_SORT && !S390 + help + Sort the mcount_loc section at build time. + config TRACER_MAX_TRACE bool @@ -918,7 +925,7 @@ config EVENT_TRACE_TEST_SYSCALLS config FTRACE_SORT_STARTUP_TEST bool "Verify compile time sorting of ftrace functions" depends on DYNAMIC_FTRACE - depends on BUILDTIME_TABLE_SORT + depends on BUILDTIME_MCOUNT_SORT help Sorting of the mcount_loc sections that is used to find the where the ftrace knows where to patch functions for tracing diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 403e485bf091..b01e1fa62193 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6429,10 +6429,10 @@ static int ftrace_process_locs(struct module *mod, /* * Sorting mcount in vmlinux at build time depend on - * CONFIG_BUILDTIME_TABLE_SORT, while mcount loc in + * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in * modules can not be sorted at build time. */ - if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) { + if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) { sort(start, count, sizeof(*start), ftrace_cmp_ips, NULL); } else { -- cgit v1.2.3-58-ga151 From e783362eb54cd99b2cac8b3a9aeac942e6f6ac07 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 23 Jan 2022 10:12:53 +0200 Subject: Linux 5.17-rc1 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c94559a97dca..0fb4f94a6885 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 -PATCHLEVEL = 16 +PATCHLEVEL = 17 SUBLEVEL = 0 -EXTRAVERSION = +EXTRAVERSION = -rc1 NAME = Gobble Gobble # *DOCUMENTATION* -- cgit v1.2.3-58-ga151 From 0cb5950f3f3b51a4e8657d106f897f2b913e0586 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 20 Jan 2022 14:27:56 +0000 Subject: btrfs: fix deadlock when reserving space during defrag When defragging we can end up collecting a range for defrag that has already pages under delalloc (dirty), as long as the respective extent map for their range is not mapped to a hole, a prealloc extent or the extent map is from an old generation. Most of the time that is harmless from a functional perspective at least, however it can result in a deadlock: 1) At defrag_collect_targets() we find an extent map that meets all requirements but there's delalloc for the range it covers, and we add its range to list of ranges to defrag; 2) The defrag_collect_targets() function is called at defrag_one_range(), after it locked a range that overlaps the range of the extent map; 3) At defrag_one_range(), while the range is still locked, we call defrag_one_locked_target() for the range associated to the extent map we collected at step 1); 4) Then finally at defrag_one_locked_target() we do a call to btrfs_delalloc_reserve_space(), which will reserve data and metadata space. If the space reservations can not be satisfied right away, the flusher might be kicked in and start flushing delalloc and wait for the respective ordered extents to complete. If this happens we will deadlock, because both flushing delalloc and finishing an ordered extent, requires locking the range in the inode's io tree, which was already locked at defrag_collect_targets(). So fix this by skipping extent maps for which there's already delalloc. Fixes: eb793cf857828d ("btrfs: defrag: introduce helper to collect target file extents") CC: stable@vger.kernel.org # 5.16 Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6d5d6d2c4ad1..a883f1824a17 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1213,6 +1213,35 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (em->generation < newer_than) goto next; + /* + * Our start offset might be in the middle of an existing extent + * map, so take that into account. + */ + range_len = em->len - (cur - em->start); + /* + * If this range of the extent map is already flagged for delalloc, + * skip it, because: + * + * 1) We could deadlock later, when trying to reserve space for + * delalloc, because in case we can't immediately reserve space + * the flusher can start delalloc and wait for the respective + * ordered extents to complete. The deadlock would happen + * because we do the space reservation while holding the range + * locked, and starting writeback, or finishing an ordered + * extent, requires locking the range; + * + * 2) If there's delalloc there, it means there's dirty pages for + * which writeback has not started yet (we clean the delalloc + * flag when starting writeback and after creating an ordered + * extent). If we mark pages in an adjacent range for defrag, + * then we will have a larger contiguous range for delalloc, + * very likely resulting in a larger extent after writeback is + * triggered (except in a case of free space fragmentation). + */ + if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC, 0, NULL)) + goto next; + /* * For do_compress case, we want to compress all valid file * extents, thus no @extent_thresh or mergeable check. @@ -1221,7 +1250,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, goto add; /* Skip too large extent */ - if (em->len >= extent_thresh) + if (range_len >= extent_thresh) goto next; next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, -- cgit v1.2.3-58-ga151 From 3c9d31c715948aaff0ee6d322a91a2dec07770bf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 20 Jan 2022 17:11:52 +0000 Subject: btrfs: add back missing dirty page rate limiting to defrag A defrag operation can dirty a lot of pages, specially if operating on the entire file or a large file range. Any task dirtying pages should periodically call balance_dirty_pages_ratelimited(), as stated in that function's comments, otherwise they can leave too many dirty pages in the system. This is what we did before the refactoring in 5.16, and it should have remained, just like in the buffered write path and relocation. So restore that behaviour. Fixes: 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") CC: stable@vger.kernel.org # 5.16 Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a883f1824a17..ac420060aac3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1579,6 +1579,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, } while (cur < last_byte) { + const unsigned long prev_sectors_defragged = sectors_defragged; u64 cluster_end; /* The cluster size 256K should always be page aligned */ @@ -1610,6 +1611,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, cluster_end + 1 - cur, extent_thresh, newer_than, do_compress, §ors_defragged, max_to_defrag); + + if (sectors_defragged > prev_sectors_defragged) + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; -- cgit v1.2.3-58-ga151 From 27cdfde181bcacd226c230b2fd831f6f5b8c215f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 20 Jan 2022 17:41:17 +0000 Subject: btrfs: update writeback index when starting defrag When starting a defrag, we should update the writeback index of the inode's mapping in case it currently has a value beyond the start of the range we are defragging. This can help performance and often result in getting less extents after writeback - for e.g., if the current value of the writeback index sits somewhere in the middle of a range that gets dirty by the defrag, then after writeback we can get two smaller extents instead of a single, larger extent. We used to have this before the refactoring in 5.16, but it was removed without any reason to do so. Originally it was added in kernel 3.1, by commit 2a0f7f5769992b ("Btrfs: fix recursive auto-defrag"), in order to fix a loop with autodefrag resulting in dirtying and writing pages over and over, but some testing on current code did not show that happening, at least with the test described in that commit. So add back the behaviour, as at the very least it is a nice to have optimization. Fixes: 7b508037d4cac3 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") CC: stable@vger.kernel.org # 5.16 Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ac420060aac3..eef5b300b9a9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1537,6 +1537,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, int compress_type = BTRFS_COMPRESS_ZLIB; int ret = 0; u32 extent_thresh = range->extent_thresh; + pgoff_t start_index; if (isize == 0) return 0; @@ -1578,6 +1579,14 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, file_ra_state_init(ra, inode->i_mapping); } + /* + * Make writeback start from the beginning of the range, so that the + * defrag range can be written sequentially. + */ + start_index = cur >> PAGE_SHIFT; + if (start_index < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = start_index; + while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; u64 cluster_end; -- cgit v1.2.3-58-ga151 From c74ead223deb88bdf18af8c772d7ca5a9b6c3c2b Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sun, 23 Jan 2022 23:54:58 +0800 Subject: net: stmmac: reduce unnecessary wakeups from eee sw timer Currently, on EEE capable platforms, if EEE SW timer is used, the SW timer cause 1 wakeup/s even if the TX has successfully entered EEE. Remove this unnecessary wakeup by only calling mod_timer() if we haven't successfully entered EEE. Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 6708ca2aa4f7..54071ae73879 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -402,7 +402,7 @@ static void stmmac_lpi_entry_timer_config(struct stmmac_priv *priv, bool en) * Description: this function is to verify and enter in LPI mode in case of * EEE. */ -static void stmmac_enable_eee_mode(struct stmmac_priv *priv) +static int stmmac_enable_eee_mode(struct stmmac_priv *priv) { u32 tx_cnt = priv->plat->tx_queues_to_use; u32 queue; @@ -412,13 +412,14 @@ static void stmmac_enable_eee_mode(struct stmmac_priv *priv) struct stmmac_tx_queue *tx_q = &priv->tx_queue[queue]; if (tx_q->dirty_tx != tx_q->cur_tx) - return; /* still unfinished work */ + return -EBUSY; /* still unfinished work */ } /* Check and enter in LPI mode */ if (!priv->tx_path_in_lpi_mode) stmmac_set_eee_mode(priv, priv->hw, priv->plat->en_tx_lpi_clockgating); + return 0; } /** @@ -450,8 +451,8 @@ static void stmmac_eee_ctrl_timer(struct timer_list *t) { struct stmmac_priv *priv = from_timer(priv, t, eee_ctrl_timer); - stmmac_enable_eee_mode(priv); - mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(priv->tx_lpi_timer)); + if (stmmac_enable_eee_mode(priv)) + mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(priv->tx_lpi_timer)); } /** @@ -2647,8 +2648,8 @@ static int stmmac_tx_clean(struct stmmac_priv *priv, int budget, u32 queue) if (priv->eee_enabled && !priv->tx_path_in_lpi_mode && priv->eee_sw_timer_en) { - stmmac_enable_eee_mode(priv); - mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(priv->tx_lpi_timer)); + if (stmmac_enable_eee_mode(priv)) + mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(priv->tx_lpi_timer)); } /* We still have pending packets, let's call for a new scheduling */ -- cgit v1.2.3-58-ga151 From 29eb31542787e1019208a2e1047bb7c76c069536 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Mon, 24 Jan 2022 11:29:54 +0800 Subject: yam: fix a memory leak in yam_siocdevprivate() ym needs to be free when ym->cmd != SIOCYAMSMCS. Fixes: 0781168e23a2 ("yam: fix a missing-check bug") Signed-off-by: Hangyu Hua Signed-off-by: David S. Miller --- drivers/net/hamradio/yam.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 6376b8485976..980f2be32f05 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -950,9 +950,7 @@ static int yam_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __ ym = memdup_user(data, sizeof(struct yamdrv_ioctl_mcs)); if (IS_ERR(ym)) return PTR_ERR(ym); - if (ym->cmd != SIOCYAMSMCS) - return -EINVAL; - if (ym->bitrate > YAM_MAXBITRATE) { + if (ym->cmd != SIOCYAMSMCS || ym->bitrate > YAM_MAXBITRATE) { kfree(ym); return -EINVAL; } -- cgit v1.2.3-58-ga151 From c63003e3d99761afb280add3b30de1cf30fa522b Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 24 Jan 2022 15:35:29 +0100 Subject: net: cpsw: Properly initialise struct page_pool_params MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpsw driver didn't properly initialise the struct page_pool_params before calling page_pool_create(), which leads to crashes after the struct has been expanded with new parameters. The second Fixes tag below is where the buggy code was introduced, but because the code was moved around this patch will only apply on top of the commit in the first Fixes tag. Fixes: c5013ac1dd0e ("net: ethernet: ti: cpsw: move set of common functions in cpsw_priv") Fixes: 9ed4050c0d75 ("net: ethernet: ti: cpsw: add XDP support") Reported-by: Colin Foster Signed-off-by: Toke Høiland-Jørgensen Tested-by: Colin Foster Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_priv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index ba220593e6db..8f6817f346ba 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1146,7 +1146,7 @@ int cpsw_fill_rx_channels(struct cpsw_priv *priv) static struct page_pool *cpsw_create_page_pool(struct cpsw_common *cpsw, int size) { - struct page_pool_params pp_params; + struct page_pool_params pp_params = {}; struct page_pool *pool; pp_params.order = 0; -- cgit v1.2.3-58-ga151 From 74afa30630976861a1308c5ec93391f8b037a2ae Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 24 Jan 2022 09:22:49 -0800 Subject: net: fec_mpc52xx: don't discard const from netdev->dev_addr Recent changes made netdev->dev_addr const, and it's passed directly to mpc52xx_fec_set_paddr(). Similar problem exists on the probe patch, the driver needs to call eth_hw_addr_set(). Reported-by: Geert Uytterhoeven Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/fec_mpc52xx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c b/drivers/net/ethernet/freescale/fec_mpc52xx.c index bbbde9f701c2..be0bd4b44926 100644 --- a/drivers/net/ethernet/freescale/fec_mpc52xx.c +++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c @@ -99,13 +99,13 @@ static void mpc52xx_fec_tx_timeout(struct net_device *dev, unsigned int txqueue) netif_wake_queue(dev); } -static void mpc52xx_fec_set_paddr(struct net_device *dev, u8 *mac) +static void mpc52xx_fec_set_paddr(struct net_device *dev, const u8 *mac) { struct mpc52xx_fec_priv *priv = netdev_priv(dev); struct mpc52xx_fec __iomem *fec = priv->fec; - out_be32(&fec->paddr1, *(u32 *)(&mac[0])); - out_be32(&fec->paddr2, (*(u16 *)(&mac[4]) << 16) | FEC_PADDR2_TYPE); + out_be32(&fec->paddr1, *(const u32 *)(&mac[0])); + out_be32(&fec->paddr2, (*(const u16 *)(&mac[4]) << 16) | FEC_PADDR2_TYPE); } static int mpc52xx_fec_set_mac_address(struct net_device *dev, void *addr) @@ -893,13 +893,15 @@ static int mpc52xx_fec_probe(struct platform_device *op) rv = of_get_ethdev_address(np, ndev); if (rv) { struct mpc52xx_fec __iomem *fec = priv->fec; + u8 addr[ETH_ALEN] __aligned(4); /* * If the MAC addresse is not provided via DT then read * it back from the controller regs */ - *(u32 *)(&ndev->dev_addr[0]) = in_be32(&fec->paddr1); - *(u16 *)(&ndev->dev_addr[4]) = in_be32(&fec->paddr2) >> 16; + *(u32 *)(&addr[0]) = in_be32(&fec->paddr1); + *(u16 *)(&addr[4]) = in_be32(&fec->paddr2) >> 16; + eth_hw_addr_set(ndev, addr); } /* -- cgit v1.2.3-58-ga151 From 2f61353cd2f789a4229b6f5c1c24a40a613357bb Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Tue, 25 Jan 2022 15:03:12 +0800 Subject: net: hns3: handle empty unknown interrupt for VF Since some interrupt states may be cleared by hardware, the driver may receive an empty interrupt. Currently, the VF driver directly disables the vector0 interrupt in this case. As a result, the VF is unavailable. Therefore, the vector0 interrupt should be enabled in this case. Fixes: b90fcc5bd904 ("net: hns3: add reset handling for VF when doing Core/Global/IMP reset") Signed-off-by: Yufeng Mo Signed-off-by: Guangbin Huang Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 7df87610ad96..21442a9bb996 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2043,8 +2043,7 @@ static irqreturn_t hclgevf_misc_irq_handle(int irq, void *data) break; } - if (event_cause != HCLGEVF_VECTOR0_EVENT_OTHER) - hclgevf_enable_vector(&hdev->misc_vector, true); + hclgevf_enable_vector(&hdev->misc_vector, true); return IRQ_HANDLED; } -- cgit v1.2.3-58-ga151 From 8bdd24940b69c0018b64b496aa3b03a25f7295ca Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Tue, 25 Jan 2022 15:40:06 +0100 Subject: amd: declance: use eth_hw_addr_set() Copy scattered mac address octets into an array then eth_hw_addr_set(). Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Thomas Bogendoerfer Link: https://lore.kernel.org/r/20220125144007.64407-1-tsbogend@alpha.franken.de Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/declance.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c index 493b0cefcc2a..ec8df05e7bf6 100644 --- a/drivers/net/ethernet/amd/declance.c +++ b/drivers/net/ethernet/amd/declance.c @@ -1032,6 +1032,7 @@ static int dec_lance_probe(struct device *bdev, const int type) int i, ret; unsigned long esar_base; unsigned char *esar; + u8 addr[ETH_ALEN]; const char *desc; if (dec_lance_debug && version_printed++ == 0) @@ -1228,7 +1229,8 @@ static int dec_lance_probe(struct device *bdev, const int type) break; } for (i = 0; i < 6; i++) - dev->dev_addr[i] = esar[i * 4]; + addr[i] = esar[i * 4]; + eth_hw_addr_set(dev, addr); printk("%s: %s, addr = %pM, irq = %d\n", name, desc, dev->dev_addr, dev->irq); -- cgit v1.2.3-58-ga151 From 7938d61591d33394a21bdd7797a245b65428f44c Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 19 Oct 2021 13:27:10 +0100 Subject: drm/i915: Flush TLBs before releasing backing store We need to flush TLBs before releasing backing store otherwise userspace is able to encounter stale entries if a) it is not declaring access to certain buffers and b) it races with the backing store release from a such undeclared execution already executing on the GPU in parallel. The approach taken is to mark any buffer objects which were ever bound to the GPU and to trigger a serialized TLB flush when their backing store is released. Alternatively the flushing could be done on VMA unbind, at which point we would be able to ascertain whether there is potential a parallel GPU execution (which could race), but essentially it boils down to paying the cost of TLB flushes potentially needlessly at VMA unbind time (when the backing store is not known to be going away so not needed for safety), versus potentially needlessly at backing store relase time (since we at that point cannot tell whether there is anything executing on the GPU which uses that object). Thereforce simplicity of implementation has been chosen for now with scope to benchmark and refine later as required. Signed-off-by: Tvrtko Ursulin Reported-by: Sushma Venkatesh Reddy Reviewed-by: Daniel Vetter Acked-by: Dave Airlie Cc: Daniel Vetter Cc: Jon Bloomfield Cc: Joonas Lahtinen Cc: Jani Nikula Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/gem/i915_gem_object_types.h | 1 + drivers/gpu/drm/i915/gem/i915_gem_pages.c | 10 +++ drivers/gpu/drm/i915/gt/intel_gt.c | 108 +++++++++++++++++++++++ drivers/gpu/drm/i915/gt/intel_gt.h | 2 + drivers/gpu/drm/i915/gt/intel_gt_types.h | 2 + drivers/gpu/drm/i915/i915_reg.h | 11 +++ drivers/gpu/drm/i915/i915_vma.c | 3 + drivers/gpu/drm/i915/intel_uncore.c | 26 +++++- drivers/gpu/drm/i915/intel_uncore.h | 2 + 9 files changed, 161 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 4b4829eb16c2..0dd107dcecc2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -311,6 +311,7 @@ struct drm_i915_gem_object { #define I915_BO_READONLY BIT(6) #define I915_TILING_QUIRK_BIT 7 /* unknown swizzling; do not release! */ #define I915_BO_PROTECTED BIT(8) +#define I915_BO_WAS_BOUND_BIT 9 /** * @mem_flags - Mutable placement-related flags * diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index 9f429ed6e78a..a50f884973bc 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -10,6 +10,8 @@ #include "i915_gem_lmem.h" #include "i915_gem_mman.h" +#include "gt/intel_gt.h" + void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj, struct sg_table *pages, unsigned int sg_page_sizes) @@ -221,6 +223,14 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj) __i915_gem_object_reset_page_iter(obj); obj->mm.page_sizes.phys = obj->mm.page_sizes.sg = 0; + if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); + intel_wakeref_t wakeref; + + with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref) + intel_gt_invalidate_tlbs(to_gt(i915)); + } + return pages; } diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c index f98f0fb21efb..35d0fcd3a86c 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.c +++ b/drivers/gpu/drm/i915/gt/intel_gt.c @@ -29,6 +29,8 @@ void __intel_gt_init_early(struct intel_gt *gt, struct drm_i915_private *i915) { spin_lock_init(>->irq_lock); + mutex_init(>->tlb_invalidate_lock); + INIT_LIST_HEAD(>->closed_vma); spin_lock_init(>->closed_lock); @@ -912,3 +914,109 @@ void intel_gt_info_print(const struct intel_gt_info *info, intel_sseu_dump(&info->sseu, p); } + +struct reg_and_bit { + i915_reg_t reg; + u32 bit; +}; + +static struct reg_and_bit +get_reg_and_bit(const struct intel_engine_cs *engine, const bool gen8, + const i915_reg_t *regs, const unsigned int num) +{ + const unsigned int class = engine->class; + struct reg_and_bit rb = { }; + + if (drm_WARN_ON_ONCE(&engine->i915->drm, + class >= num || !regs[class].reg)) + return rb; + + rb.reg = regs[class]; + if (gen8 && class == VIDEO_DECODE_CLASS) + rb.reg.reg += 4 * engine->instance; /* GEN8_M2TCR */ + else + rb.bit = engine->instance; + + rb.bit = BIT(rb.bit); + + return rb; +} + +void intel_gt_invalidate_tlbs(struct intel_gt *gt) +{ + static const i915_reg_t gen8_regs[] = { + [RENDER_CLASS] = GEN8_RTCR, + [VIDEO_DECODE_CLASS] = GEN8_M1TCR, /* , GEN8_M2TCR */ + [VIDEO_ENHANCEMENT_CLASS] = GEN8_VTCR, + [COPY_ENGINE_CLASS] = GEN8_BTCR, + }; + static const i915_reg_t gen12_regs[] = { + [RENDER_CLASS] = GEN12_GFX_TLB_INV_CR, + [VIDEO_DECODE_CLASS] = GEN12_VD_TLB_INV_CR, + [VIDEO_ENHANCEMENT_CLASS] = GEN12_VE_TLB_INV_CR, + [COPY_ENGINE_CLASS] = GEN12_BLT_TLB_INV_CR, + }; + struct drm_i915_private *i915 = gt->i915; + struct intel_uncore *uncore = gt->uncore; + struct intel_engine_cs *engine; + enum intel_engine_id id; + const i915_reg_t *regs; + unsigned int num = 0; + + if (I915_SELFTEST_ONLY(gt->awake == -ENODEV)) + return; + + if (GRAPHICS_VER(i915) == 12) { + regs = gen12_regs; + num = ARRAY_SIZE(gen12_regs); + } else if (GRAPHICS_VER(i915) >= 8 && GRAPHICS_VER(i915) <= 11) { + regs = gen8_regs; + num = ARRAY_SIZE(gen8_regs); + } else if (GRAPHICS_VER(i915) < 8) { + return; + } + + if (drm_WARN_ONCE(&i915->drm, !num, + "Platform does not implement TLB invalidation!")) + return; + + GEM_TRACE("\n"); + + assert_rpm_wakelock_held(&i915->runtime_pm); + + mutex_lock(>->tlb_invalidate_lock); + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + + for_each_engine(engine, gt, id) { + /* + * HW architecture suggest typical invalidation time at 40us, + * with pessimistic cases up to 100us and a recommendation to + * cap at 1ms. We go a bit higher just in case. + */ + const unsigned int timeout_us = 100; + const unsigned int timeout_ms = 4; + struct reg_and_bit rb; + + rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num); + if (!i915_mmio_reg_offset(rb.reg)) + continue; + + intel_uncore_write_fw(uncore, rb.reg, rb.bit); + if (__intel_wait_for_register_fw(uncore, + rb.reg, rb.bit, 0, + timeout_us, timeout_ms, + NULL)) + drm_err_ratelimited(>->i915->drm, + "%s TLB invalidation did not complete in %ums!\n", + engine->name, timeout_ms); + } + + /* + * Use delayed put since a) we mostly expect a flurry of TLB + * invalidations so it is good to avoid paying the forcewake cost and + * b) it works around a bug in Icelake which cannot cope with too rapid + * transitions. + */ + intel_uncore_forcewake_put_delayed(uncore, FORCEWAKE_ALL); + mutex_unlock(>->tlb_invalidate_lock); +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index 3ace129eb2af..a913fb6ffec3 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -91,4 +91,6 @@ void intel_gt_info_print(const struct intel_gt_info *info, void intel_gt_watchdog_work(struct work_struct *work); +void intel_gt_invalidate_tlbs(struct intel_gt *gt); + #endif /* __INTEL_GT_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_types.h b/drivers/gpu/drm/i915/gt/intel_gt_types.h index 14216cc471b1..f20687796490 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_types.h +++ b/drivers/gpu/drm/i915/gt/intel_gt_types.h @@ -73,6 +73,8 @@ struct intel_gt { struct intel_uc uc; + struct mutex tlb_invalidate_lock; + struct i915_wa_list wa_list; struct intel_gt_timelines { diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 971d601fe751..c32420cb8ed5 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -2721,6 +2721,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING (1 << 28) #define GAMT_CHKN_DISABLE_I2M_CYCLE_ON_WR_PORT (1 << 24) +#define GEN8_RTCR _MMIO(0x4260) +#define GEN8_M1TCR _MMIO(0x4264) +#define GEN8_M2TCR _MMIO(0x4268) +#define GEN8_BTCR _MMIO(0x426c) +#define GEN8_VTCR _MMIO(0x4270) + #if 0 #define PRB0_TAIL _MMIO(0x2030) #define PRB0_HEAD _MMIO(0x2034) @@ -2819,6 +2825,11 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define FAULT_VA_HIGH_BITS (0xf << 0) #define FAULT_GTT_SEL (1 << 4) +#define GEN12_GFX_TLB_INV_CR _MMIO(0xced8) +#define GEN12_VD_TLB_INV_CR _MMIO(0xcedc) +#define GEN12_VE_TLB_INV_CR _MMIO(0xcee0) +#define GEN12_BLT_TLB_INV_CR _MMIO(0xcee4) + #define GEN12_AUX_ERR_DBG _MMIO(0x43f4) #define FPGA_DBG _MMIO(0x42300) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index 29a858c53bdd..c0d6d5526abe 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -457,6 +457,9 @@ int i915_vma_bind(struct i915_vma *vma, vma->ops->bind_vma(vma->vm, NULL, vma, cache_level, bind_flags); } + if (vma->obj) + set_bit(I915_BO_WAS_BOUND_BIT, &vma->obj->flags); + atomic_or(bind_flags, &vma->flags); return 0; } diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index fc25ebf1a593..778da3179b3c 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -724,7 +724,8 @@ void intel_uncore_forcewake_get__locked(struct intel_uncore *uncore, } static void __intel_uncore_forcewake_put(struct intel_uncore *uncore, - enum forcewake_domains fw_domains) + enum forcewake_domains fw_domains, + bool delayed) { struct intel_uncore_forcewake_domain *domain; unsigned int tmp; @@ -739,7 +740,11 @@ static void __intel_uncore_forcewake_put(struct intel_uncore *uncore, continue; } - fw_domains_put(uncore, domain->mask); + if (delayed && + !(domain->uncore->fw_domains_timer & domain->mask)) + fw_domain_arm_timer(domain); + else + fw_domains_put(uncore, domain->mask); } } @@ -760,7 +765,20 @@ void intel_uncore_forcewake_put(struct intel_uncore *uncore, return; spin_lock_irqsave(&uncore->lock, irqflags); - __intel_uncore_forcewake_put(uncore, fw_domains); + __intel_uncore_forcewake_put(uncore, fw_domains, false); + spin_unlock_irqrestore(&uncore->lock, irqflags); +} + +void intel_uncore_forcewake_put_delayed(struct intel_uncore *uncore, + enum forcewake_domains fw_domains) +{ + unsigned long irqflags; + + if (!uncore->fw_get_funcs) + return; + + spin_lock_irqsave(&uncore->lock, irqflags); + __intel_uncore_forcewake_put(uncore, fw_domains, true); spin_unlock_irqrestore(&uncore->lock, irqflags); } @@ -802,7 +820,7 @@ void intel_uncore_forcewake_put__locked(struct intel_uncore *uncore, if (!uncore->fw_get_funcs) return; - __intel_uncore_forcewake_put(uncore, fw_domains); + __intel_uncore_forcewake_put(uncore, fw_domains, false); } void assert_forcewakes_inactive(struct intel_uncore *uncore) diff --git a/drivers/gpu/drm/i915/intel_uncore.h b/drivers/gpu/drm/i915/intel_uncore.h index 210fe2a71612..2a15b2b2e2fc 100644 --- a/drivers/gpu/drm/i915/intel_uncore.h +++ b/drivers/gpu/drm/i915/intel_uncore.h @@ -246,6 +246,8 @@ void intel_uncore_forcewake_get(struct intel_uncore *uncore, enum forcewake_domains domains); void intel_uncore_forcewake_put(struct intel_uncore *uncore, enum forcewake_domains domains); +void intel_uncore_forcewake_put_delayed(struct intel_uncore *uncore, + enum forcewake_domains domains); void intel_uncore_forcewake_flush(struct intel_uncore *uncore, enum forcewake_domains fw_domains); -- cgit v1.2.3-58-ga151 From 429c3be8a5e2695b5b92a6a12361eb89eb185495 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 25 Jan 2022 12:06:54 +0200 Subject: sch_htb: Fail on unsupported parameters when offload is requested The current implementation of HTB offload doesn't support some parameters. Instead of ignoring them, actively return the EINVAL error when they are set to non-defaults. As this patch goes to stable, the driver API is not changed here. If future drivers support more offload parameters, the checks can be moved to the driver side. Note that the buffer and cbuffer parameters are also not supported, but the tc userspace tool assigns some default values derived from rate and ceil, and identifying these defaults in sch_htb would be unreliable, so they are still ignored. Fixes: d03b195b5aa0 ("sch_htb: Hierarchical QoS hardware offload") Reported-by: Jakub Kicinski Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/20220125100654.424570-1-maximmi@nvidia.com Signed-off-by: Jakub Kicinski --- net/sched/sch_htb.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 9267922ea9c3..23a9d6242429 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1810,6 +1810,26 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!hopt->rate.rate || !hopt->ceil.rate) goto failure; + if (q->offload) { + /* Options not supported by the offload. */ + if (hopt->rate.overhead || hopt->ceil.overhead) { + NL_SET_ERR_MSG(extack, "HTB offload doesn't support the overhead parameter"); + goto failure; + } + if (hopt->rate.mpu || hopt->ceil.mpu) { + NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter"); + goto failure; + } + if (hopt->quantum) { + NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter"); + goto failure; + } + if (hopt->prio) { + NL_SET_ERR_MSG(extack, "HTB offload doesn't support the prio parameter"); + goto failure; + } + } + /* Keeping backward compatible with rate_table based iproute2 tc */ if (hopt->rate.linklayer == TC_LINKLAYER_UNAWARE) qdisc_put_rtab(qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB], -- cgit v1.2.3-58-ga151 From 007c95120d1b054fb637dff24636a307a4889016 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:37:56 -0800 Subject: ethernet: 3com/typhoon: don't write directly to netdev->dev_addr This driver casts off the const and writes directly to netdev->dev_addr. This will result in a MAC address tree corruption and a warning. Compile tested ppc6xx_defconfig. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/3com/typhoon.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index 481f1df3106c..8aec5d9fbfef 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -2278,6 +2278,7 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) struct net_device *dev; struct typhoon *tp; int card_id = (int) ent->driver_data; + u8 addr[ETH_ALEN] __aligned(4); void __iomem *ioaddr; void *shared; dma_addr_t shared_dma; @@ -2409,8 +2410,9 @@ typhoon_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) goto error_out_reset; } - *(__be16 *)&dev->dev_addr[0] = htons(le16_to_cpu(xp_resp[0].parm1)); - *(__be32 *)&dev->dev_addr[2] = htonl(le32_to_cpu(xp_resp[0].parm2)); + *(__be16 *)&addr[0] = htons(le16_to_cpu(xp_resp[0].parm1)); + *(__be32 *)&addr[2] = htonl(le32_to_cpu(xp_resp[0].parm2)); + eth_hw_addr_set(dev, addr); if (!is_valid_ether_addr(dev->dev_addr)) { err_msg = "Could not obtain valid ethernet address, aborting"; -- cgit v1.2.3-58-ga151 From 14ba66a60fbfbe69bb7faf6b45e9803c2efd7a23 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:37:57 -0800 Subject: ethernet: tundra: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Maintain the questionable offsetting in ndo_set_mac_address. Compile tested holly_defconfig and mpc7448_hpc2_defconfig. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/tundra/tsi108_eth.c | 35 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/tundra/tsi108_eth.c b/drivers/net/ethernet/tundra/tsi108_eth.c index cf0917b29e30..5251fc324221 100644 --- a/drivers/net/ethernet/tundra/tsi108_eth.c +++ b/drivers/net/ethernet/tundra/tsi108_eth.c @@ -1091,20 +1091,22 @@ static int tsi108_get_mac(struct net_device *dev) struct tsi108_prv_data *data = netdev_priv(dev); u32 word1 = TSI_READ(TSI108_MAC_ADDR1); u32 word2 = TSI_READ(TSI108_MAC_ADDR2); + u8 addr[ETH_ALEN]; /* Note that the octets are reversed from what the manual says, * producing an even weirder ordering... */ if (word2 == 0 && word1 == 0) { - dev->dev_addr[0] = 0x00; - dev->dev_addr[1] = 0x06; - dev->dev_addr[2] = 0xd2; - dev->dev_addr[3] = 0x00; - dev->dev_addr[4] = 0x00; + addr[0] = 0x00; + addr[1] = 0x06; + addr[2] = 0xd2; + addr[3] = 0x00; + addr[4] = 0x00; if (0x8 == data->phy) - dev->dev_addr[5] = 0x01; + addr[5] = 0x01; else - dev->dev_addr[5] = 0x02; + addr[5] = 0x02; + eth_hw_addr_set(dev, addr); word2 = (dev->dev_addr[0] << 16) | (dev->dev_addr[1] << 24); @@ -1114,12 +1116,13 @@ static int tsi108_get_mac(struct net_device *dev) TSI_WRITE(TSI108_MAC_ADDR1, word1); TSI_WRITE(TSI108_MAC_ADDR2, word2); } else { - dev->dev_addr[0] = (word2 >> 16) & 0xff; - dev->dev_addr[1] = (word2 >> 24) & 0xff; - dev->dev_addr[2] = (word1 >> 0) & 0xff; - dev->dev_addr[3] = (word1 >> 8) & 0xff; - dev->dev_addr[4] = (word1 >> 16) & 0xff; - dev->dev_addr[5] = (word1 >> 24) & 0xff; + addr[0] = (word2 >> 16) & 0xff; + addr[1] = (word2 >> 24) & 0xff; + addr[2] = (word1 >> 0) & 0xff; + addr[3] = (word1 >> 8) & 0xff; + addr[4] = (word1 >> 16) & 0xff; + addr[5] = (word1 >> 24) & 0xff; + eth_hw_addr_set(dev, addr); } if (!is_valid_ether_addr(dev->dev_addr)) { @@ -1136,14 +1139,12 @@ static int tsi108_set_mac(struct net_device *dev, void *addr) { struct tsi108_prv_data *data = netdev_priv(dev); u32 word1, word2; - int i; if (!is_valid_ether_addr(addr)) return -EADDRNOTAVAIL; - for (i = 0; i < 6; i++) - /* +2 is for the offset of the HW addr type */ - dev->dev_addr[i] = ((unsigned char *)addr)[i + 2]; + /* +2 is for the offset of the HW addr type */ + eth_hw_addr_set(dev, ((unsigned char *)addr) + 2); word2 = (dev->dev_addr[0] << 16) | (dev->dev_addr[1] << 24); -- cgit v1.2.3-58-ga151 From 7f6ec2b2f01b45b39cd2ffcd49fdb30d0b6c626d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:37:58 -0800 Subject: ethernet: broadcom/sb1250-mac: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Compile tested bigsur_defconfig and sb1250_swarm_defconfig. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/sb1250-mac.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/sb1250-mac.c b/drivers/net/ethernet/broadcom/sb1250-mac.c index f38f40eb966e..a1a38456c9a3 100644 --- a/drivers/net/ethernet/broadcom/sb1250-mac.c +++ b/drivers/net/ethernet/broadcom/sb1250-mac.c @@ -2183,9 +2183,7 @@ static int sbmac_init(struct platform_device *pldev, long long base) ea_reg >>= 8; } - for (i = 0; i < 6; i++) { - dev->dev_addr[i] = eaddr[i]; - } + eth_hw_addr_set(dev, eaddr); /* * Initialize context (get pointers to registers and stuff), then -- cgit v1.2.3-58-ga151 From 98ef22bbae788e05109aba16b77deb9f37bfb285 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:37:59 -0800 Subject: ethernet: i825xx: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Compile tested rpc_defconfig w/ GCC 8.5. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/ethernet/i825xx/ether1.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/i825xx/ether1.c b/drivers/net/ethernet/i825xx/ether1.c index c612ef526d16..3e7d7c4bafdc 100644 --- a/drivers/net/ethernet/i825xx/ether1.c +++ b/drivers/net/ethernet/i825xx/ether1.c @@ -986,6 +986,7 @@ static int ether1_probe(struct expansion_card *ec, const struct ecard_id *id) { struct net_device *dev; + u8 addr[ETH_ALEN]; int i, ret = 0; ether1_banner(); @@ -1015,7 +1016,8 @@ ether1_probe(struct expansion_card *ec, const struct ecard_id *id) } for (i = 0; i < 6; i++) - dev->dev_addr[i] = readb(IDPROM_ADDRESS + (i << 2)); + addr[i] = readb(IDPROM_ADDRESS + (i << 2)); + eth_hw_addr_set(dev, addr); if (ether1_init_2(dev)) { ret = -ENODEV; -- cgit v1.2.3-58-ga151 From 5518c5246ba600f52c108474ecea615ef8ff9e25 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:38:00 -0800 Subject: ethernet: 8390/etherh: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Compile tested rpc_defconfig w/ GCC 8.5. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/ethernet/8390/etherh.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/8390/etherh.c b/drivers/net/ethernet/8390/etherh.c index bd22a534b1c0..e7b879123bb1 100644 --- a/drivers/net/ethernet/8390/etherh.c +++ b/drivers/net/ethernet/8390/etherh.c @@ -655,6 +655,7 @@ etherh_probe(struct expansion_card *ec, const struct ecard_id *id) struct ei_device *ei_local; struct net_device *dev; struct etherh_priv *eh; + u8 addr[ETH_ALEN]; int ret; ret = ecard_request_resources(ec); @@ -724,12 +725,13 @@ etherh_probe(struct expansion_card *ec, const struct ecard_id *id) spin_lock_init(&ei_local->page_lock); if (ec->cid.product == PROD_ANT_ETHERM) { - etherm_addr(dev->dev_addr); + etherm_addr(addr); ei_local->reg_offset = etherm_regoffsets; } else { - etherh_addr(dev->dev_addr, ec); + etherh_addr(addr, ec); ei_local->reg_offset = etherh_regoffsets; } + eth_hw_addr_set(dev, addr); ei_local->name = dev->name; ei_local->word16 = 1; -- cgit v1.2.3-58-ga151 From 8eb86fc2f4905899a4684c1e03555e42556f4d53 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 25 Jan 2022 16:38:01 -0800 Subject: ethernet: seeq/ether3: don't write directly to netdev->dev_addr netdev->dev_addr is const now. Compile tested rpc_defconfig w/ GCC 8.5. Fixes: adeef3e32146 ("net: constify netdev->dev_addr") Signed-off-by: Jakub Kicinski Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/ethernet/seeq/ether3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/seeq/ether3.c b/drivers/net/ethernet/seeq/ether3.c index 16a4cbae9326..c672f92d65e9 100644 --- a/drivers/net/ethernet/seeq/ether3.c +++ b/drivers/net/ethernet/seeq/ether3.c @@ -749,6 +749,7 @@ ether3_probe(struct expansion_card *ec, const struct ecard_id *id) const struct ether3_data *data = id->data; struct net_device *dev; int bus_type, ret; + u8 addr[ETH_ALEN]; ether3_banner(); @@ -776,7 +777,8 @@ ether3_probe(struct expansion_card *ec, const struct ecard_id *id) priv(dev)->seeq = priv(dev)->base + data->base_offset; dev->irq = ec->irq; - ether3_addr(dev->dev_addr, ec); + ether3_addr(addr, ec); + eth_hw_addr_set(dev, addr); priv(dev)->dev = dev; timer_setup(&priv(dev)->timer, ether3_ledoff, 0); -- cgit v1.2.3-58-ga151 From b6ab149654ef1fada0b37b379c04c475c2fdc675 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 25 Jan 2022 12:48:15 +0100 Subject: net: lan966x: Fix sleep in atomic context when injecting frames On lan966x, when injecting a frame it was polling the register QS_INJ_STATUS to see if it can continue with the injection of the frame. The problem was that it was using readx_poll_timeout which could sleep in atomic context. This patch fixes this issue by using readx_poll_timeout_atomic. Fixes: d28d6d2e37d10d ("net: lan966x: add port module support") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c index 2cb70da63db3..1f60fd125a1d 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c @@ -182,9 +182,9 @@ static int lan966x_port_inj_ready(struct lan966x *lan966x, u8 grp) { u32 val; - return readx_poll_timeout(lan966x_port_inj_status, lan966x, val, - QS_INJ_STATUS_FIFO_RDY_GET(val) & BIT(grp), - READL_SLEEP_US, READL_TIMEOUT_US); + return readx_poll_timeout_atomic(lan966x_port_inj_status, lan966x, val, + QS_INJ_STATUS_FIFO_RDY_GET(val) & BIT(grp), + READL_SLEEP_US, READL_TIMEOUT_US); } static int lan966x_port_ifh_xmit(struct sk_buff *skb, -- cgit v1.2.3-58-ga151 From 77bdaf39f3c8a10892bace06ed60bd063b731529 Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Tue, 25 Jan 2022 12:48:16 +0100 Subject: net: lan966x: Fix sleep in atomic context when updating MAC table The function lan966x_mac_wait_for_completion is used to poll the status of the MAC table using the function readx_poll_timeout. The problem with this function is that is called also from atomic context. Therefore update the function to use readx_poll_timeout_atomic. Fixes: e18aba8941b40b ("net: lan966x: add mactable support") Signed-off-by: Horatiu Vultur Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan966x/lan966x_mac.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c index ca5f1177963d..ce5970bdcc6a 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_mac.c @@ -40,11 +40,12 @@ static int lan966x_mac_wait_for_completion(struct lan966x *lan966x) { u32 val; - return readx_poll_timeout(lan966x_mac_get_status, - lan966x, val, - (ANA_MACACCESS_MAC_TABLE_CMD_GET(val)) == - MACACCESS_CMD_IDLE, - TABLE_UPDATE_SLEEP_US, TABLE_UPDATE_TIMEOUT_US); + return readx_poll_timeout_atomic(lan966x_mac_get_status, + lan966x, val, + (ANA_MACACCESS_MAC_TABLE_CMD_GET(val)) == + MACACCESS_CMD_IDLE, + TABLE_UPDATE_SLEEP_US, + TABLE_UPDATE_TIMEOUT_US); } static void lan966x_mac_select(struct lan966x *lan966x, -- cgit v1.2.3-58-ga151 From a92f7a6feeb3884c69c1c7c1f13bccecb2228ad0 Mon Sep 17 00:00:00 2001 From: Catherine Sullivan Date: Tue, 25 Jan 2022 16:38:43 -0800 Subject: gve: Fix GFP flags when allocing pages Use GFP_ATOMIC when allocating pages out of the hotpath, continue to use GFP_KERNEL when allocating pages during setup. GFP_KERNEL will allow blocking which allows it to succeed more often in a low memory enviornment but in the hotpath we do not want to allow the allocation to block. Fixes: f5cedc84a30d2 ("gve: Add transmit and receive support") Signed-off-by: Catherine Sullivan Signed-off-by: David Awogbemila Link: https://lore.kernel.org/r/20220126003843.3584521-1-awogbemila@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve.h | 2 +- drivers/net/ethernet/google/gve/gve_main.c | 6 +++--- drivers/net/ethernet/google/gve/gve_rx.c | 3 ++- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 5f5d4f7aa813..160735484465 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -843,7 +843,7 @@ static inline bool gve_is_gqi(struct gve_priv *priv) /* buffers */ int gve_alloc_page(struct gve_priv *priv, struct device *dev, struct page **page, dma_addr_t *dma, - enum dma_data_direction); + enum dma_data_direction, gfp_t gfp_flags); void gve_free_page(struct device *dev, struct page *page, dma_addr_t dma, enum dma_data_direction); /* tx handling */ diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index f7f65c4bf993..54e51c8221b8 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -766,9 +766,9 @@ static void gve_free_rings(struct gve_priv *priv) int gve_alloc_page(struct gve_priv *priv, struct device *dev, struct page **page, dma_addr_t *dma, - enum dma_data_direction dir) + enum dma_data_direction dir, gfp_t gfp_flags) { - *page = alloc_page(GFP_KERNEL); + *page = alloc_page(gfp_flags); if (!*page) { priv->page_alloc_fail++; return -ENOMEM; @@ -811,7 +811,7 @@ static int gve_alloc_queue_page_list(struct gve_priv *priv, u32 id, for (i = 0; i < pages; i++) { err = gve_alloc_page(priv, &priv->pdev->dev, &qpl->pages[i], &qpl->page_buses[i], - gve_qpl_dma_dir(priv, id)); + gve_qpl_dma_dir(priv, id), GFP_KERNEL); /* caller handles clean up */ if (err) return -ENOMEM; diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c index 9ddcc497f48e..2068199445bd 100644 --- a/drivers/net/ethernet/google/gve/gve_rx.c +++ b/drivers/net/ethernet/google/gve/gve_rx.c @@ -86,7 +86,8 @@ static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev, dma_addr_t dma; int err; - err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE); + err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE, + GFP_ATOMIC); if (err) return err; diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index beb8bb079023..8c939628e2d8 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -157,7 +157,7 @@ static int gve_alloc_page_dqo(struct gve_priv *priv, int err; err = gve_alloc_page(priv, &priv->pdev->dev, &buf_state->page_info.page, - &buf_state->addr, DMA_FROM_DEVICE); + &buf_state->addr, DMA_FROM_DEVICE, GFP_KERNEL); if (err) return err; -- cgit v1.2.3-58-ga151 From d7e4f8545b497b3f5687e592f1c355cbaee64c8c Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 26 Jan 2022 13:04:26 +0800 Subject: pid: Introduce helper task_is_in_init_pid_ns() Currently the kernel uses open code in multiple places to check if a task is in the root PID namespace with the kind of format: if (task_active_pid_ns(current) == &init_pid_ns) do_something(); This patch creates a new helper function, task_is_in_init_pid_ns(), it returns true if a passed task is in the root PID namespace, otherwise returns false. So it will be used to replace open codes. Suggested-by: Suzuki K Poulose Signed-off-by: Leo Yan Reviewed-by: Leon Romanovsky Acked-by: Suzuki K Poulose Acked-by: Balbir Singh Signed-off-by: Jakub Kicinski --- include/linux/pid_namespace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c7e627503d2..07481bb87d4e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -86,4 +86,9 @@ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); void pidhash_init(void); void pid_idr_init(void); +static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) +{ + return task_active_pid_ns(tsk) == &init_pid_ns; +} + #endif /* _LINUX_PID_NS_H */ -- cgit v1.2.3-58-ga151 From 42c66d16756402c4749d94f005a998a43e8fa338 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 26 Jan 2022 13:04:27 +0800 Subject: connector/cn_proc: Use task_is_in_init_pid_ns() This patch replaces open code with task_is_in_init_pid_ns() to check if a task is in root PID namespace. Signed-off-by: Leo Yan Acked-by: Balbir Singh Signed-off-by: Jakub Kicinski --- drivers/connector/cn_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 646ad385e490..ccac1c453080 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -358,7 +358,7 @@ static void cn_proc_mcast_ctl(struct cn_msg *msg, * other namespaces. */ if ((current_user_ns() != &init_user_ns) || - (task_active_pid_ns(current) != &init_pid_ns)) + !task_is_in_init_pid_ns(current)) return; /* Can only change if privileged. */ -- cgit v1.2.3-58-ga151 From 36268983e90316b37000a005642af42234dabb36 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Wed, 26 Jan 2022 16:38:52 +0100 Subject: Revert "ipv6: Honor all IPv6 PIO Valid Lifetime values" This reverts commit b75326c201242de9495ff98e5d5cff41d7fc0d9d. This commit breaks Linux compatibility with USGv6 tests. The RFC this commit was based on is actually an expired draft: no published RFC currently allows the new behaviour it introduced. Without full IETF endorsement, the flash renumbering scenario this patch was supposed to enable is never going to work, as other IPv6 equipements on the same LAN will keep the 2 hours limit. Fixes: b75326c20124 ("ipv6: Honor all IPv6 PIO Valid Lifetime values") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 ++ net/ipv6/addrconf.c | 27 ++++++++++++++++++++------- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 78ea3e332688..e7ce719838b5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -6,6 +6,8 @@ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ +#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ + #define TEMP_VALID_LIFETIME (7*86400) #define TEMP_PREFERRED_LIFETIME (86400) #define REGEN_MAX_RETRY (3) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3eee17790a82..f927c199a93c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2589,7 +2589,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, __u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); - int create = 0; + int create = 0, update_lft = 0; if (!ifp && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; @@ -2633,19 +2633,32 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, unsigned long now; u32 stored_lft; - /* Update lifetime (RFC4862 5.5.3 e) - * We deviate from RFC4862 by honoring all Valid Lifetimes to - * improve the reaction of SLAAC to renumbering events - * (draft-gont-6man-slaac-renum-06, Section 4.2) - */ + /* update lifetime (RFC2462 5.5.3 e) */ spin_lock_bh(&ifp->lock); now = jiffies; if (ifp->valid_lft > (now - ifp->tstamp) / HZ) stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!create && stored_lft) { + const u32 minimum_lft = min_t(u32, + stored_lft, MIN_VALID_LIFETIME); + valid_lft = max(valid_lft, minimum_lft); + + /* RFC4862 Section 5.5.3e: + * "Note that the preferred lifetime of the + * corresponding address is always reset to + * the Preferred Lifetime in the received + * Prefix Information option, regardless of + * whether the valid lifetime is also reset or + * ignored." + * + * So we should always update prefered_lft here. + */ + update_lft = 1; + } + + if (update_lft) { ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; ifp->tstamp = now; -- cgit v1.2.3-58-ga151 From 94c82de43e01ef5747a95e4a590880de863fe423 Mon Sep 17 00:00:00 2001 From: Mohammad Athari Bin Ismail Date: Wed, 26 Jan 2022 17:47:22 +0800 Subject: net: stmmac: configure PTP clock source prior to PTP initialization For Intel platform, it is required to configure PTP clock source prior PTP initialization in MAC. So, need to move ptp_clk_freq_config execution from stmmac_ptp_register() to stmmac_init_ptp(). Fixes: 76da35dc99af ("stmmac: intel: Add PSE and PCH PTP clock source selection") Cc: # 5.15.x Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +++ drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 54071ae73879..25c802344356 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -890,6 +890,9 @@ static int stmmac_init_ptp(struct stmmac_priv *priv) bool xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac; int ret; + if (priv->plat->ptp_clk_freq_config) + priv->plat->ptp_clk_freq_config(priv); + ret = stmmac_init_tstamp_counter(priv, STMMAC_HWTS_ACTIVE); if (ret) return ret; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index 0d24ebd37873..1c9f02f9c317 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -297,9 +297,6 @@ void stmmac_ptp_register(struct stmmac_priv *priv) { int i; - if (priv->plat->ptp_clk_freq_config) - priv->plat->ptp_clk_freq_config(priv); - for (i = 0; i < priv->dma_cap.pps_out_num; i++) { if (i >= STMMAC_PPS_MAX) break; -- cgit v1.2.3-58-ga151 From 0735e639f129dff455aeb91da291f5c578cc33db Mon Sep 17 00:00:00 2001 From: Mohammad Athari Bin Ismail Date: Wed, 26 Jan 2022 17:47:23 +0800 Subject: net: stmmac: skip only stmmac_ptp_register when resume from suspend When resume from suspend, besides skipping PTP registration, it also skipping PTP HW initialization. This could cause PTP clock not able to operate properly when resume from suspend. To fix this, only stmmac_ptp_register() is skipped when resume from suspend. Fixes: fe1319291150 ("stmmac: Don't init ptp again when resume from suspend/hibernation") Cc: # 5.15.x Signed-off-by: Mohammad Athari Bin Ismail Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 25c802344356..639a753266e6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -915,8 +915,6 @@ static int stmmac_init_ptp(struct stmmac_priv *priv) priv->hwts_tx_en = 0; priv->hwts_rx_en = 0; - stmmac_ptp_register(priv); - return 0; } @@ -3242,7 +3240,7 @@ static int stmmac_fpe_start_wq(struct stmmac_priv *priv) /** * stmmac_hw_setup - setup mac in a usable state. * @dev : pointer to the device structure. - * @init_ptp: initialize PTP if set + * @ptp_register: register PTP if set * Description: * this is the main function to setup the HW in a usable state because the * dma engine is reset, the core registers are configured (e.g. AXI, @@ -3252,7 +3250,7 @@ static int stmmac_fpe_start_wq(struct stmmac_priv *priv) * 0 on success and an appropriate (-)ve integer as defined in errno.h * file on failure. */ -static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) +static int stmmac_hw_setup(struct net_device *dev, bool ptp_register) { struct stmmac_priv *priv = netdev_priv(dev); u32 rx_cnt = priv->plat->rx_queues_to_use; @@ -3309,13 +3307,13 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp) stmmac_mmc_setup(priv); - if (init_ptp) { - ret = stmmac_init_ptp(priv); - if (ret == -EOPNOTSUPP) - netdev_warn(priv->dev, "PTP not supported by HW\n"); - else if (ret) - netdev_warn(priv->dev, "PTP init failed\n"); - } + ret = stmmac_init_ptp(priv); + if (ret == -EOPNOTSUPP) + netdev_warn(priv->dev, "PTP not supported by HW\n"); + else if (ret) + netdev_warn(priv->dev, "PTP init failed\n"); + else if (ptp_register) + stmmac_ptp_register(priv); priv->eee_tw_timer = STMMAC_DEFAULT_TWT_LS; -- cgit v1.2.3-58-ga151 From dcb2c5c6ca9b9177f04abaf76e5a983d177c9414 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 26 Jan 2022 15:10:25 +0200 Subject: net: bridge: vlan: fix single net device option dumping When dumping vlan options for a single net device we send the same entries infinitely because user-space expects a 0 return at the end but we keep returning skb->len and restarting the dump on retry. Fix it by returning the value from br_vlan_dump_dev() if it completed or there was an error. The only case that must return skb->len is when the dump was incomplete and needs to continue (-EMSGSIZE). Reported-by: Benjamin Poirier Fixes: 8dcea187088b ("net: bridge: vlan: add rtm definitions and dump support") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_vlan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 84ba456a78cc..43201260e37b 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -2020,7 +2020,8 @@ static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) goto out_err; } err = br_vlan_dump_dev(dev, skb, cb, dump_flags); - if (err && err != -EMSGSIZE) + /* if the dump completed without an error we return 0 here */ + if (err != -EMSGSIZE) goto out_err; } else { for_each_netdev_rcu(net, dev) { -- cgit v1.2.3-58-ga151 From 9e0db41e7a0b6f1271cbcfb16dbf5b8641b4e440 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 27 Jan 2022 00:52:15 +0800 Subject: net: stmmac: dwmac-sun8i: use return val of readl_poll_timeout() When readl_poll_timeout() timeout, we'd better directly use its return value. Before this patch: [ 2.145528] dwmac-sun8i: probe of 4500000.ethernet failed with error -14 After this patch: [ 2.138520] dwmac-sun8i: probe of 4500000.ethernet failed with error -110 Signed-off-by: Jisheng Zhang Acked-by: Jernej Skrabec Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 617d0e4c6495..09644ab0d87a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -756,7 +756,7 @@ static int sun8i_dwmac_reset(struct stmmac_priv *priv) if (err) { dev_err(priv->device, "EMAC reset timeout\n"); - return -EFAULT; + return err; } return 0; } -- cgit v1.2.3-58-ga151 From 492fefbaafb9d9f49d8d14614d57c956a81f2ea1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 12:24:24 -0800 Subject: MAINTAINERS: add more files to eth PHY include/linux/linkmode.h and include/linux/mii.h do not match anything in MAINTAINERS. Looks like they should be under Ethernet PHY. Signed-off-by: Jakub Kicinski Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0d7883977e9b..3e22999669c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7216,8 +7216,10 @@ F: drivers/net/mdio/of_mdio.c F: drivers/net/pcs/ F: drivers/net/phy/ F: include/dt-bindings/net/qca-ar803x.h +F: include/linux/linkmode.h F: include/linux/*mdio*.h F: include/linux/mdio/*.h +F: include/linux/mii.h F: include/linux/of_net.h F: include/linux/phy.h F: include/linux/phy_fixed.h -- cgit v1.2.3-58-ga151 From 966f435add4821f53bf1c507dadc3ba9aaeb5f01 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 14:55:35 -0800 Subject: MAINTAINERS: add missing IPv4/IPv6 header paths Add missing headers to the IP entry. Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3e22999669c4..b904348fed0d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13461,7 +13461,11 @@ L: netdev@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git F: arch/x86/net/* +F: include/linux/ip.h +F: include/linux/ipv6* +F: include/net/fib* F: include/net/ip* +F: include/net/route.h F: net/ipv4/ F: net/ipv6/ -- cgit v1.2.3-58-ga151 From 153a0d187e767c68733b8e9f46218eb1f41ab902 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 16:51:16 -0800 Subject: ipv4: raw: lock the socket in raw_bind() For some reason, raw_bind() forgot to lock the socket. BUG: KCSAN: data-race in __ip4_datagram_connect / raw_bind write to 0xffff8881170d4308 of 4 bytes by task 5466 on cpu 0: raw_bind+0x1b0/0x250 net/ipv4/raw.c:739 inet_bind+0x56/0xa0 net/ipv4/af_inet.c:443 __sys_bind+0x14b/0x1b0 net/socket.c:1697 __do_sys_bind net/socket.c:1708 [inline] __se_sys_bind net/socket.c:1706 [inline] __x64_sys_bind+0x3d/0x50 net/socket.c:1706 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff8881170d4308 of 4 bytes by task 5468 on cpu 1: __ip4_datagram_connect+0xb7/0x7b0 net/ipv4/datagram.c:39 ip4_datagram_connect+0x2a/0x40 net/ipv4/datagram.c:89 inet_dgram_connect+0x107/0x190 net/ipv4/af_inet.c:576 __sys_connect_file net/socket.c:1900 [inline] __sys_connect+0x197/0x1b0 net/socket.c:1917 __do_sys_connect net/socket.c:1927 [inline] __se_sys_connect net/socket.c:1924 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:1924 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0x0003007f Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 5468 Comm: syz-executor.5 Not tainted 5.17.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/raw.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a53f256bf9d3..0505935b6b8c 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -722,6 +722,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) int ret = -EINVAL; int chk_addr_ret; + lock_sock(sk); if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; @@ -741,7 +742,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; -out: return ret; +out: + release_sock(sk); + return ret; } /* -- cgit v1.2.3-58-ga151 From a0f90c8815706981c483a652a6aefca51a5e191c Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 27 Jan 2022 18:34:19 +1000 Subject: drm/vmwgfx: Fix stale file descriptors on failed usercopy A failing usercopy of the fence_rep object will lead to a stale entry in the file descriptor table as put_unused_fd() won't release it. This enables userland to refer to a dangling 'file' object through that still valid file descriptor, leading to all kinds of use-after-free exploitation scenarios. Fix this by deferring the call to fd_install() until after the usercopy has succeeded. Fixes: c906965dee22 ("drm/vmwgfx: Add export fence to file descriptor support") Signed-off-by: Mathias Krause Signed-off-by: Zack Rusin Signed-off-by: Dave Airlie Signed-off-by: Linus Torvalds --- drivers/gpu/drm/vmwgfx/vmwgfx_drv.h | 5 ++--- drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c | 33 +++++++++++++++++---------------- drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 2 +- drivers/gpu/drm/vmwgfx/vmwgfx_kms.c | 2 +- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h index d6b66636a19b..ea3ecdda561d 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.h @@ -1140,15 +1140,14 @@ extern int vmw_execbuf_fence_commands(struct drm_file *file_priv, struct vmw_private *dev_priv, struct vmw_fence_obj **p_fence, uint32_t *p_handle); -extern void vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, +extern int vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, struct vmw_fpriv *vmw_fp, int ret, struct drm_vmw_fence_rep __user *user_fence_rep, struct vmw_fence_obj *fence, uint32_t fence_handle, - int32_t out_fence_fd, - struct sync_file *sync_file); + int32_t out_fence_fd); bool vmw_cmd_describe(const void *buf, u32 *size, char const **cmd); /** diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c index 44ca23b0ea4e..dd2ff441068e 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c @@ -3879,17 +3879,17 @@ int vmw_execbuf_fence_commands(struct drm_file *file_priv, * Also if copying fails, user-space will be unable to signal the fence object * so we wait for it immediately, and then unreference the user-space reference. */ -void +int vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, struct vmw_fpriv *vmw_fp, int ret, struct drm_vmw_fence_rep __user *user_fence_rep, struct vmw_fence_obj *fence, uint32_t fence_handle, - int32_t out_fence_fd, struct sync_file *sync_file) + int32_t out_fence_fd) { struct drm_vmw_fence_rep fence_rep; if (user_fence_rep == NULL) - return; + return 0; memset(&fence_rep, 0, sizeof(fence_rep)); @@ -3917,19 +3917,13 @@ vmw_execbuf_copy_fence_user(struct vmw_private *dev_priv, * handle. */ if (unlikely(ret != 0) && (fence_rep.error == 0)) { - if (sync_file) - fput(sync_file->file); - - if (fence_rep.fd != -1) { - put_unused_fd(fence_rep.fd); - fence_rep.fd = -1; - } - ttm_ref_object_base_unref(vmw_fp->tfile, fence_handle); VMW_DEBUG_USER("Fence copy error. Syncing.\n"); (void) vmw_fence_obj_wait(fence, false, false, VMW_FENCE_WAIT_TIMEOUT); } + + return ret ? -EFAULT : 0; } /** @@ -4266,16 +4260,23 @@ int vmw_execbuf_process(struct drm_file *file_priv, (void) vmw_fence_obj_wait(fence, false, false, VMW_FENCE_WAIT_TIMEOUT); + } + } + + ret = vmw_execbuf_copy_fence_user(dev_priv, vmw_fpriv(file_priv), ret, + user_fence_rep, fence, handle, out_fence_fd); + + if (sync_file) { + if (ret) { + /* usercopy of fence failed, put the file object */ + fput(sync_file->file); + put_unused_fd(out_fence_fd); } else { /* Link the fence with the FD created earlier */ fd_install(out_fence_fd, sync_file->file); } } - vmw_execbuf_copy_fence_user(dev_priv, vmw_fpriv(file_priv), ret, - user_fence_rep, fence, handle, out_fence_fd, - sync_file); - /* Don't unreference when handing fence out */ if (unlikely(out_fence != NULL)) { *out_fence = fence; @@ -4293,7 +4294,7 @@ int vmw_execbuf_process(struct drm_file *file_priv, */ vmw_validation_unref_lists(&val_ctx); - return 0; + return ret; out_unlock_binding: mutex_unlock(&dev_priv->binding_mutex); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c index c60d395f9e2e..5001b87aebe8 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c @@ -1128,7 +1128,7 @@ int vmw_fence_event_ioctl(struct drm_device *dev, void *data, } vmw_execbuf_copy_fence_user(dev_priv, vmw_fp, 0, user_fence_rep, fence, - handle, -1, NULL); + handle, -1); vmw_fence_obj_unreference(&fence); return 0; out_no_create: diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c index 4e693e8de2c3..bbd2f4ec08ec 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c @@ -2501,7 +2501,7 @@ void vmw_kms_helper_validation_finish(struct vmw_private *dev_priv, if (file_priv) vmw_execbuf_copy_fence_user(dev_priv, vmw_fpriv(file_priv), ret, user_fence_rep, fence, - handle, -1, NULL); + handle, -1); if (out_fence) *out_fence = fence; else -- cgit v1.2.3-58-ga151 From 970a5a3ea86da637471d3cd04d513a0755aba4bf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 17:10:21 -0800 Subject: ipv4: tcp: send zero IPID in SYNACK messages In commit 431280eebed9 ("ipv4: tcp: send zero IPID for RST and ACK sent in SYN-RECV and TIME-WAIT state") we took care of some ctl packets sent by TCP. It turns out we need to use a similar strategy for SYNACK packets. By default, they carry IP_DF and IPID==0, but there are ways to ask them to use the hashed IP ident generator and thus be used to build off-path attacks. (Ref: Off-Path TCP Exploits of the Mixed IPID Assignment) One of this way is to force (before listener is started) echo 1 >/proc/sys/net/ipv4/ip_no_pmtu_disc Another way is using forged ICMP ICMP_FRAG_NEEDED with a very small MTU (like 68) to force a false return from ip_dont_fragment() In this patch, ip_build_and_send_pkt() uses the following heuristics. 1) Most SYNACK packets are smaller than IPV4_MIN_MTU and therefore can use IP_DF regardless of the listener or route pmtu setting. 2) In case the SYNACK packet is bigger than IPV4_MIN_MTU, we use prandom_u32() generator instead of the IPv4 hashed ident one. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: Ray Che Reviewed-by: David Ahern Cc: Geoff Alexander Cc: Willy Tarreau Signed-off-by: Jakub Kicinski --- net/ipv4/ip_output.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e331c8d4e6cf..139cec29ed06 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -162,12 +162,19 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr); iph->saddr = saddr; iph->protocol = sk->sk_protocol; - if (ip_dont_fragment(sk, &rt->dst)) { + /* Do not bother generating IPID for small packets (eg SYNACK) */ + if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) { iph->frag_off = htons(IP_DF); iph->id = 0; } else { iph->frag_off = 0; - __ip_select_ident(net, iph, 1); + /* TCP packets here are SYNACK with fat IPv4/TCP options. + * Avoid using the hashed IP ident generator. + */ + if (sk->sk_protocol == IPPROTO_TCP) + iph->id = (__force __be16)prandom_u32(); + else + __ip_select_ident(net, iph, 1); } if (opt && opt->opt.optlen) { -- cgit v1.2.3-58-ga151 From 23f57406b82de51809d5812afd96f210f8b627f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 17:10:22 -0800 Subject: ipv4: avoid using shared IP generator for connected sockets ip_select_ident_segs() has been very conservative about using the connected socket private generator only for packets with IP_DF set, claiming it was needed for some VJ compression implementations. As mentioned in this referenced document, this can be abused. (Ref: Off-Path TCP Exploits of the Mixed IPID Assignment) Before switching to pure random IPID generation and possibly hurt some workloads, lets use the private inet socket generator. Not only this will remove one vulnerability, this will also improve performance of TCP flows using pmtudisc==IP_PMTUDISC_DONT Fixes: 73f156a6e8c1 ("inetpeer: get rid of ip_id_count") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Reported-by: Ray Che Cc: Willy Tarreau Signed-off-by: Jakub Kicinski --- include/net/ip.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 81e23a102a0d..b51bae43b0dd 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -525,19 +525,18 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, { struct iphdr *iph = ip_hdr(skb); + /* We had many attacks based on IPID, use the private + * generator as much as we can. + */ + if (sk && inet_sk(sk)->inet_daddr) { + iph->id = htons(inet_sk(sk)->inet_id); + inet_sk(sk)->inet_id += segs; + return; + } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { - /* This is only to work around buggy Windows95/2000 - * VJ compression implementations. If the ID field - * does not change, they drop every other packet in - * a TCP stream using header compression. - */ - if (sk && inet_sk(sk)->inet_daddr) { - iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += segs; - } else { - iph->id = 0; - } + iph->id = 0; } else { + /* Unfortunately we need the big hammer to get a suitable IPID */ __ip_select_ident(net, iph, segs); } } -- cgit v1.2.3-58-ga151 From 3c42b2019863b327caa233072c50739d4144dd16 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 17:34:04 -0800 Subject: ipv4: remove sparse error in ip_neigh_gw4() ./include/net/route.h:373:48: warning: incorrect type in argument 2 (different base types) ./include/net/route.h:373:48: expected unsigned int [usertype] key ./include/net/route.h:373:48: got restricted __be32 [usertype] daddr Fixes: 5c9f7c1dfc2e ("ipv4: Add helpers for neigh lookup for nexthop") Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220127013404.1279313-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/route.h b/include/net/route.h index 4c858dcf1aa8..25404fc2b483 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -370,7 +370,7 @@ static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, { struct neighbour *neigh; - neigh = __ipv4_neigh_lookup_noref(dev, daddr); + neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &daddr, dev, false); -- cgit v1.2.3-58-ga151 From 364df53c081d93fcfd6b91085ff2650c7f17b3c7 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 27 Jan 2022 17:13:01 +0800 Subject: net: socket: rename SKB_DROP_REASON_SOCKET_FILTER Rename SKB_DROP_REASON_SOCKET_FILTER, which is used as the reason of skb drop out of socket filter before it's part of a released kernel. It will be used for more protocols than just TCP in future series. Signed-off-by: Menglong Dong Reviewed-by: David Ahern Link: https://lore.kernel.org/all/20220127091308.91401-2-imagedong@tencent.com/ Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- include/trace/events/skb.h | 2 +- net/ipv4/tcp_ipv4.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bf11e1fbd69b..8a636e678902 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -318,7 +318,7 @@ enum skb_drop_reason { SKB_DROP_REASON_NO_SOCKET, SKB_DROP_REASON_PKT_TOO_SMALL, SKB_DROP_REASON_TCP_CSUM, - SKB_DROP_REASON_TCP_FILTER, + SKB_DROP_REASON_SOCKET_FILTER, SKB_DROP_REASON_UDP_CSUM, SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 3e042ca2cedb..a8a64b97504d 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -14,7 +14,7 @@ EM(SKB_DROP_REASON_NO_SOCKET, NO_SOCKET) \ EM(SKB_DROP_REASON_PKT_TOO_SMALL, PKT_TOO_SMALL) \ EM(SKB_DROP_REASON_TCP_CSUM, TCP_CSUM) \ - EM(SKB_DROP_REASON_TCP_FILTER, TCP_FILTER) \ + EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER) \ EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM) \ EMe(SKB_DROP_REASON_MAX, MAX) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b3f34e366b27..938b59636578 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2095,7 +2095,7 @@ process: nf_reset_ct(skb); if (tcp_filter(sk, skb)) { - drop_reason = SKB_DROP_REASON_TCP_FILTER; + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto discard_and_relse; } th = (const struct tcphdr *)skb->data; -- cgit v1.2.3-58-ga151 From fd20d9738395cf8e27d0a17eba34169699fccdff Mon Sep 17 00:00:00 2001 From: Tim Yi Date: Thu, 27 Jan 2022 15:49:53 +0800 Subject: net: bridge: vlan: fix memory leak in __allowed_ingress When using per-vlan state, if vlan snooping and stats are disabled, untagged or priority-tagged ingress frame will go to check pvid state. If the port state is forwarding and the pvid state is not learning/forwarding, untagged or priority-tagged frame will be dropped but skb memory is not freed. Should free skb when __allowed_ingress returns false. Fixes: a580c76d534c ("net: bridge: vlan: add per-vlan state") Signed-off-by: Tim Yi Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20220127074953.12632-1-tim.yi@pica8.com Signed-off-by: Jakub Kicinski --- net/bridge/br_vlan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 43201260e37b..1402d5ca242d 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -560,10 +560,10 @@ static bool __allowed_ingress(const struct net_bridge *br, !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); - return br_vlan_state_allowed(*state, true); - } else { - return true; + if (!br_vlan_state_allowed(*state, true)) + goto drop; } + return true; } } v = br_vlan_find(vg, *vid); -- cgit v1.2.3-58-ga151