From 17278a91e04f858155d54bee5528ba4fbcec6f87 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 14 Nov 2017 12:01:20 +0000 Subject: MIPS: CPS: Fix r1 .set mt assembler warning MIPS CPS has a build warning on kernels configured for MIPS32R1 or MIPS64R1, due to the use of .set mt without a prior .set mips{32,64}r2: arch/mips/kernel/cps-vec.S Assembler messages: arch/mips/kernel/cps-vec.S:238: Warning: the `mt' extension requires MIPS32 revision 2 or greater Add .set MIPS_ISA_LEVEL_RAW before .set mt to silence the warning. Fixes: 245a7868d2f2 ("MIPS: smp-cps: rework core/VPE initialisation") Signed-off-by: James Hogan Cc: Paul Burton Cc: James Hogan Cc: James Hogan Cc: Paul Burton Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17699/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/cps-vec.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/kernel/cps-vec.S b/arch/mips/kernel/cps-vec.S index c7ed26029cbb..e68e6e04063a 100644 --- a/arch/mips/kernel/cps-vec.S +++ b/arch/mips/kernel/cps-vec.S @@ -235,6 +235,7 @@ LEAF(mips_cps_core_init) has_mt t0, 3f .set push + .set MIPS_ISA_LEVEL_RAW .set mt /* Only allow 1 TC per VPE to execute... */ @@ -388,6 +389,7 @@ LEAF(mips_cps_boot_vpes) #elif defined(CONFIG_MIPS_MT) .set push + .set MIPS_ISA_LEVEL_RAW .set mt /* If the core doesn't support MT then return */ -- cgit v1.2.3-58-ga151 From a03fe72572c12e98f4173f8a535f32468e48b6ec Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:51:35 +0000 Subject: MIPS: Factor out NT_PRFPREG regset access helpers In preparation to fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") FCSR access regression factor out NT_PRFPREG regset access helpers for the non-MSA and the MSA variants respectively, to avoid having to deal with excessive indentation in the actual fix. No functional change, however use `target->thread.fpu.fpr[0]' rather than `target->thread.fpu.fpr[i]' for FGR holding type size determination as there's no `i' variable to refer to anymore, and for the factored out `i' variable declaration use `unsigned int' rather than `unsigned' as its type, following the common style. Signed-off-by: Maciej W. Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.15+ Patchwork: https://patchwork.linux-mips.org/patch/17925/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 108 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 25 deletions(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index efbd8df8b665..62e8ffd9370a 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -419,25 +419,36 @@ static int gpr64_set(struct task_struct *target, #endif /* CONFIG_64BIT */ -static int fpr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * !CONFIG_CPU_HAS_MSA variant. FP context's general register slots + * correspond 1:1 to buffer slots. + */ +static int fpr_get_fpa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + void **kbuf, void __user **ubuf) { - unsigned i; - int err; - u64 fpr_val; - - /* XXX fcr31 */ + return user_regset_copyout(pos, count, kbuf, ubuf, + &target->thread.fpu, + 0, sizeof(elf_fpregset_t)); +} - if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * CONFIG_CPU_HAS_MSA variant. Only lower 64 bits of FP context's + * general register slots are copied to buffer slots. + */ +static int fpr_get_msa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + void **kbuf, void __user **ubuf) +{ + unsigned int i; + u64 fpr_val; + int err; for (i = 0; i < NUM_FPU_REGS; i++) { fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); - err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + err = user_regset_copyout(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); if (err) @@ -447,27 +458,54 @@ static int fpr_get(struct task_struct *target, return 0; } -static int fpr_set(struct task_struct *target, +/* Copy the floating-point context to the supplied NT_PRFPREG buffer. */ +static int fpr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + void *kbuf, void __user *ubuf) { - unsigned i; int err; - u64 fpr_val; /* XXX fcr31 */ - init_fp_ctx(target); + if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) + err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf); + else + err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf); + + return err; +} - if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * !CONFIG_CPU_HAS_MSA variant. Buffer slots correspond 1:1 to FP + * context's general register slots. + */ +static int fpr_set_fpa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + const void **kbuf, const void __user **ubuf) +{ + return user_regset_copyin(pos, count, kbuf, ubuf, + &target->thread.fpu, + 0, sizeof(elf_fpregset_t)); +} + +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * CONFIG_CPU_HAS_MSA variant. Buffer slots are copied to lower 64 + * bits only of FP context's general register slots. + */ +static int fpr_set_msa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + const void **kbuf, const void __user **ubuf) +{ + unsigned int i; + u64 fpr_val; + int err; BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); - for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) { - err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) { + err = user_regset_copyin(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); if (err) @@ -478,6 +516,26 @@ static int fpr_set(struct task_struct *target, return 0; } +/* Copy the supplied NT_PRFPREG buffer to the floating-point context. */ +static int fpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int err; + + /* XXX fcr31 */ + + init_fp_ctx(target); + + if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) + err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf); + else + err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf); + + return err; +} + enum mips_regset { REGSET_GPR, REGSET_FPR, -- cgit v1.2.3-58-ga151 From dc24d0edf33c3e15099688b6bbdf7bdc24bf6e91 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:52:15 +0000 Subject: MIPS: Guard against any partial write attempt with PTRACE_SETREGSET Complement commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") and ensure that no partial register write attempt is made with PTRACE_SETREGSET, as we do not preinitialize any temporaries used to hold incoming register data and consequently random data could be written. It is the responsibility of the caller, such as `ptrace_regset', to arrange for writes to span whole registers only, so here we only assert that it has indeed happened. Signed-off-by: Maciej W. Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.15+ Patchwork: https://patchwork.linux-mips.org/patch/17926/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 62e8ffd9370a..7fcadaaf330f 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -516,7 +516,15 @@ static int fpr_set_msa(struct task_struct *target, return 0; } -/* Copy the supplied NT_PRFPREG buffer to the floating-point context. */ +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context. + * + * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', + * which is supposed to have been guaranteed by the kernel before + * calling us, e.g. in `ptrace_regset'. We enforce that requirement, + * so that we can safely avoid preinitializing temporaries for + * partial register writes. + */ static int fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, @@ -524,6 +532,8 @@ static int fpr_set(struct task_struct *target, { int err; + BUG_ON(count % sizeof(elf_fpreg_t)); + /* XXX fcr31 */ init_fp_ctx(target); -- cgit v1.2.3-58-ga151 From 80b3ffce0196ea50068885d085ff981e4b8396f4 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:53:14 +0000 Subject: MIPS: Consistently handle buffer counter with PTRACE_SETREGSET Update commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") bug and consistently consume all data supplied to `fpr_set_msa' with the ptrace(2) PTRACE_SETREGSET request, such that a zero data buffer counter is returned where insufficient data has been given to fill a whole number of FP general registers. In reality this is not going to happen, as the caller is supposed to only supply data covering a whole number of registers and it is verified in `ptrace_regset' and again asserted in `fpr_set', however structuring code such that the presence of trailing partial FP general register data causes `fpr_set_msa' to return with a non-zero data buffer counter makes it appear that this trailing data will be used if there are subsequent writes made to FP registers, which is going to be the case with the FCSR once the missing write to that register has been fixed. Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") Signed-off-by: Maciej W. Rozycki Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v4.11+ Patchwork: https://patchwork.linux-mips.org/patch/17927/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 7fcadaaf330f..47a01d5f26ea 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -504,7 +504,7 @@ static int fpr_set_msa(struct task_struct *target, int err; BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); - for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) { + for (i = 0; i < NUM_FPU_REGS && *count > 0; i++) { err = user_regset_copyin(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); -- cgit v1.2.3-58-ga151 From be07a6a1188372b6d19a3307ec33211fc9c9439d Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:54:33 +0000 Subject: MIPS: Fix an FCSR access API regression with NT_PRFPREG and MSA Fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") public API regression, then activated by commit 1db1af84d6df ("MIPS: Basic MSA context switching support"), that caused the FCSR register not to be read or written for CONFIG_CPU_HAS_MSA kernel configurations (regardless of actual presence or absence of the MSA feature in a given processor) with ptrace(2) PTRACE_GETREGSET and PTRACE_SETREGSET requests nor recorded in core dumps. This is because with !CONFIG_CPU_HAS_MSA configurations the whole of `elf_fpregset_t' array is bulk-copied as it is, which includes the FCSR in one half of the last, 33rd slot, whereas with CONFIG_CPU_HAS_MSA configurations array elements are copied individually, and then only the leading 32 FGR slots while the remaining slot is ignored. Correct the code then such that only FGR slots are copied in the respective !MSA and MSA helpers an then the FCSR slot is handled separately in common code. Use `ptrace_setfcr31' to update the FCSR too, so that the read-only mask is respected. Retrieving a correct value of FCSR is important in debugging not only for the human to be able to get the right interpretation of the situation, but for correct operation of GDB as well. This is because the condition code bits in FSCR are used by GDB to determine the location to place a breakpoint at when single-stepping through an FPU branch instruction. If such a breakpoint is placed incorrectly (i.e. with the condition reversed), then it will be missed, likely causing the debuggee to run away from the control of GDB and consequently breaking the process of investigation. Fortunately GDB continues using the older PTRACE_GETFPREGS ptrace(2) request which is unaffected, so the regression only really hits with post-mortem debug sessions using a core dump file, in which case execution, and consequently single-stepping through branches is not possible. Of course core files created by buggy kernels out there will have the value of FCSR recorded clobbered, but such core files cannot be corrected and the person using them simply will have to be aware that the value of FCSR retrieved is not reliable. Which also means we can likely get away without defining a replacement API which would ensure a correct value of FSCR to be retrieved, or none at all. This is based on previous work by Alex Smith, extensively rewritten. Signed-off-by: Alex Smith Signed-off-by: James Hogan Signed-off-by: Maciej W. Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: Paul Burton Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.15+ Patchwork: https://patchwork.linux-mips.org/patch/17928/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 47a01d5f26ea..0a939593ccb7 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -422,7 +422,7 @@ static int gpr64_set(struct task_struct *target, /* * Copy the floating-point context to the supplied NT_PRFPREG buffer, * !CONFIG_CPU_HAS_MSA variant. FP context's general register slots - * correspond 1:1 to buffer slots. + * correspond 1:1 to buffer slots. Only general registers are copied. */ static int fpr_get_fpa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -430,13 +430,14 @@ static int fpr_get_fpa(struct task_struct *target, { return user_regset_copyout(pos, count, kbuf, ubuf, &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + 0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); } /* * Copy the floating-point context to the supplied NT_PRFPREG buffer, * CONFIG_CPU_HAS_MSA variant. Only lower 64 bits of FP context's - * general register slots are copied to buffer slots. + * general register slots are copied to buffer slots. Only general + * registers are copied. */ static int fpr_get_msa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -458,20 +459,29 @@ static int fpr_get_msa(struct task_struct *target, return 0; } -/* Copy the floating-point context to the supplied NT_PRFPREG buffer. */ +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. + */ static int fpr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); int err; - /* XXX fcr31 */ - if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf); else err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf); + if (err) + return err; + + err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.fcr31, + fcr31_pos, fcr31_pos + sizeof(u32)); return err; } @@ -479,7 +489,7 @@ static int fpr_get(struct task_struct *target, /* * Copy the supplied NT_PRFPREG buffer to the floating-point context, * !CONFIG_CPU_HAS_MSA variant. Buffer slots correspond 1:1 to FP - * context's general register slots. + * context's general register slots. Only general registers are copied. */ static int fpr_set_fpa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -487,13 +497,14 @@ static int fpr_set_fpa(struct task_struct *target, { return user_regset_copyin(pos, count, kbuf, ubuf, &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + 0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); } /* * Copy the supplied NT_PRFPREG buffer to the floating-point context, * CONFIG_CPU_HAS_MSA variant. Buffer slots are copied to lower 64 - * bits only of FP context's general register slots. + * bits only of FP context's general register slots. Only general + * registers are copied. */ static int fpr_set_msa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -518,6 +529,8 @@ static int fpr_set_msa(struct task_struct *target, /* * Copy the supplied NT_PRFPREG buffer to the floating-point context. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. * * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', * which is supposed to have been guaranteed by the kernel before @@ -530,18 +543,30 @@ static int fpr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); + u32 fcr31; int err; BUG_ON(count % sizeof(elf_fpreg_t)); - /* XXX fcr31 */ - init_fp_ctx(target); if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf); else err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf); + if (err) + return err; + + if (count > 0) { + err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fcr31, + fcr31_pos, fcr31_pos + sizeof(u32)); + if (err) + return err; + + ptrace_setfcr31(target, fcr31); + } return err; } -- cgit v1.2.3-58-ga151 From 006501e039eec411842bb3150c41358867d320c2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:55:40 +0000 Subject: MIPS: Also verify sizeof `elf_fpreg_t' with PTRACE_SETREGSET Complement commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") and like with the PTRACE_GETREGSET ptrace(2) request also apply a BUILD_BUG_ON check for the size of the `elf_fpreg_t' type in the PTRACE_SETREGSET request handler. Signed-off-by: Maciej W. Rozycki Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v4.11+ Patchwork: https://patchwork.linux-mips.org/patch/17929/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 0a939593ccb7..256908951a7c 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -447,6 +447,7 @@ static int fpr_get_msa(struct task_struct *target, u64 fpr_val; int err; + BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); for (i = 0; i < NUM_FPU_REGS; i++) { fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); err = user_regset_copyout(pos, count, kbuf, ubuf, -- cgit v1.2.3-58-ga151 From c8c5a3a24d395b14447a9a89d61586a913840a3b Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:56:54 +0000 Subject: MIPS: Disallow outsized PTRACE_SETREGSET NT_PRFPREG regset accesses Complement commit c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use correct core dump register layout") and also reject outsized PTRACE_SETREGSET requests to the NT_PRFPREG regset, like with the NT_PRSTATUS regset. Signed-off-by: Maciej W. Rozycki Fixes: c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use correct core dump register layout") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # v3.17+ Patchwork: https://patchwork.linux-mips.org/patch/17930/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/ptrace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 256908951a7c..0b23b1ad99e6 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -550,6 +550,9 @@ static int fpr_set(struct task_struct *target, BUG_ON(count % sizeof(elf_fpreg_t)); + if (pos + count > sizeof(elf_fpregset_t)) + return -EIO; + init_fp_ctx(target); if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) -- cgit v1.2.3-58-ga151 From b67336eee3fcb8ecedc6c13e2bf88aacfa3151e2 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 27 Nov 2017 09:33:03 +0000 Subject: MIPS: Validate PR_SET_FP_MODE prctl(2) requests against the ABI of the task Fix an API loophole introduced with commit 9791554b45a2 ("MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS"), where the caller of prctl(2) is incorrectly allowed to make a change to CP0.Status.FR or CP0.Config5.FRE register bits even if CONFIG_MIPS_O32_FP64_SUPPORT has not been enabled, despite that an executable requesting the mode requested via ELF file annotation would not be allowed to run in the first place, or for n64 and n64 ABI tasks which do not have non-default modes defined at all. Add suitable checks to `mips_set_process_fp_mode' and bail out if an invalid mode change has been requested for the ABI in effect, even if the FPU hardware or emulation would otherwise allow it. Always succeed however without taking any further action if the mode requested is the same as one already in effect, regardless of whether any mode change, should it be requested, would actually be allowed for the task concerned. Signed-off-by: Maciej W. Rozycki Fixes: 9791554b45a2 ("MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS") Reviewed-by: Paul Burton Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org # 4.0+ Patchwork: https://patchwork.linux-mips.org/patch/17800/ Signed-off-by: Ralf Baechle --- arch/mips/kernel/process.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 45d0b6b037ee..57028d49c202 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -705,6 +705,18 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value) struct task_struct *t; int max_users; + /* If nothing to change, return right away, successfully. */ + if (value == mips_get_process_fp_mode(task)) + return 0; + + /* Only accept a mode change if 64-bit FP enabled for o32. */ + if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT)) + return -EOPNOTSUPP; + + /* And only for o32 tasks. */ + if (IS_ENABLED(CONFIG_64BIT) && !test_thread_flag(TIF_32BIT_REGS)) + return -EOPNOTSUPP; + /* Check the value is valid */ if (value & ~known_bits) return -EOPNOTSUPP; -- cgit v1.2.3-58-ga151 From 955b1b5a00ba694159a7d3763412597f707c294d Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 20 Dec 2017 16:30:50 +0900 Subject: nvme-pci: move use_sgl initialization to nvme_init_iod() A flag "use_sgl" of "struct nvme_iod" has been used in nvme_init_iod() without being set to any value. It seems like "use_sgl" has been set in either nvme_pci_setup_prps() or nvme_pci_setup_sgls() which occur later than nvme_init_iod(). Make "iod->use_sgl" being set in a proper place, nvme_init_iod(). Also move nvme_pci_use_sgls() up above nvme_init_iod() to make it possible to be called by nvme_init_iod(). Signed-off-by: Minwoo Im Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f5800c3c9082..d53550e612bc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -448,12 +448,31 @@ static void **nvme_pci_iod_list(struct request *req) return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); } +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + unsigned int avg_seg_size; + + avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), + blk_rq_nr_phys_segments(req)); + + if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) + return false; + if (!iod->nvmeq->qid) + return false; + if (!sgl_threshold || avg_seg_size < sgl_threshold) + return false; + return true; +} + static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = blk_rq_nr_phys_segments(rq); unsigned int size = blk_rq_payload_bytes(rq); + iod->use_sgl = nvme_pci_use_sgls(dev, rq); + if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg, iod->use_sgl); @@ -604,8 +623,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, dma_addr_t prp_dma; int nprps, i; - iod->use_sgl = false; - length -= (page_size - offset); if (length <= 0) { iod->first_dma = 0; @@ -715,8 +732,6 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, int entries = iod->nents, i = 0; dma_addr_t sgl_dma; - iod->use_sgl = true; - /* setting the transfer type as SGL */ cmd->flags = NVME_CMD_SGL_METABUF; @@ -770,23 +785,6 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, return BLK_STS_OK; } -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - unsigned int avg_seg_size; - - avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), - blk_rq_nr_phys_segments(req)); - - if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) - return false; - if (!iod->nvmeq->qid) - return false; - if (!sgl_threshold || avg_seg_size < sgl_threshold) - return false; - return true; -} - static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { @@ -806,7 +804,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_ATTR_NO_WARN)) goto out; - if (nvme_pci_use_sgls(dev, req)) + if (iod->use_sgl) ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); else ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); -- cgit v1.2.3-58-ga151 From cee160fd34b459ace029653436319557a643795a Mon Sep 17 00:00:00 2001 From: Jeff Lien Date: Tue, 19 Dec 2017 13:24:15 -0600 Subject: nvme: fix sector units when going between formats If you format a device with a 4k sector size back to 512 bytes, the queue limit values for physical block size and minimum IO size were not getting updated; only the logical block size was being updated. This patch adds code to update the physical block and IO minimum sizes. Signed-off-by: Jeff Lien Reviewed-by: Martin K. Petersen Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1e46e60b8f10..961d6a4af19c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1335,6 +1335,7 @@ static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); + unsigned short bs = 1 << ns->lba_shift; unsigned stream_alignment = 0; if (ns->ctrl->nr_streams && ns->sws && ns->sgs) @@ -1343,7 +1344,10 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); - blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift); + blk_queue_logical_block_size(disk->queue, bs); + blk_queue_physical_block_size(disk->queue, bs); + blk_queue_io_min(disk->queue, bs); + if (ns->ms && !ns->ext && (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(disk, ns->ms, ns->pi_type); -- cgit v1.2.3-58-ga151 From d5bf4b7f437c250821d40c3e32158729e6b484ce Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Dec 2017 14:54:15 +0200 Subject: nvme-rdma: fix concurrent reset and reconnect Now ctrl state machine allows to transition from RESETTING to RECONNECTING. In nvme-rdma when we receive a rdma cm DISONNECTED event, we trigger nvme_rdma_error_recovery. This happens also when we execute a controller reset, issue a cm diconnect request and receive a cm disconnect reply, as a result, the reset work and the error recovery work can run concurrently. Until now the state machine prevented from the error recovery work from running as a result of a controller reset (RESETTING -> RECONNECTING was not allowed). To fix this, we adopt the FC state machine approach, we always transition from LIVE to RESETTING and only then to RECONNECTING. We do this both for the error recovery work and the controller reset work: 1. transition to RESETTING 2. teardown the controller association 3. transition to RECONNECTING This will restore the protection against reset work and error recovery work from concurrently running together. Fixes: 3cec7f9de448 ("nvme: allow controller RESETTING to RECONNECTING transition") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 37af56596be6..2a0bba7f50cf 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -974,12 +974,18 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_start_queues(&ctrl->ctrl); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + nvme_rdma_reconnect_or_remove(ctrl); } static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl) { - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) return; queue_work(nvme_wq, &ctrl->err_work); @@ -1753,6 +1759,12 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + /* state change failure should never happen */ + WARN_ON_ONCE(1); + return; + } + ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) goto out_fail; -- cgit v1.2.3-58-ga151 From 479a322fb729d657d34706ccf8dd12916f36628f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 21 Dec 2017 15:07:27 +0200 Subject: nvme-mpath: fix last path removal during traffic In case our last path is removed during traffic, we can end up requeueing the bio(s) but never schedule the actual requeue work as upper layers still have open handles on the mpath device node. Fix this by scheduling requeue work if the namespace being removed is the last path in the ns_head path list. Fixes: 32acab3181c7 ("nvme: implement multipath access to nvme subsystems") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/nvme.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 961d6a4af19c..839650e0926a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2991,6 +2991,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) mutex_unlock(&ns->ctrl->namespaces_mutex); synchronize_srcu(&ns->head->srcu); + nvme_mpath_check_last_path(ns); nvme_put_ns(ns); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ea1aa5283e8e..a00eabd06427 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -417,6 +417,15 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) rcu_assign_pointer(head->current_path, NULL); } struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); + +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + + if (head->disk && list_empty(&head->list)) + kblockd_schedule_work(&head->requeue_work); +} + #else static inline void nvme_failover_req(struct request *req) { @@ -448,6 +457,9 @@ static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns) static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { } +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) +{ +} #endif /* CONFIG_NVME_MULTIPATH */ #ifdef CONFIG_NVM -- cgit v1.2.3-58-ga151 From 254beb84faccbe2f4eda0b51924857bdfb679969 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 21 Dec 2017 14:15:47 -0800 Subject: nvme-fcloop: avoid possible uninitialized variable warning The kbuild test robot send mail of a potential use of an uninitialized variable - "tport" in fcloop_delete_targetport() which then calls __targetport_unreg() which uses the variable. It will never be the case it is uninitialized as the call to __targetport_unreg() only occurs if there is a valid nport pointer. And at the time the nport pointer is assigned, the tport variable is set. Remove the warning by assigning a NULL value initially. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 7b75d9de55ab..6a018a0bd6ce 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1085,7 +1085,7 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct fcloop_nport *nport = NULL, *tmpport; - struct fcloop_tport *tport; + struct fcloop_tport *tport = NULL; u64 nodename, portname; unsigned long flags; int ret; -- cgit v1.2.3-58-ga151 From fe08f34d066f4404934a509b6806db1a4f700c86 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 1 Jan 2018 09:50:50 +0100 Subject: ALSA: pcm: Remove incorrect snd_BUG_ON() usages syzkaller triggered kernel warnings through PCM OSS emulation at closing a stream: WARNING: CPU: 0 PID: 3502 at sound/core/pcm_lib.c:1635 snd_pcm_hw_param_first+0x289/0x690 sound/core/pcm_lib.c:1635 Call Trace: .... snd_pcm_hw_param_near.constprop.27+0x78d/0x9a0 sound/core/oss/pcm_oss.c:457 snd_pcm_oss_change_params+0x17d3/0x3720 sound/core/oss/pcm_oss.c:969 snd_pcm_oss_make_ready+0xaa/0x130 sound/core/oss/pcm_oss.c:1128 snd_pcm_oss_sync+0x257/0x830 sound/core/oss/pcm_oss.c:1638 snd_pcm_oss_release+0x20b/0x280 sound/core/oss/pcm_oss.c:2431 __fput+0x327/0x7e0 fs/file_table.c:210 .... This happens while it tries to open and set up the aloop device concurrently. The warning above (invoked from snd_BUG_ON() macro) is to detect the unexpected logical error where snd_pcm_hw_refine() call shouldn't fail. The theory is true for the case where the hw_params config rules are static. But for an aloop device, the hw_params rule condition does vary dynamically depending on the connected target; when another device is opened and changes the parameters, the device connected in another side is also affected, and it caused the error from snd_pcm_hw_refine(). That is, the simplest "solution" for this is to remove the incorrect assumption of static rules, and treat such an error as a normal error path. As there are a couple of other places using snd_BUG_ON() incorrectly, this patch removes these spurious snd_BUG_ON() calls. Reported-by: syzbot+6f11c7e2a1b91d466432@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai --- sound/core/oss/pcm_oss.c | 1 - sound/core/pcm_lib.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index e49f448ee04f..ceaa51f76591 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -455,7 +455,6 @@ static int snd_pcm_hw_param_near(struct snd_pcm_substream *pcm, v = snd_pcm_hw_param_last(pcm, params, var, dir); else v = snd_pcm_hw_param_first(pcm, params, var, dir); - snd_BUG_ON(v < 0); return v; } diff --git a/sound/core/pcm_lib.c b/sound/core/pcm_lib.c index 10e7ef7a8804..db7894bb028c 100644 --- a/sound/core/pcm_lib.c +++ b/sound/core/pcm_lib.c @@ -1632,7 +1632,7 @@ int snd_pcm_hw_param_first(struct snd_pcm_substream *pcm, return changed; if (params->rmask) { int err = snd_pcm_hw_refine(pcm, params); - if (snd_BUG_ON(err < 0)) + if (err < 0) return err; } return snd_pcm_hw_param_value(params, var, dir); @@ -1678,7 +1678,7 @@ int snd_pcm_hw_param_last(struct snd_pcm_substream *pcm, return changed; if (params->rmask) { int err = snd_pcm_hw_refine(pcm, params); - if (snd_BUG_ON(err < 0)) + if (err < 0) return err; } return snd_pcm_hw_param_value(params, var, dir); -- cgit v1.2.3-58-ga151 From 6708913750344a900f2e73bfe4a4d6dbbce4fe8d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 4 Jan 2018 16:39:27 +0100 Subject: ALSA: pcm: Add missing error checks in OSS emulation plugin builder In the OSS emulation plugin builder where the frame size is parsed in the plugin chain, some places miss the possible errors returned from the plugin src_ or dst_frames callback. This patch papers over such places. Cc: Signed-off-by: Takashi Iwai --- sound/core/oss/pcm_plugin.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sound/core/oss/pcm_plugin.c b/sound/core/oss/pcm_plugin.c index cadc93792868..85a56af104bd 100644 --- a/sound/core/oss/pcm_plugin.c +++ b/sound/core/oss/pcm_plugin.c @@ -592,18 +592,26 @@ snd_pcm_sframes_t snd_pcm_plug_write_transfer(struct snd_pcm_substream *plug, st snd_pcm_sframes_t frames = size; plugin = snd_pcm_plug_first(plug); - while (plugin && frames > 0) { + while (plugin) { + if (frames <= 0) + return frames; if ((next = plugin->next) != NULL) { snd_pcm_sframes_t frames1 = frames; - if (plugin->dst_frames) + if (plugin->dst_frames) { frames1 = plugin->dst_frames(plugin, frames); + if (frames1 <= 0) + return frames1; + } if ((err = next->client_channels(next, frames1, &dst_channels)) < 0) { return err; } if (err != frames1) { frames = err; - if (plugin->src_frames) + if (plugin->src_frames) { frames = plugin->src_frames(plugin, frames1); + if (frames <= 0) + return frames; + } } } else dst_channels = NULL; -- cgit v1.2.3-58-ga151 From 0856655a25476d4431005e39d606e349050066b0 Mon Sep 17 00:00:00 2001 From: Loic Poulain Date: Mon, 11 Dec 2017 09:52:22 +0100 Subject: wcn36xx: Fix dynamic power saving Since driver does not report hardware dynamic power saving cap, this is up to the mac80211 to manage power saving timeout and state machine, using the ieee80211 config callback to report PS changes. This patch enables/disables PS mode according to the new configuration. Remove old behaviour enabling PS mode in a static way, this make the device unusable when power save is enabled since device is forced to PS regardless RX/TX traffic. Acked-by: Bjorn Andersson Signed-off-by: Loic Poulain Signed-off-by: Kalle Valo --- drivers/net/wireless/ath/wcn36xx/main.c | 23 ++++++++++++----------- drivers/net/wireless/ath/wcn36xx/pmc.c | 6 ++++-- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c index f7d228b5ba93..987f1252a3cf 100644 --- a/drivers/net/wireless/ath/wcn36xx/main.c +++ b/drivers/net/wireless/ath/wcn36xx/main.c @@ -384,6 +384,18 @@ static int wcn36xx_config(struct ieee80211_hw *hw, u32 changed) } } + if (changed & IEEE80211_CONF_CHANGE_PS) { + list_for_each_entry(tmp, &wcn->vif_list, list) { + vif = wcn36xx_priv_to_vif(tmp); + if (hw->conf.flags & IEEE80211_CONF_PS) { + if (vif->bss_conf.ps) /* ps allowed ? */ + wcn36xx_pmc_enter_bmps_state(wcn, vif); + } else { + wcn36xx_pmc_exit_bmps_state(wcn, vif); + } + } + } + mutex_unlock(&wcn->conf_mutex); return 0; @@ -747,17 +759,6 @@ static void wcn36xx_bss_info_changed(struct ieee80211_hw *hw, vif_priv->dtim_period = bss_conf->dtim_period; } - if (changed & BSS_CHANGED_PS) { - wcn36xx_dbg(WCN36XX_DBG_MAC, - "mac bss PS set %d\n", - bss_conf->ps); - if (bss_conf->ps) { - wcn36xx_pmc_enter_bmps_state(wcn, vif); - } else { - wcn36xx_pmc_exit_bmps_state(wcn, vif); - } - } - if (changed & BSS_CHANGED_BSSID) { wcn36xx_dbg(WCN36XX_DBG_MAC, "mac bss changed_bssid %pM\n", bss_conf->bssid); diff --git a/drivers/net/wireless/ath/wcn36xx/pmc.c b/drivers/net/wireless/ath/wcn36xx/pmc.c index 589fe5f70971..1976b80c235f 100644 --- a/drivers/net/wireless/ath/wcn36xx/pmc.c +++ b/drivers/net/wireless/ath/wcn36xx/pmc.c @@ -45,8 +45,10 @@ int wcn36xx_pmc_exit_bmps_state(struct wcn36xx *wcn, struct wcn36xx_vif *vif_priv = wcn36xx_vif_to_priv(vif); if (WCN36XX_BMPS != vif_priv->pw_state) { - wcn36xx_err("Not in BMPS mode, no need to exit from BMPS mode!\n"); - return -EINVAL; + /* Unbalanced call or last BMPS enter failed */ + wcn36xx_dbg(WCN36XX_DBG_PMC, + "Not in BMPS mode, no need to exit\n"); + return -EALREADY; } wcn36xx_smd_exit_bmps(wcn, vif); vif_priv->pw_state = WCN36XX_FULL_POWER; -- cgit v1.2.3-58-ga151 From 943309d4aad6732b905f3f500e6e17e33c211494 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Thu, 4 Jan 2018 09:19:13 +0200 Subject: iwlwifi: pcie: fix DMA memory mapping / unmapping 22000 devices (previously referenced as A000) can support short transmit queues. This means that we have less DMA descriptors (TFD) for those shorter queues. Previous devices must still have 256 TFDs for each queue even if those 256 TFDs point to fewer buffers. When I introduced support for the short queues for 22000 I broke older devices by assuming that they can also have less TFDs in their queues. This led to several problems: 1) the payload of the commands weren't unmapped properly which caused the SWIOTLB to complain at some point. 2) the hardware could get confused and we get hardware crashes. The corresponding bugzilla entries are: https://bugzilla.kernel.org/show_bug.cgi?id=198201 https://bugzilla.kernel.org/show_bug.cgi?id=198265 Cc: stable@vger.kernel.org # 4.14+ Fixes: 4ecab5616023 ("iwlwifi: pcie: support short Tx queues for A000 device family") Reviewed-by: Sharon, Sara Signed-off-by: Emmanuel Grumbach Signed-off-by: Kalle Valo --- drivers/net/wireless/intel/iwlwifi/pcie/internal.h | 10 +++++++--- drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c | 11 +++-------- drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 8 ++++---- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index d749abeca3ae..403e65c309d0 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -670,11 +670,15 @@ static inline u8 iwl_pcie_get_cmd_index(struct iwl_txq *q, u32 index) return index & (q->n_window - 1); } -static inline void *iwl_pcie_get_tfd(struct iwl_trans_pcie *trans_pcie, +static inline void *iwl_pcie_get_tfd(struct iwl_trans *trans, struct iwl_txq *txq, int idx) { - return txq->tfds + trans_pcie->tfd_size * iwl_pcie_get_cmd_index(txq, - idx); + struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); + + if (trans->cfg->use_tfh) + idx = iwl_pcie_get_cmd_index(txq, idx); + + return txq->tfds + trans_pcie->tfd_size * idx; } static inline void iwl_enable_rfkill_int(struct iwl_trans *trans) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c index 16b345f54ff0..6d0a907d5ba5 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx-gen2.c @@ -171,8 +171,6 @@ static void iwl_pcie_gen2_tfd_unmap(struct iwl_trans *trans, static void iwl_pcie_gen2_free_tfd(struct iwl_trans *trans, struct iwl_txq *txq) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); - /* rd_ptr is bounded by TFD_QUEUE_SIZE_MAX and * idx is bounded by n_window */ @@ -181,7 +179,7 @@ static void iwl_pcie_gen2_free_tfd(struct iwl_trans *trans, struct iwl_txq *txq) lockdep_assert_held(&txq->lock); iwl_pcie_gen2_tfd_unmap(trans, &txq->entries[idx].meta, - iwl_pcie_get_tfd(trans_pcie, txq, idx)); + iwl_pcie_get_tfd(trans, txq, idx)); /* free SKB */ if (txq->entries) { @@ -364,11 +362,9 @@ struct iwl_tfh_tfd *iwl_pcie_gen2_build_tfd(struct iwl_trans *trans, struct sk_buff *skb, struct iwl_cmd_meta *out_meta) { - struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; int idx = iwl_pcie_get_cmd_index(txq, txq->write_ptr); - struct iwl_tfh_tfd *tfd = - iwl_pcie_get_tfd(trans_pcie, txq, idx); + struct iwl_tfh_tfd *tfd = iwl_pcie_get_tfd(trans, txq, idx); dma_addr_t tb_phys; bool amsdu; int i, len, tb1_len, tb2_len, hdr_len; @@ -565,8 +561,7 @@ static int iwl_pcie_gen2_enqueue_hcmd(struct iwl_trans *trans, u8 group_id = iwl_cmd_groupid(cmd->id); const u8 *cmddata[IWL_MAX_CMD_TBS_PER_TFD]; u16 cmdlen[IWL_MAX_CMD_TBS_PER_TFD]; - struct iwl_tfh_tfd *tfd = - iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr); + struct iwl_tfh_tfd *tfd = iwl_pcie_get_tfd(trans, txq, txq->write_ptr); memset(tfd, 0, sizeof(*tfd)); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index fed6d842a5e1..3f85713c41dc 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -373,7 +373,7 @@ static void iwl_pcie_tfd_unmap(struct iwl_trans *trans, { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); int i, num_tbs; - void *tfd = iwl_pcie_get_tfd(trans_pcie, txq, index); + void *tfd = iwl_pcie_get_tfd(trans, txq, index); /* Sanity check on number of chunks */ num_tbs = iwl_pcie_tfd_get_num_tbs(trans, tfd); @@ -2018,7 +2018,7 @@ static int iwl_fill_data_tbs(struct iwl_trans *trans, struct sk_buff *skb, } trace_iwlwifi_dev_tx(trans->dev, skb, - iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr), + iwl_pcie_get_tfd(trans, txq, txq->write_ptr), trans_pcie->tfd_size, &dev_cmd->hdr, IWL_FIRST_TB_SIZE + tb1_len, hdr_len); @@ -2092,7 +2092,7 @@ static int iwl_fill_data_tbs_amsdu(struct iwl_trans *trans, struct sk_buff *skb, IEEE80211_CCMP_HDR_LEN : 0; trace_iwlwifi_dev_tx(trans->dev, skb, - iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr), + iwl_pcie_get_tfd(trans, txq, txq->write_ptr), trans_pcie->tfd_size, &dev_cmd->hdr, IWL_FIRST_TB_SIZE + tb1_len, 0); @@ -2425,7 +2425,7 @@ int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb, memcpy(&txq->first_tb_bufs[txq->write_ptr], &dev_cmd->hdr, IWL_FIRST_TB_SIZE); - tfd = iwl_pcie_get_tfd(trans_pcie, txq, txq->write_ptr); + tfd = iwl_pcie_get_tfd(trans, txq, txq->write_ptr); /* Set up entry for this TFD in Tx byte-count array */ iwl_pcie_txq_update_byte_cnt_tbl(trans, txq, le16_to_cpu(tx_cmd->len), iwl_pcie_tfd_get_num_tbs(trans, tfd)); -- cgit v1.2.3-58-ga151 From fb51f1cd06f9ced7b7085a2a4636375d520431ca Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 3 Jan 2018 15:16:30 +0100 Subject: ALSA: pcm: Workaround for weird PulseAudio behavior on rewind error The commit 9027c4639ef1 ("ALSA: pcm: Call ack() whenever appl_ptr is updated") introduced the possible error code returned from the PCM rewind ioctl. Basically the change was for handling the indirect PCM more correctly, but ironically, it caused rather a side-effect: PulseAudio gets pissed off when receiving an error from rewind, throws everything away and stops processing further, resulting in the silence. It's clearly a failure in the application side, so the best would be to fix that bug in PA. OTOH, PA is mostly the only user of the rewind feature, so it's not good to slap the sole customer. This patch tries to mitigate the situation: instead of returning an error, now the rewind ioctl returns zero when the driver can't rewind. It indicates that no rewind was performed, so the behavior is consistent, at least. Fixes: 9027c4639ef1 ("ALSA: pcm: Call ack() whenever appl_ptr is updated") Cc: Signed-off-by: Takashi Iwai --- sound/core/pcm_native.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index a4d92e46c459..f08772568c17 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -2580,7 +2580,7 @@ static snd_pcm_sframes_t forward_appl_ptr(struct snd_pcm_substream *substream, return ret < 0 ? ret : frames; } -/* decrease the appl_ptr; returns the processed frames or a negative error */ +/* decrease the appl_ptr; returns the processed frames or zero for error */ static snd_pcm_sframes_t rewind_appl_ptr(struct snd_pcm_substream *substream, snd_pcm_uframes_t frames, snd_pcm_sframes_t avail) @@ -2597,7 +2597,12 @@ static snd_pcm_sframes_t rewind_appl_ptr(struct snd_pcm_substream *substream, if (appl_ptr < 0) appl_ptr += runtime->boundary; ret = pcm_lib_apply_appl_ptr(substream, appl_ptr); - return ret < 0 ? ret : frames; + /* NOTE: we return zero for errors because PulseAudio gets depressed + * upon receiving an error from rewind ioctl and stops processing + * any longer. Returning zero means that no rewind is done, so + * it's not absolutely wrong to answer like that. + */ + return ret < 0 ? 0 : frames; } static snd_pcm_sframes_t snd_pcm_playback_rewind(struct snd_pcm_substream *substream, -- cgit v1.2.3-58-ga151 From 9685347aa0a5c2869058ca6ab79fd8e93084a67f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 5 Jan 2018 16:09:47 +0100 Subject: ALSA: aloop: Release cable upon open error path The aloop runtime object and its assignment in the cable are left even when opening a substream fails. This doesn't mean any memory leak, but it still keeps the invalid pointer that may be referred by the another side of the cable spontaneously, which is a potential Oops cause. Clean up the cable assignment and the empty cable upon the error path properly. Fixes: 597603d615d2 ("ALSA: introduce the snd-aloop module for the PCM loopback") Cc: Signed-off-by: Takashi Iwai --- sound/drivers/aloop.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c index afac886ffa28..8b6a39cb7f06 100644 --- a/sound/drivers/aloop.c +++ b/sound/drivers/aloop.c @@ -658,12 +658,31 @@ static int rule_channels(struct snd_pcm_hw_params *params, return snd_interval_refine(hw_param_interval(params, rule->var), &t); } +static void free_cable(struct snd_pcm_substream *substream) +{ + struct loopback *loopback = substream->private_data; + int dev = get_cable_index(substream); + struct loopback_cable *cable; + + cable = loopback->cables[substream->number][dev]; + if (!cable) + return; + if (cable->streams[!substream->stream]) { + /* other stream is still alive */ + cable->streams[substream->stream] = NULL; + } else { + /* free the cable */ + loopback->cables[substream->number][dev] = NULL; + kfree(cable); + } +} + static int loopback_open(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct loopback *loopback = substream->private_data; struct loopback_pcm *dpcm; - struct loopback_cable *cable; + struct loopback_cable *cable = NULL; int err = 0; int dev = get_cable_index(substream); @@ -681,7 +700,6 @@ static int loopback_open(struct snd_pcm_substream *substream) if (!cable) { cable = kzalloc(sizeof(*cable), GFP_KERNEL); if (!cable) { - kfree(dpcm); err = -ENOMEM; goto unlock; } @@ -723,6 +741,10 @@ static int loopback_open(struct snd_pcm_substream *substream) else runtime->hw = cable->hw; unlock: + if (err < 0) { + free_cable(substream); + kfree(dpcm); + } mutex_unlock(&loopback->cable_lock); return err; } @@ -731,20 +753,10 @@ static int loopback_close(struct snd_pcm_substream *substream) { struct loopback *loopback = substream->private_data; struct loopback_pcm *dpcm = substream->runtime->private_data; - struct loopback_cable *cable; - int dev = get_cable_index(substream); loopback_timer_stop(dpcm); mutex_lock(&loopback->cable_lock); - cable = loopback->cables[substream->number][dev]; - if (cable->streams[!substream->stream]) { - /* other stream is still alive */ - cable->streams[substream->stream] = NULL; - } else { - /* free the cable */ - loopback->cables[substream->number][dev] = NULL; - kfree(cable); - } + free_cable(substream); mutex_unlock(&loopback->cable_lock); return 0; } -- cgit v1.2.3-58-ga151 From b088b53e20c7d09b5ab84c5688e609f478e5c417 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 5 Jan 2018 16:15:33 +0100 Subject: ALSA: aloop: Fix inconsistent format due to incomplete rule The extra hw constraint rule for the formats the aloop driver introduced has a slight flaw, where it doesn't return a positive value when the mask got changed. It came from the fact that it's basically a copy&paste from snd_hw_constraint_mask64(). The original code is supposed to be a single-shot and it modifies the mask bits only once and never after, while what we need for aloop is the dynamic hw rule that limits the mask bits. This difference results in the inconsistent state, as the hw_refine doesn't apply the dependencies fully. The worse and surprisingly result is that it causes a crash in OSS emulation when multiple full-duplex reads/writes are performed concurrently (I leave why it triggers Oops to readers as a homework). For fixing this, replace a few open-codes with the standard snd_mask_*() macros. Reported-by: syzbot+3902b5220e8ca27889ca@syzkaller.appspotmail.com Fixes: b1c73fc8e697 ("ALSA: snd-aloop: Fix hw_params restrictions and checking") Cc: Signed-off-by: Takashi Iwai --- sound/drivers/aloop.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c index 8b6a39cb7f06..006521db487d 100644 --- a/sound/drivers/aloop.c +++ b/sound/drivers/aloop.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -622,14 +623,12 @@ static int rule_format(struct snd_pcm_hw_params *params, { struct snd_pcm_hardware *hw = rule->private; - struct snd_mask *maskp = hw_param_mask(params, rule->var); + struct snd_mask m; - maskp->bits[0] &= (u_int32_t)hw->formats; - maskp->bits[1] &= (u_int32_t)(hw->formats >> 32); - memset(maskp->bits + 2, 0, (SNDRV_MASK_MAX-64) / 8); /* clear rest */ - if (! maskp->bits[0] && ! maskp->bits[1]) - return -EINVAL; - return 0; + snd_mask_none(&m); + m.bits[0] = (u_int32_t)hw->formats; + m.bits[1] = (u_int32_t)(hw->formats >> 32); + return snd_mask_refine(hw_param_mask(params, rule->var), &m); } static int rule_rate(struct snd_pcm_hw_params *params, -- cgit v1.2.3-58-ga151 From 898dfe4687f460ba337a01c11549f87269a13fa2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 4 Jan 2018 17:38:54 +0100 Subject: ALSA: aloop: Fix racy hw constraints adjustment The aloop driver tries to update the hw constraints of the connected target on the cable of the opened PCM substream. This is done by adding the extra hw constraints rules referring to the substream runtime->hw fields, while the other substream may update the runtime hw of another side on the fly. This is, however, racy and may result in the inconsistent values when both PCM streams perform the prepare concurrently. One of the reason is that it overwrites the other's runtime->hw field; which is not only racy but also broken when it's called before the open of another side finishes. And, since the reference to runtime->hw isn't protected, the concurrent write may give the partial value update and become inconsistent. This patch is an attempt to fix and clean up: - The prepare doesn't change the runtime->hw of other side any longer, but only update the cable->hw that is referred commonly. - The extra rules refer to the loopback_pcm object instead of the runtime->hw. The actual hw is deduced from cable->hw. - The extra rules take the cable_lock to protect against the race. Fixes: b1c73fc8e697 ("ALSA: snd-aloop: Fix hw_params restrictions and checking") Cc: Signed-off-by: Takashi Iwai --- sound/drivers/aloop.c | 51 +++++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/sound/drivers/aloop.c b/sound/drivers/aloop.c index 006521db487d..0333143a1fa7 100644 --- a/sound/drivers/aloop.c +++ b/sound/drivers/aloop.c @@ -306,19 +306,6 @@ static int loopback_trigger(struct snd_pcm_substream *substream, int cmd) return 0; } -static void params_change_substream(struct loopback_pcm *dpcm, - struct snd_pcm_runtime *runtime) -{ - struct snd_pcm_runtime *dst_runtime; - - if (dpcm == NULL || dpcm->substream == NULL) - return; - dst_runtime = dpcm->substream->runtime; - if (dst_runtime == NULL) - return; - dst_runtime->hw = dpcm->cable->hw; -} - static void params_change(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; @@ -330,10 +317,6 @@ static void params_change(struct snd_pcm_substream *substream) cable->hw.rate_max = runtime->rate; cable->hw.channels_min = runtime->channels; cable->hw.channels_max = runtime->channels; - params_change_substream(cable->streams[SNDRV_PCM_STREAM_PLAYBACK], - runtime); - params_change_substream(cable->streams[SNDRV_PCM_STREAM_CAPTURE], - runtime); } static int loopback_prepare(struct snd_pcm_substream *substream) @@ -621,24 +604,29 @@ static unsigned int get_cable_index(struct snd_pcm_substream *substream) static int rule_format(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { - - struct snd_pcm_hardware *hw = rule->private; + struct loopback_pcm *dpcm = rule->private; + struct loopback_cable *cable = dpcm->cable; struct snd_mask m; snd_mask_none(&m); - m.bits[0] = (u_int32_t)hw->formats; - m.bits[1] = (u_int32_t)(hw->formats >> 32); + mutex_lock(&dpcm->loopback->cable_lock); + m.bits[0] = (u_int32_t)cable->hw.formats; + m.bits[1] = (u_int32_t)(cable->hw.formats >> 32); + mutex_unlock(&dpcm->loopback->cable_lock); return snd_mask_refine(hw_param_mask(params, rule->var), &m); } static int rule_rate(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { - struct snd_pcm_hardware *hw = rule->private; + struct loopback_pcm *dpcm = rule->private; + struct loopback_cable *cable = dpcm->cable; struct snd_interval t; - t.min = hw->rate_min; - t.max = hw->rate_max; + mutex_lock(&dpcm->loopback->cable_lock); + t.min = cable->hw.rate_min; + t.max = cable->hw.rate_max; + mutex_unlock(&dpcm->loopback->cable_lock); t.openmin = t.openmax = 0; t.integer = 0; return snd_interval_refine(hw_param_interval(params, rule->var), &t); @@ -647,11 +635,14 @@ static int rule_rate(struct snd_pcm_hw_params *params, static int rule_channels(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { - struct snd_pcm_hardware *hw = rule->private; + struct loopback_pcm *dpcm = rule->private; + struct loopback_cable *cable = dpcm->cable; struct snd_interval t; - t.min = hw->channels_min; - t.max = hw->channels_max; + mutex_lock(&dpcm->loopback->cable_lock); + t.min = cable->hw.channels_min; + t.max = cable->hw.channels_max; + mutex_unlock(&dpcm->loopback->cable_lock); t.openmin = t.openmax = 0; t.integer = 0; return snd_interval_refine(hw_param_interval(params, rule->var), &t); @@ -716,19 +707,19 @@ static int loopback_open(struct snd_pcm_substream *substream) /* are cached -> they do not reflect the actual state */ err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FORMAT, - rule_format, &runtime->hw, + rule_format, dpcm, SNDRV_PCM_HW_PARAM_FORMAT, -1); if (err < 0) goto unlock; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, - rule_rate, &runtime->hw, + rule_rate, dpcm, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) goto unlock; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, - rule_channels, &runtime->hw, + rule_channels, dpcm, SNDRV_PCM_HW_PARAM_CHANNELS, -1); if (err < 0) goto unlock; -- cgit v1.2.3-58-ga151 From 454be724f6f99cc7e7bbf15067128be9868186c6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 30 Nov 2017 07:56:35 +0800 Subject: block: drain queue before waiting for q_usage_counter becoming zero Now we track legacy requests with .q_usage_counter in commit 055f6e18e08f ("block: Make q_usage_counter also track legacy requests"), but that commit never runs and drains legacy queue before waiting for this counter becoming zero, then IO hang is caused in the test of pulling disk during IO. This patch fixes the issue by draining requests before waiting for q_usage_counter becoming zero, both Mauricio and chenxiang reported this issue, and observed that it can be fixed by this patch. Link: https://marc.info/?l=linux-block&m=151192424731797&w=2 Fixes: 055f6e18e08f("block: Make q_usage_counter also track legacy requests") Cc: Wen Xiong Tested-by: "chenxiang (M)" Tested-by: Mauricio Faria de Oliveira Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-core.c | 9 +++++++-- block/blk-mq.c | 2 ++ block/blk.h | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index b8881750a3ac..3ba4326a63b5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -562,6 +562,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all) } } +void blk_drain_queue(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + __blk_drain_queue(q, true); + spin_unlock_irq(q->queue_lock); +} + /** * blk_queue_bypass_start - enter queue bypass mode * @q: queue of interest @@ -689,8 +696,6 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); spin_lock_irq(lock); - if (!q->mq_ops) - __blk_drain_queue(q, true); queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); diff --git a/block/blk-mq.c b/block/blk-mq.c index 11097477eeab..3d3797327491 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -161,6 +161,8 @@ void blk_freeze_queue(struct request_queue *q) * exported to drivers as the only user for unfreeze is blk_mq. */ blk_freeze_queue_start(q); + if (!q->mq_ops) + blk_drain_queue(q); blk_mq_freeze_queue_wait(q); } diff --git a/block/blk.h b/block/blk.h index 3f1446937aec..442098aa9463 100644 --- a/block/blk.h +++ b/block/blk.h @@ -330,4 +330,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) } #endif /* CONFIG_BOUNCE */ +extern void blk_drain_queue(struct request_queue *q); + #endif /* BLK_INTERNAL_H */ -- cgit v1.2.3-58-ga151 From ae6650163c66a7eff1acd6eb8b0f752dcfa8eba5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Jan 2018 16:26:00 -0800 Subject: loop: fix concurrent lo_open/lo_release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 范龙飞 reports that KASAN can report a use-after-free in __lock_acquire. The reason is due to insufficient serialization in lo_release(), which will continue to use the loop device even after it has decremented the lo_refcnt to zero. In the meantime, another process can come in, open the loop device again as it is being shut down. Confusion ensues. Reported-by: 范龙飞 Signed-off-by: Linus Torvalds Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bc8e61506968..d5fe720cf149 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1581,9 +1581,8 @@ out: return err; } -static void lo_release(struct gendisk *disk, fmode_t mode) +static void __lo_release(struct loop_device *lo) { - struct loop_device *lo = disk->private_data; int err; if (atomic_dec_return(&lo->lo_refcnt)) @@ -1610,6 +1609,13 @@ static void lo_release(struct gendisk *disk, fmode_t mode) mutex_unlock(&lo->lo_ctl_mutex); } +static void lo_release(struct gendisk *disk, fmode_t mode) +{ + mutex_lock(&loop_index_mutex); + __lo_release(disk->private_data); + mutex_unlock(&loop_index_mutex); +} + static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, -- cgit v1.2.3-58-ga151 From 5731a879d03bdaa00265f8ebc32dfd0e65d25276 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 4 Jan 2018 20:02:09 -0800 Subject: bpf: sockmap missing NULL psock check Add psock NULL check to handle a racing sock event that can get the sk_callback_lock before this case but after xchg happens causing the refcnt to hit zero and sock user data (psock) to be null and queued for garbage collection. Also add a comment in the code because this is a bit subtle and not obvious in my opinion. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41893d9..1712d319c2d8 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -591,8 +591,15 @@ static void sock_map_free(struct bpf_map *map) write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); - smap_list_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + } write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); -- cgit v1.2.3-58-ga151 From 2b36047e7889b7efee22c11e17f035f721855731 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 5 Jan 2018 15:02:00 -0800 Subject: selftests/bpf: fix test_align since commit 82abbf8d2fc4 the verifier rejects the bit-wise arithmetic on pointers earlier. The test 'dubious pointer arithmetic' now has less output to match on. Adjust it. Fixes: 82abbf8d2fc4 ("bpf: do not allow root to mangle valid pointers") Reported-by: kernel test robot Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_align.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/tools/testing/selftests/bpf/test_align.c b/tools/testing/selftests/bpf/test_align.c index 8591c89c0828..471bbbdb94db 100644 --- a/tools/testing/selftests/bpf/test_align.c +++ b/tools/testing/selftests/bpf/test_align.c @@ -474,27 +474,7 @@ static struct bpf_align_test tests[] = { .result = REJECT, .matches = { {4, "R5=pkt(id=0,off=0,r=0,imm=0)"}, - /* ptr & 0x40 == either 0 or 0x40 */ - {5, "R5=inv(id=0,umax_value=64,var_off=(0x0; 0x40))"}, - /* ptr << 2 == unknown, (4n) */ - {7, "R5=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"}, - /* (4n) + 14 == (4n+2). We blow our bounds, because - * the add could overflow. - */ - {8, "R5=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"}, - /* Checked s>=0 */ - {10, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - /* packet pointer + nonnegative (4n+2) */ - {12, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - {14, "R4=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, - /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. - * We checked the bounds, but it might have been able - * to overflow if the packet pointer started in the - * upper half of the address space. - * So we did not get a 'range' on R6, and the access - * attempt will fail. - */ - {16, "R6=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"}, + /* R5 bitwise operator &= on pointer prohibited */ } }, { -- cgit v1.2.3-58-ga151 From 33c57c0d3c67f51f491a9d27108f7e97adc03d96 Mon Sep 17 00:00:00 2001 From: Karsten Merker Date: Thu, 4 Jan 2018 23:37:02 +0100 Subject: RISC-V: Add a basic defconfig This patch provides a basic defconfig for the RISC-V architecture that enables enough kernel features to run a basic Linux distribution on qemu's "virt" board for native software development. Features include: - serial console - virtio block and network device support - VFAT and ext2/3/4 filesystem support - NFS client and NFS rootfs support - an assortment of other kernel features required for running systemd It also enables a number of drivers for physical hardware that target the "SiFive U500" SoC and the corresponding development platform. These include: - PCIe host controller support for the FPGA-based U500 development platform (PCIE_XILINX) - USB host controller support (OHCI/EHCI/XHCI) - USB HID (keyboard/mouse) support - USB mass storage support (bulk and UAS) - SATA support (AHCI) - ethernet drivers (MACB for a SoC-internal MAC block, microsemi ethernet phy, E1000E and R8169 for PCIe-connected external devices) - DRM and framebuffer console support for PCIe-connected Radeon graphics chips Signed-off-by: Karsten Merker Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 75 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index e69de29bb2d1..47dacf06c679 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -0,0 +1,75 @@ +CONFIG_SMP=y +CONFIG_PCI=y +CONFIG_PCIE_XILINX=y +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_EXPERT=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_BPF_SYSCALL=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NETLINK_DIAG=y +CONFIG_DEVTMPFS=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_NETDEVICES=y +CONFIG_VIRTIO_NET=y +CONFIG_MACB=y +CONFIG_E1000E=y +CONFIG_R8169=y +CONFIG_MICROSEMI_PHY=y +CONFIG_INPUT_MOUSEDEV=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +# CONFIG_PTP_1588_CLOCK is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_USB=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_XHCI_PLATFORM=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_OHCI_HCD_PLATFORM=y +CONFIG_USB_STORAGE=y +CONFIG_USB_UAS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_RAS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_AUTOFS4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_NFS_FS=y +CONFIG_NFS_V4=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_ROOT_NFS=y +# CONFIG_RCU_TRACE is not set +CONFIG_CRYPTO_USER_API_HASH=y -- cgit v1.2.3-58-ga151 From 9e49a4ed072ab67b17238c5a45d7cba7f848659e Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 26 Dec 2017 19:11:22 -0800 Subject: RISC-V: Make __NR_riscv_flush_icache visible to userspace We were hoping to avoid making this visible to userspace, but it looks like we're going to have to because QEMU's user-mode emulation doesn't want to emulate a vDSO. Having vDSO-only system calls was a bit unothodox anyway, so I think in this case it's OK to just make the actual system call number public. This patch simply moves the definition of __NR_riscv_flush_icache availiable to userspace, which results in the deletion of the now empty vdso-syscalls.h. Changes since v1: * I've moved the definition into uapi/asm/syscalls.h rathen than uapi/asm/unistd.h. This allows me to keep asm/unistd.h, so we can keep the syscall table macros sane. * As a side effect of the above, this no longer disables all system calls on RISC-V. Whoops! Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/unistd.h | 1 + arch/riscv/include/asm/vdso-syscalls.h | 28 ---------------------------- arch/riscv/include/uapi/asm/syscalls.h | 26 ++++++++++++++++++++++++++ arch/riscv/kernel/syscall_table.c | 1 - arch/riscv/kernel/vdso/flush_icache.S | 1 - 5 files changed, 27 insertions(+), 30 deletions(-) delete mode 100644 arch/riscv/include/asm/vdso-syscalls.h create mode 100644 arch/riscv/include/uapi/asm/syscalls.h diff --git a/arch/riscv/include/asm/unistd.h b/arch/riscv/include/asm/unistd.h index 9f250ed007cd..2f704a5c4196 100644 --- a/arch/riscv/include/asm/unistd.h +++ b/arch/riscv/include/asm/unistd.h @@ -14,3 +14,4 @@ #define __ARCH_HAVE_MMU #define __ARCH_WANT_SYS_CLONE #include +#include diff --git a/arch/riscv/include/asm/vdso-syscalls.h b/arch/riscv/include/asm/vdso-syscalls.h deleted file mode 100644 index a2ccf1894929..000000000000 --- a/arch/riscv/include/asm/vdso-syscalls.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (C) 2017 SiFive - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#ifndef _ASM_RISCV_VDSO_SYSCALLS_H -#define _ASM_RISCV_VDSO_SYSCALLS_H - -#ifdef CONFIG_SMP - -/* These syscalls are only used by the vDSO and are not in the uapi. */ -#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15) -__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache) - -#endif - -#endif /* _ASM_RISCV_VDSO_H */ diff --git a/arch/riscv/include/uapi/asm/syscalls.h b/arch/riscv/include/uapi/asm/syscalls.h new file mode 100644 index 000000000000..818655b0d535 --- /dev/null +++ b/arch/riscv/include/uapi/asm/syscalls.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2017 SiFive + */ + +#ifndef _ASM__UAPI__SYSCALLS_H +#define _ASM__UAPI__SYSCALLS_H + +/* + * Allows the instruction cache to be flushed from userspace. Despite RISC-V + * having a direct 'fence.i' instruction available to userspace (which we + * can't trap!), that's not actually viable when running on Linux because the + * kernel might schedule a process on another hart. There is no way for + * userspace to handle this without invoking the kernel (as it doesn't know the + * thread->hart mappings), so we've defined a RISC-V specific system call to + * flush the instruction cache. + * + * __NR_riscv_flush_icache is defined to flush the instruction cache over an + * address range, with the flush applying to either all threads or just the + * caller. We don't currently do anything with the address range, that's just + * in there for forwards compatibility. + */ +#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15) +__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache) + +#endif diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index a5bd6401f95e..ade52b903a43 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -23,5 +23,4 @@ void *sys_call_table[__NR_syscalls] = { [0 ... __NR_syscalls - 1] = sys_ni_syscall, #include -#include }; diff --git a/arch/riscv/kernel/vdso/flush_icache.S b/arch/riscv/kernel/vdso/flush_icache.S index b0fbad74e873..023e4d4aef58 100644 --- a/arch/riscv/kernel/vdso/flush_icache.S +++ b/arch/riscv/kernel/vdso/flush_icache.S @@ -13,7 +13,6 @@ #include #include -#include .text /* int __vdso_flush_icache(void *start, void *end, unsigned long flags); */ -- cgit v1.2.3-58-ga151 From c163fb38ca34694b0cce99bb5604257bc29bf200 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jan 2018 18:35:02 +0100 Subject: riscv: remove CONFIG_MMU ifdefs The RISC-V port doesn't suport a nommu mode, so there is no reason to provide some code only under a CONFIG_MMU ifdef. Signed-off-by: Christoph Hellwig Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/io.h | 4 ---- arch/riscv/include/asm/pgtable.h | 4 ---- arch/riscv/include/asm/tlbflush.h | 4 ---- arch/riscv/include/asm/uaccess.h | 12 ------------ 4 files changed, 24 deletions(-) diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index a82ce599b639..b269451e7e85 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -21,8 +21,6 @@ #include -#ifdef CONFIG_MMU - extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); /* @@ -36,8 +34,6 @@ extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); extern void iounmap(volatile void __iomem *addr); -#endif /* CONFIG_MMU */ - /* Generic IO read/write. These perform native-endian accesses. */ #define __raw_writeb __raw_writeb static inline void __raw_writeb(u8 val, volatile void __iomem *addr) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 2cbd92ed1629..16301966d65b 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -20,8 +20,6 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_MMU - /* Page Upper Directory not used in RISC-V */ #include #include @@ -413,8 +411,6 @@ static inline void pgtable_cache_init(void) /* No page table caches to initialize */ } -#endif /* CONFIG_MMU */ - #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index 715b0f10af58..7b9c24ebdf52 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -15,8 +15,6 @@ #ifndef _ASM_RISCV_TLBFLUSH_H #define _ASM_RISCV_TLBFLUSH_H -#ifdef CONFIG_MMU - #include /* @@ -64,6 +62,4 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } -#endif /* CONFIG_MMU */ - #endif /* _ASM_RISCV_TLBFLUSH_H */ diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 27b90d64814b..14b0b22fb578 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -127,7 +127,6 @@ extern int fixup_exception(struct pt_regs *state); * call. */ -#ifdef CONFIG_MMU #define __get_user_asm(insn, x, ptr, err) \ do { \ uintptr_t __tmp; \ @@ -153,13 +152,11 @@ do { \ __disable_user_access(); \ (x) = __x; \ } while (0) -#endif /* CONFIG_MMU */ #ifdef CONFIG_64BIT #define __get_user_8(x, ptr, err) \ __get_user_asm("ld", x, ptr, err) #else /* !CONFIG_64BIT */ -#ifdef CONFIG_MMU #define __get_user_8(x, ptr, err) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ @@ -193,7 +190,6 @@ do { \ (x) = (__typeof__(x))((__typeof__((x)-(x)))( \ (((u64)__hi << 32) | __lo))); \ } while (0) -#endif /* CONFIG_MMU */ #endif /* CONFIG_64BIT */ @@ -267,8 +263,6 @@ do { \ ((x) = 0, -EFAULT); \ }) - -#ifdef CONFIG_MMU #define __put_user_asm(insn, x, ptr, err) \ do { \ uintptr_t __tmp; \ @@ -292,14 +286,11 @@ do { \ : "rJ" (__x), "i" (-EFAULT)); \ __disable_user_access(); \ } while (0) -#endif /* CONFIG_MMU */ - #ifdef CONFIG_64BIT #define __put_user_8(x, ptr, err) \ __put_user_asm("sd", x, ptr, err) #else /* !CONFIG_64BIT */ -#ifdef CONFIG_MMU #define __put_user_8(x, ptr, err) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ @@ -329,7 +320,6 @@ do { \ : "rJ" (__x), "rJ" (__x >> 32), "i" (-EFAULT)); \ __disable_user_access(); \ } while (0) -#endif /* CONFIG_MMU */ #endif /* CONFIG_64BIT */ @@ -438,7 +428,6 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) * will set "err" to -EFAULT, while successful accesses return the previous * value. */ -#ifdef CONFIG_MMU #define __cmpxchg_user(ptr, old, new, err, size, lrb, scb) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ @@ -508,6 +497,5 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) (err) = __err; \ __ret; \ }) -#endif /* CONFIG_MMU */ #endif /* _ASM_RISCV_UACCESS_H */ -- cgit v1.2.3-58-ga151 From 1125203c13b9da32125e171b4bd75e93d4918ddd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Jan 2018 18:35:03 +0100 Subject: riscv: rename SR_* constants to match the spec Signed-off-by: Christoph Hellwig Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/csr.h | 8 ++++---- arch/riscv/include/asm/irqflags.h | 10 +++++----- arch/riscv/include/asm/ptrace.h | 2 +- arch/riscv/kernel/entry.S | 8 ++++---- arch/riscv/kernel/process.c | 4 ++-- arch/riscv/mm/fault.c | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index 0d64bc9f4f91..3c7a2c97e377 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -17,10 +17,10 @@ #include /* Status register flags */ -#define SR_IE _AC(0x00000002, UL) /* Interrupt Enable */ -#define SR_PIE _AC(0x00000020, UL) /* Previous IE */ -#define SR_PS _AC(0x00000100, UL) /* Previously Supervisor */ -#define SR_SUM _AC(0x00040000, UL) /* Supervisor may access User Memory */ +#define SR_SIE _AC(0x00000002, UL) /* Supervisor Interrupt Enable */ +#define SR_SPIE _AC(0x00000020, UL) /* Previous Supervisor IE */ +#define SR_SPP _AC(0x00000100, UL) /* Previously Supervisor */ +#define SR_SUM _AC(0x00040000, UL) /* Supervisor may access User Memory */ #define SR_FS _AC(0x00006000, UL) /* Floating-point Status */ #define SR_FS_OFF _AC(0x00000000, UL) diff --git a/arch/riscv/include/asm/irqflags.h b/arch/riscv/include/asm/irqflags.h index 6fdc860d7f84..07a3c6d5706f 100644 --- a/arch/riscv/include/asm/irqflags.h +++ b/arch/riscv/include/asm/irqflags.h @@ -27,25 +27,25 @@ static inline unsigned long arch_local_save_flags(void) /* unconditionally enable interrupts */ static inline void arch_local_irq_enable(void) { - csr_set(sstatus, SR_IE); + csr_set(sstatus, SR_SIE); } /* unconditionally disable interrupts */ static inline void arch_local_irq_disable(void) { - csr_clear(sstatus, SR_IE); + csr_clear(sstatus, SR_SIE); } /* get status and disable interrupts */ static inline unsigned long arch_local_irq_save(void) { - return csr_read_clear(sstatus, SR_IE); + return csr_read_clear(sstatus, SR_SIE); } /* test flags */ static inline int arch_irqs_disabled_flags(unsigned long flags) { - return !(flags & SR_IE); + return !(flags & SR_SIE); } /* test hardware interrupt enable bit */ @@ -57,7 +57,7 @@ static inline int arch_irqs_disabled(void) /* set interrupt enabled status */ static inline void arch_local_irq_restore(unsigned long flags) { - csr_set(sstatus, flags & SR_IE); + csr_set(sstatus, flags & SR_SIE); } #endif /* _ASM_RISCV_IRQFLAGS_H */ diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index 93b8956e25e4..2c5df945d43c 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -66,7 +66,7 @@ struct pt_regs { #define REG_FMT "%08lx" #endif -#define user_mode(regs) (((regs)->sstatus & SR_PS) == 0) +#define user_mode(regs) (((regs)->sstatus & SR_SPP) == 0) /* Helpers for working with the instruction pointer */ diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 20ee86f782a9..7404ec222406 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -196,7 +196,7 @@ handle_syscall: addi s2, s2, 0x4 REG_S s2, PT_SEPC(sp) /* System calls run with interrupts enabled */ - csrs sstatus, SR_IE + csrs sstatus, SR_SIE /* Trace syscalls, but only if requested by the user. */ REG_L t0, TASK_TI_FLAGS(tp) andi t0, t0, _TIF_SYSCALL_TRACE @@ -224,8 +224,8 @@ ret_from_syscall: ret_from_exception: REG_L s0, PT_SSTATUS(sp) - csrc sstatus, SR_IE - andi s0, s0, SR_PS + csrc sstatus, SR_SIE + andi s0, s0, SR_SPP bnez s0, restore_all resume_userspace: @@ -255,7 +255,7 @@ work_pending: bnez s1, work_resched work_notifysig: /* Handle pending signals and notify-resume requests */ - csrs sstatus, SR_IE /* Enable interrupts for do_notify_resume() */ + csrs sstatus, SR_SIE /* Enable interrupts for do_notify_resume() */ move a0, sp /* pt_regs */ move a1, s0 /* current_thread_info->flags */ tail do_notify_resume diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 0d90dcc1fbd3..d74d4adf2d54 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -76,7 +76,7 @@ void show_regs(struct pt_regs *regs) void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp) { - regs->sstatus = SR_PIE /* User mode, irqs on */ | SR_FS_INITIAL; + regs->sstatus = SR_SPIE /* User mode, irqs on */ | SR_FS_INITIAL; regs->sepc = pc; regs->sp = sp; set_fs(USER_DS); @@ -110,7 +110,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, const register unsigned long gp __asm__ ("gp"); memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp; - childregs->sstatus = SR_PS | SR_PIE; /* Supervisor, irqs on */ + childregs->sstatus = SR_SPP | SR_SPIE; /* Supervisor, irqs on */ p->thread.ra = (unsigned long)ret_from_kernel_thread; p->thread.s[0] = usp; /* fn */ diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index df2ca3c65048..0713f3c67ab4 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -63,7 +63,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs) goto vmalloc_fault; /* Enable interrupts if they were enabled in the parent context. */ - if (likely(regs->sstatus & SR_PIE)) + if (likely(regs->sstatus & SR_SPIE)) local_irq_enable(); /* -- cgit v1.2.3-58-ga151 From 29159a4ed7044c52e3e2cf1a9fb55cec4745c60b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 8 Jan 2018 13:58:31 +0100 Subject: ALSA: pcm: Abort properly at pending signal in OSS read/write loops The loops for read and write in PCM OSS emulation have no proper check of pending signals, and they keep processing even after user tries to break. This results in a very long delay, often seen as RCU stall when a huge unprocessed bytes remain queued. The bug could be easily triggered by syzkaller. As a simple workaround, this patch adds the proper check of pending signals and aborts the loop appropriately. Reported-by: syzbot+993cb4cfcbbff3947c21@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai --- sound/core/oss/pcm_oss.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index ceaa51f76591..e317964bd2ea 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1381,6 +1381,10 @@ static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const cha tmp != runtime->oss.period_bytes) break; } + if (signal_pending(current)) { + tmp = -ERESTARTSYS; + goto err; + } } mutex_unlock(&runtime->oss.params_lock); return xfer; @@ -1466,6 +1470,10 @@ static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __use bytes -= tmp; xfer += tmp; } + if (signal_pending(current)) { + tmp = -ERESTARTSYS; + goto err; + } } mutex_unlock(&runtime->oss.params_lock); return xfer; -- cgit v1.2.3-58-ga151 From 900498a34a3ac9c611e9b425094c8106bdd7dc1c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 8 Jan 2018 14:03:53 +0100 Subject: ALSA: pcm: Allow aborting mutex lock at OSS read/write loops PCM OSS read/write loops keep taking the mutex lock for the whole read/write, and this might take very long when the exceptionally high amount of data is given. Also, since it invokes with mutex_lock(), the concurrent read/write becomes unbreakable. This patch tries to address these issues by replacing mutex_lock() with mutex_lock_interruptible(), and also splits / re-takes the lock at each read/write period chunk, so that it can switch the context more finely if requested. Cc: Signed-off-by: Takashi Iwai --- sound/core/oss/pcm_oss.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index e317964bd2ea..c2db7e905f7d 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1334,8 +1334,11 @@ static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const cha if ((tmp = snd_pcm_oss_make_ready(substream)) < 0) return tmp; - mutex_lock(&runtime->oss.params_lock); while (bytes > 0) { + if (mutex_lock_interruptible(&runtime->oss.params_lock)) { + tmp = -ERESTARTSYS; + break; + } if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) { tmp = bytes; if (tmp + runtime->oss.buffer_used > runtime->oss.period_bytes) @@ -1379,18 +1382,18 @@ static ssize_t snd_pcm_oss_write1(struct snd_pcm_substream *substream, const cha xfer += tmp; if ((substream->f_flags & O_NONBLOCK) != 0 && tmp != runtime->oss.period_bytes) - break; + tmp = -EAGAIN; } + err: + mutex_unlock(&runtime->oss.params_lock); + if (tmp < 0) + break; if (signal_pending(current)) { tmp = -ERESTARTSYS; - goto err; + break; } + tmp = 0; } - mutex_unlock(&runtime->oss.params_lock); - return xfer; - - err: - mutex_unlock(&runtime->oss.params_lock); return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp; } @@ -1438,8 +1441,11 @@ static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __use if ((tmp = snd_pcm_oss_make_ready(substream)) < 0) return tmp; - mutex_lock(&runtime->oss.params_lock); while (bytes > 0) { + if (mutex_lock_interruptible(&runtime->oss.params_lock)) { + tmp = -ERESTARTSYS; + break; + } if (bytes < runtime->oss.period_bytes || runtime->oss.buffer_used > 0) { if (runtime->oss.buffer_used == 0) { tmp = snd_pcm_oss_read2(substream, runtime->oss.buffer, runtime->oss.period_bytes, 1); @@ -1470,16 +1476,16 @@ static ssize_t snd_pcm_oss_read1(struct snd_pcm_substream *substream, char __use bytes -= tmp; xfer += tmp; } + err: + mutex_unlock(&runtime->oss.params_lock); + if (tmp < 0) + break; if (signal_pending(current)) { tmp = -ERESTARTSYS; - goto err; + break; } + tmp = 0; } - mutex_unlock(&runtime->oss.params_lock); - return xfer; - - err: - mutex_unlock(&runtime->oss.params_lock); return xfer > 0 ? (snd_pcm_sframes_t)xfer : tmp; } -- cgit v1.2.3-58-ga151 From b2157399cc9898260d6031c5bfe45fe137c1fbe7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 7 Jan 2018 17:33:02 -0800 Subject: bpf: prevent out-of-bounds speculation Under speculation, CPUs may mis-predict branches in bounds checks. Thus, memory accesses under a bounds check may be speculated even if the bounds check fails, providing a primitive for building a side channel. To avoid leaking kernel data round up array-based maps and mask the index after bounds check, so speculated load with out of bounds index will load either valid value from the array or zero from the padded area. Unconditionally mask index for all array types even when max_entries are not rounded to power of 2 for root user. When map is created by unpriv user generate a sequence of bpf insns that includes AND operation to make sure that JITed code includes the same 'index & index_mask' operation. If prog_array map is created by unpriv user replace bpf_tail_call(ctx, map, index); with if (index >= max_entries) { index &= map->index_mask; bpf_tail_call(ctx, map, index); } (along with roundup to power 2) to prevent out-of-bounds speculation. There is secondary redundant 'if (index >= max_entries)' in the interpreter and in all JITs, but they can be optimized later if necessary. Other array-like maps (cpumap, devmap, sockmap, perf_event_array, cgroup_array) cannot be used by unpriv, so no changes there. That fixes bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on all architectures with and without JIT. v2->v3: Daniel noticed that attack potentially can be crafted via syscall commands without loading the program, so add masking to those paths as well. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 47 ++++++++++++++++++++++++++++++++++++----------- kernel/bpf/verifier.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e55e4255a210..1b985ca4ffbe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -52,6 +52,7 @@ struct bpf_map { u32 pages; u32 id; int numa_node; + bool unpriv_array; struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; @@ -221,6 +222,7 @@ struct bpf_prog_aux { struct bpf_array { struct bpf_map map; u32 elem_size; + u32 index_mask; /* 'ownership' of prog_array is claimed by the first program that * is going to use this map or by the first program which FD is stored * in the map to make sure that all callers and callees have the same diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426d3cf5..aaa319848e7d 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -53,9 +53,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; u64 array_size; - u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -72,11 +73,20 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + index_mask = roundup_pow_of_two(max_entries) - 1; + + if (unpriv) + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + array_size = sizeof(*array); if (percpu) - array_size += (u64) attr->max_entries * sizeof(void *); + array_size += (u64) max_entries * sizeof(void *); else - array_size += (u64) attr->max_entries * elem_size; + array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,6 +96,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.map_type = attr->map_type; @@ -121,12 +133,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; @@ -135,7 +148,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); @@ -157,7 +175,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +195,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); off += size; @@ -225,10 +243,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) - memcpy(this_cpu_ptr(array->pptrs[index]), + memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + array->elem_size * index, + memcpy(array->value + + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } @@ -262,7 +281,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); off += size; @@ -613,6 +632,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -621,7 +641,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04b24876cd23..b414d6b2d470 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1729,6 +1729,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose(env, "verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -4456,6 +4463,35 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON) { + verbose(env, "tail_call obusing map_ptr\n"); + return -EINVAL; + } + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } -- cgit v1.2.3-58-ga151 From b8fd0823e0770c2d5fdbd865bccf0d5e058e5287 Mon Sep 17 00:00:00 2001 From: Andrii Vladyka Date: Thu, 4 Jan 2018 13:09:17 +0200 Subject: net: core: fix module type in sock_diag_bind Use AF_INET6 instead of AF_INET in IPv6-related code path Signed-off-by: Andrii Vladyka Signed-off-by: David S. Miller --- net/core/sock_diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 217f4e3b82f6..146b50e30659 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -288,7 +288,7 @@ static int sock_diag_bind(struct net *net, int group) case SKNLGRP_INET6_UDP_DESTROY: if (!sock_diag_handlers[AF_INET6]) request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, - NETLINK_SOCK_DIAG, AF_INET); + NETLINK_SOCK_DIAG, AF_INET6); break; } return 0; -- cgit v1.2.3-58-ga151 From edd8ca8015800b354453b891d38960f3a474b7e4 Mon Sep 17 00:00:00 2001 From: Florian Margaine Date: Wed, 13 Dec 2017 16:43:59 +0100 Subject: rbd: reacquire lock should update lock owner client id Otherwise, future operations on this RBD using exclusive-lock are going to require the lock from a non-existent client id. Cc: stable@vger.kernel.org Fixes: 14bb211d324d ("rbd: support updating the lock cookie without releasing the lock") Link: http://tracker.ceph.com/issues/19929 Signed-off-by: Florian Margaine [idryomov@gmail.com: rbd_set_owner_cid() call, __rbd_lock() helper] Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 38fc5f397fde..aacae6f7163e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3047,13 +3047,21 @@ static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) mutex_unlock(&rbd_dev->watch_mutex); } +static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie) +{ + struct rbd_client_id cid = rbd_get_cid(rbd_dev); + + strcpy(rbd_dev->lock_cookie, cookie); + rbd_set_owner_cid(rbd_dev, &cid); + queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); +} + /* * lock_rwsem must be held for write */ static int rbd_lock(struct rbd_device *rbd_dev) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; - struct rbd_client_id cid = rbd_get_cid(rbd_dev); char cookie[32]; int ret; @@ -3068,9 +3076,7 @@ static int rbd_lock(struct rbd_device *rbd_dev) return ret; rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; - strcpy(rbd_dev->lock_cookie, cookie); - rbd_set_owner_cid(rbd_dev, &cid); - queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); + __rbd_lock(rbd_dev, cookie); return 0; } @@ -3856,7 +3862,7 @@ static void rbd_reacquire_lock(struct rbd_device *rbd_dev) queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); } else { - strcpy(rbd_dev->lock_cookie, cookie); + __rbd_lock(rbd_dev, cookie); } } -- cgit v1.2.3-58-ga151 From 21acdf45f4958135940f0b4767185cf911d4b010 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 21 Dec 2017 15:35:11 +0100 Subject: rbd: set max_segments to USHRT_MAX Commit d3834fefcfe5 ("rbd: bump queue_max_segments") bumped max_segments (unsigned short) to max_hw_sectors (unsigned int). max_hw_sectors is set to the number of 512-byte sectors in an object and overflows unsigned short for 32M (largest possible) objects, making the block layer resort to handing us single segment (i.e. single page or even smaller) bios in that case. Cc: stable@vger.kernel.org Fixes: d3834fefcfe5 ("rbd: bump queue_max_segments") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index aacae6f7163e..cc93522a6d41 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4387,7 +4387,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) segment_size = rbd_obj_bytes(&rbd_dev->header); blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); q->limits.max_sectors = queue_max_hw_sectors(q); - blk_queue_max_segments(q, segment_size / SECTOR_SIZE); + blk_queue_max_segments(q, USHRT_MAX); blk_queue_max_segment_size(q, segment_size); blk_queue_io_min(q, segment_size); blk_queue_io_opt(q, segment_size); -- cgit v1.2.3-58-ga151 From 3dc2fa47549aca71773afdd12a78d31802bb22b4 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Mon, 8 Jan 2018 19:43:00 +0800 Subject: net: caif: use strlcpy() instead of strncpy() gcc-8 reports net/caif/caif_dev.c: In function 'caif_enroll_dev': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] net/caif/cfctrl.c: In function 'cfctrl_linkup_request': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] net/caif/cfcnfg.c: In function 'caif_connect_client': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] The compiler require that the input param 'len' of strncpy() should be greater than the length of the src string, so that '\0' is copied as well. We can just use strlcpy() to avoid this warning. Signed-off-by: Xiongfeng Wang Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 5 ++--- net/caif/cfcnfg.c | 10 ++++------ net/caif/cfctrl.c | 4 ++-- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 2d38b6e34203..e0adcd123f48 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -334,9 +334,8 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, mutex_lock(&caifdevs->lock); list_add_rcu(&caifd->list, &caifdevs->list); - strncpy(caifd->layer.name, dev->name, - sizeof(caifd->layer.name) - 1); - caifd->layer.name[sizeof(caifd->layer.name) - 1] = 0; + strlcpy(caifd->layer.name, dev->name, + sizeof(caifd->layer.name)); caifd->layer.transmit = transmit; cfcnfg_add_phy_layer(cfg, dev, diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 273cb07f57d8..8f00bea093b9 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -268,17 +268,15 @@ static int caif_connect_req_to_link_param(struct cfcnfg *cnfg, case CAIFPROTO_RFM: l->linktype = CFCTRL_SRV_RFM; l->u.datagram.connid = s->sockaddr.u.rfm.connection_id; - strncpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, - sizeof(l->u.rfm.volume)-1); - l->u.rfm.volume[sizeof(l->u.rfm.volume)-1] = 0; + strlcpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume, + sizeof(l->u.rfm.volume)); break; case CAIFPROTO_UTIL: l->linktype = CFCTRL_SRV_UTIL; l->endpoint = 0x00; l->chtype = 0x00; - strncpy(l->u.utility.name, s->sockaddr.u.util.service, - sizeof(l->u.utility.name)-1); - l->u.utility.name[sizeof(l->u.utility.name)-1] = 0; + strlcpy(l->u.utility.name, s->sockaddr.u.util.service, + sizeof(l->u.utility.name)); caif_assert(sizeof(l->u.utility.name) > 10); l->u.utility.paramlen = s->param.size; if (l->u.utility.paramlen > sizeof(l->u.utility.params)) diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index f5afda1abc76..655ed7032150 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -258,8 +258,8 @@ int cfctrl_linkup_request(struct cflayer *layer, tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs); cfpkt_add_body(pkt, &tmp16, 2); memset(utility_name, 0, sizeof(utility_name)); - strncpy(utility_name, param->u.utility.name, - UTILITY_NAME_LENGTH - 1); + strlcpy(utility_name, param->u.utility.name, + UTILITY_NAME_LENGTH); cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH); tmp8 = param->u.utility.paramlen; cfpkt_add_body(pkt, &tmp8, 1); -- cgit v1.2.3-58-ga151 From 20b50d79974ea3192e8c3ab7faf4e536e5f14d8f Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 8 Jan 2018 15:54:44 +0100 Subject: net: ipv4: emulate READ_ONCE() on ->hdrincl bit-field in raw_sendmsg() Commit 8f659a03a0ba ("net: ipv4: fix for a race condition in raw_sendmsg") fixed the issue of possibly inconsistent ->hdrincl handling due to concurrent updates by reading this bit-field member into a local variable and using the thus stabilized value in subsequent tests. However, aforementioned commit also adds the (correct) comment that /* hdrincl should be READ_ONCE(inet->hdrincl) * but READ_ONCE() doesn't work with bit fields */ because as it stands, the compiler is free to shortcut or even eliminate the local variable at its will. Note that I have not seen anything like this happening in reality and thus, the concern is a theoretical one. However, in order to be on the safe side, emulate a READ_ONCE() on the bit-field by doing it on the local 'hdrincl' variable itself: int hdrincl = inet->hdrincl; hdrincl = READ_ONCE(hdrincl); This breaks the chain in the sense that the compiler is not allowed to replace subsequent reads from hdrincl with reloads from inet->hdrincl. Fixes: 8f659a03a0ba ("net: ipv4: fix for a race condition in raw_sendmsg") Signed-off-by: Nicolai Stange Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/raw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 125c1eab3eaa..5e570aa9e43b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -520,9 +520,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; /* hdrincl should be READ_ONCE(inet->hdrincl) - * but READ_ONCE() doesn't work with bit fields + * but READ_ONCE() doesn't work with bit fields. + * Doing this indirectly yields the same result. */ hdrincl = inet->hdrincl; + hdrincl = READ_ONCE(hdrincl); /* * Check the flags. */ -- cgit v1.2.3-58-ga151 From 2fdd18118dad86bf5e7880d8d02ea27be23e3671 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jan 2018 08:50:17 +0200 Subject: docs-rst: networking: wire up msg_zerocopy Fix the following 'make htmldocs' complaint: Documentation/networking/msg_zerocopy.rst:: WARNING: document isn't included in any toctree. Signed-off-by: Mike Rapoport Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 66e620866245..7d4b15977d61 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -9,6 +9,7 @@ Contents: batman-adv kapi z8530book + msg_zerocopy .. only:: subproject @@ -16,4 +17,3 @@ Contents: ======= * :ref:`genindex` - -- cgit v1.2.3-58-ga151 From 195e2addbce09e5afbc766efc1e6567c9ce840d3 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 6 Jan 2018 21:53:26 +0300 Subject: SolutionEngine771x: fix Ether platform data The 'sh_eth' driver's probe() method would fail on the SolutionEngine7710 board and crash on SolutionEngine7712 board as the platform code is hopelessly behind the driver's platform data -- it passes the PHY address instead of 'struct sh_eth_plat_data *'; pass the latter to the driver in order to fix the bug... Fixes: 71557a37adb5 ("[netdrvr] sh_eth: Add SH7619 support") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- arch/sh/boards/mach-se/770x/setup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c index 77c35350ee77..b7fa7a87e946 100644 --- a/arch/sh/boards/mach-se/770x/setup.c +++ b/arch/sh/boards/mach-se/770x/setup.c @@ -9,6 +9,7 @@ */ #include #include +#include #include #include #include @@ -115,6 +116,11 @@ static struct platform_device heartbeat_device = { #if defined(CONFIG_CPU_SUBTYPE_SH7710) ||\ defined(CONFIG_CPU_SUBTYPE_SH7712) /* SH771X Ethernet driver */ +static struct sh_eth_plat_data sh_eth_plat = { + .phy = PHY_ID, + .phy_interface = PHY_INTERFACE_MODE_MII, +}; + static struct resource sh_eth0_resources[] = { [0] = { .start = SH_ETH0_BASE, @@ -132,7 +138,7 @@ static struct platform_device sh_eth0_device = { .name = "sh771x-ether", .id = 0, .dev = { - .platform_data = PHY_ID, + .platform_data = &sh_eth_plat, }, .num_resources = ARRAY_SIZE(sh_eth0_resources), .resource = sh_eth0_resources, @@ -155,7 +161,7 @@ static struct platform_device sh_eth1_device = { .name = "sh771x-ether", .id = 1, .dev = { - .platform_data = PHY_ID, + .platform_data = &sh_eth_plat, }, .num_resources = ARRAY_SIZE(sh_eth1_resources), .resource = sh_eth1_resources, -- cgit v1.2.3-58-ga151 From f9a531d6731d74f1e24298d9641c2dc1fef2631b Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 6 Jan 2018 21:53:27 +0300 Subject: SolutionEngine771x: add Ether TSU resource After the Ether platform data is fixed, the driver probe() method would still fail since the 'struct sh_eth_cpu_data' corresponding to SH771x indicates the presence of TSU but the memory resource for it is absent. Add the missing TSU resource to both Ether devices and fix the harmless off-by-one error in the main memory resources, while at it... Fixes: 4986b996882d ("net: sh_eth: remove the SH_TSU_ADDR") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- arch/sh/boards/mach-se/770x/setup.c | 14 ++++++++++++-- arch/sh/include/mach-se/mach/se.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c index b7fa7a87e946..412326d59e6f 100644 --- a/arch/sh/boards/mach-se/770x/setup.c +++ b/arch/sh/boards/mach-se/770x/setup.c @@ -124,10 +124,15 @@ static struct sh_eth_plat_data sh_eth_plat = { static struct resource sh_eth0_resources[] = { [0] = { .start = SH_ETH0_BASE, - .end = SH_ETH0_BASE + 0x1B8, + .end = SH_ETH0_BASE + 0x1B8 - 1, .flags = IORESOURCE_MEM, }, [1] = { + .start = SH_TSU_BASE, + .end = SH_TSU_BASE + 0x200 - 1, + .flags = IORESOURCE_MEM, + }, + [2] = { .start = SH_ETH0_IRQ, .end = SH_ETH0_IRQ, .flags = IORESOURCE_IRQ, @@ -147,10 +152,15 @@ static struct platform_device sh_eth0_device = { static struct resource sh_eth1_resources[] = { [0] = { .start = SH_ETH1_BASE, - .end = SH_ETH1_BASE + 0x1B8, + .end = SH_ETH1_BASE + 0x1B8 - 1, .flags = IORESOURCE_MEM, }, [1] = { + .start = SH_TSU_BASE, + .end = SH_TSU_BASE + 0x200 - 1, + .flags = IORESOURCE_MEM, + }, + [2] = { .start = SH_ETH1_IRQ, .end = SH_ETH1_IRQ, .flags = IORESOURCE_IRQ, diff --git a/arch/sh/include/mach-se/mach/se.h b/arch/sh/include/mach-se/mach/se.h index 4246ef9b07a3..aa83fe1ff0b1 100644 --- a/arch/sh/include/mach-se/mach/se.h +++ b/arch/sh/include/mach-se/mach/se.h @@ -100,6 +100,7 @@ /* Base address */ #define SH_ETH0_BASE 0xA7000000 #define SH_ETH1_BASE 0xA7000400 +#define SH_TSU_BASE 0xA7000800 /* PHY ID */ #if defined(CONFIG_CPU_SUBTYPE_SH7710) # define PHY_ID 0x00 -- cgit v1.2.3-58-ga151 From 4512c43eac7e007d982e7ea45152ea6f3f4d1921 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 8 Jan 2018 10:34:00 -0800 Subject: ipv6: remove null_entry before adding default route In the current code, when creating a new fib6 table, tb6_root.leaf gets initialized to net->ipv6.ip6_null_entry. If a default route is being added with rt->rt6i_metric = 0xffffffff, fib6_add() will add this route after net->ipv6.ip6_null_entry. As null_entry is shared, it could cause problem. In order to fix it, set fn->leaf to NULL before calling fib6_add_rt2node() when trying to add the first default route. And reset fn->leaf to null_entry when adding fails or when deleting the last default route. syzkaller reported the following issue which is fixed by this commit: WARNING: suspicious RCU usage 4.15.0-rc5+ #171 Not tainted ----------------------------- net/ipv6/ip6_fib.c:1702 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 4 locks held by swapper/0/0: #0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] lockdep_copy_map include/linux/lockdep.h:178 [inline] #0: ((&net->ipv6.ip6_fib_timer)){+.-.}, at: [<00000000d43f631b>] call_timer_fn+0x1c6/0x820 kernel/time/timer.c:1310 #1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] spin_lock_bh include/linux/spinlock.h:315 [inline] #1: (&(&net->ipv6.fib6_gc_lock)->rlock){+.-.}, at: [<000000002ff9d65c>] fib6_run_gc+0x9d/0x3c0 net/ipv6/ip6_fib.c:2007 #2: (rcu_read_lock){....}, at: [<0000000091db762d>] __fib6_clean_all+0x0/0x3a0 net/ipv6/ip6_fib.c:1560 #3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] spin_lock_bh include/linux/spinlock.h:315 [inline] #3: (&(&tb->tb6_lock)->rlock){+.-.}, at: [<000000009e503581>] __fib6_clean_all+0x1d0/0x3a0 net/ipv6/ip6_fib.c:1948 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.15.0-rc5+ #171 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:53 lockdep_rcu_suspicious+0x123/0x170 kernel/locking/lockdep.c:4585 fib6_del+0xcaa/0x11b0 net/ipv6/ip6_fib.c:1701 fib6_clean_node+0x3aa/0x4f0 net/ipv6/ip6_fib.c:1892 fib6_walk_continue+0x46c/0x8a0 net/ipv6/ip6_fib.c:1815 fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1863 fib6_clean_tree+0x1e6/0x340 net/ipv6/ip6_fib.c:1933 __fib6_clean_all+0x1f4/0x3a0 net/ipv6/ip6_fib.c:1949 fib6_clean_all net/ipv6/ip6_fib.c:1960 [inline] fib6_run_gc+0x16b/0x3c0 net/ipv6/ip6_fib.c:2016 fib6_gc_timer_cb+0x20/0x30 net/ipv6/ip6_fib.c:2033 call_timer_fn+0x228/0x820 kernel/time/timer.c:1320 expire_timers kernel/time/timer.c:1357 [inline] __run_timers+0x7ee/0xb70 kernel/time/timer.c:1660 run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686 __do_softirq+0x2d7/0xb85 kernel/softirq.c:285 invoke_softirq kernel/softirq.c:365 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:540 [inline] smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:904 Reported-by: syzbot Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Signed-off-by: Wei Wang Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d11a5578e4f8..9dcc3924a975 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -640,6 +640,11 @@ static struct fib6_node *fib6_add_1(struct net *net, if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); rt6_release(leaf); + /* remove null_entry in the root node */ + } else if (fn->fn_flags & RTN_TL_ROOT && + rcu_access_pointer(fn->leaf) == + net->ipv6.ip6_null_entry) { + RCU_INIT_POINTER(fn->leaf, NULL); } return fn; @@ -1270,13 +1275,17 @@ out: return err; failure: - /* fn->leaf could be NULL if fn is an intermediate node and we - * failed to add the new route to it in both subtree creation - * failure and fib6_add_rt2node() failure case. - * In both cases, fib6_repair_tree() should be called to fix - * fn->leaf. + /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: + * 1. fn is an intermediate node and we failed to add the new + * route to it in both subtree creation failure and fib6_add_rt2node() + * failure case. + * 2. fn is the root node in the table and we fail to add the first + * default route to it. */ - if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + if (fn && + (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || + (fn->fn_flags & RTN_TL_ROOT && + !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function @@ -1531,6 +1540,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_walker *w; int iter = 0; + /* Set fn->leaf to null_entry for root node. */ + if (fn->fn_flags & RTN_TL_ROOT) { + rcu_assign_pointer(fn->leaf, net->ipv6.ip6_null_entry); + return fn; + } + for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); @@ -1685,10 +1700,15 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, } read_unlock(&net->ipv6.fib6_walker_lock); - /* If it was last route, expunge its radix tree node */ + /* If it was last route, call fib6_repair_tree() to: + * 1. For root node, put back null_entry as how the table was created. + * 2. For other nodes, expunge its radix tree node. + */ if (!rcu_access_pointer(fn->leaf)) { - fn->fn_flags &= ~RTN_RTINFO; - net->ipv6.rt6_stats->fib_route_nodes--; + if (!(fn->fn_flags & RTN_TL_ROOT)) { + fn->fn_flags &= ~RTN_RTINFO; + net->ipv6.rt6_stats->fib_route_nodes--; + } fn = fib6_repair_tree(net, table, fn); } -- cgit v1.2.3-58-ga151 From be95a845cc4402272994ce290e3ad928aff06cb9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Jan 2018 13:17:44 +0100 Subject: bpf: avoid false sharing of map refcount with max_entries In addition to commit b2157399cc98 ("bpf: prevent out-of-bounds speculation") also change the layout of struct bpf_map such that false sharing of fast-path members like max_entries is avoided when the maps reference counter is altered. Therefore enforce them to be placed into separate cachelines. pahole dump after change: struct bpf_map { const struct bpf_map_ops * ops; /* 0 8 */ struct bpf_map * inner_map_meta; /* 8 8 */ void * security; /* 16 8 */ enum bpf_map_type map_type; /* 24 4 */ u32 key_size; /* 28 4 */ u32 value_size; /* 32 4 */ u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ u32 pages; /* 44 4 */ u32 id; /* 48 4 */ int numa_node; /* 52 4 */ bool unpriv_array; /* 56 1 */ /* XXX 7 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ struct user_struct * user; /* 64 8 */ atomic_t refcnt; /* 72 4 */ atomic_t usercnt; /* 76 4 */ struct work_struct work; /* 80 32 */ char name[16]; /* 112 16 */ /* --- cacheline 2 boundary (128 bytes) --- */ /* size: 128, cachelines: 2, members: 17 */ /* sum members: 121, holes: 1, sum holes: 7 */ }; Now all entries in the first cacheline are read only throughout the life time of the map, set up once during map creation. Overall struct size and number of cachelines doesn't change from the reordering. struct bpf_map is usually first member and embedded in map structs in specific map implementations, so also avoid those members to sit at the end where it could potentially share the cacheline with first map values e.g. in the array since remote CPUs could trigger map updates just as well for those (easily dirtying members like max_entries intentionally as well) while having subsequent values in cache. Quoting from Google's Project Zero blog [1]: Additionally, at least on the Intel machine on which this was tested, bouncing modified cache lines between cores is slow, apparently because the MESI protocol is used for cache coherence [8]. Changing the reference counter of an eBPF array on one physical CPU core causes the cache line containing the reference counter to be bounced over to that CPU core, making reads of the reference counter on all other CPU cores slow until the changed reference counter has been written back to memory. Because the length and the reference counter of an eBPF array are stored in the same cache line, this also means that changing the reference counter on one physical CPU core causes reads of the eBPF array's length to be slow on other physical CPU cores (intentional false sharing). While this doesn't 'control' the out-of-bounds speculation through masking the index as in commit b2157399cc98, triggering a manipulation of the map's reference counter is really trivial, so lets not allow to easily affect max_entries from it. Splitting to separate cachelines also generally makes sense from a performance perspective anyway in that fast-path won't have a cache miss if the map gets pinned, reused in other progs, etc out of control path, thus also avoids unintentional false sharing. [1] https://googleprojectzero.blogspot.ch/2018/01/reading-privileged-memory-with-side.html Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1b985ca4ffbe..fe2cb7c398e3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -43,7 +43,14 @@ struct bpf_map_ops { }; struct bpf_map { - atomic_t refcnt; + /* 1st cacheline with read-mostly members of which some + * are also accessed in fast-path (e.g. ops, max_entries). + */ + const struct bpf_map_ops *ops ____cacheline_aligned; + struct bpf_map *inner_map_meta; +#ifdef CONFIG_SECURITY + void *security; +#endif enum bpf_map_type map_type; u32 key_size; u32 value_size; @@ -53,15 +60,16 @@ struct bpf_map { u32 id; int numa_node; bool unpriv_array; - struct user_struct *user; - const struct bpf_map_ops *ops; - struct work_struct work; + /* 7 bytes hole */ + + /* 2nd cacheline with misc members to avoid false sharing + * particularly with refcounting. + */ + struct user_struct *user ____cacheline_aligned; + atomic_t refcnt; atomic_t usercnt; - struct bpf_map *inner_map_meta; + struct work_struct work; char name[BPF_OBJ_NAME_LEN]; -#ifdef CONFIG_SECURITY - void *security; -#endif }; /* function argument constraints */ -- cgit v1.2.3-58-ga151 From 290af86629b25ffd1ed6232c4e9107da031705cb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Jan 2018 10:04:29 -0800 Subject: bpf: introduce BPF_JIT_ALWAYS_ON config The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from goolge project zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load. But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make attacker job harder introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v2->v3: - move __bpf_prog_ret0 under ifdef (Daniel) v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- init/Kconfig | 7 +++++++ kernel/bpf/core.c | 19 +++++++++++++++++++ lib/test_bpf.c | 11 +++++++---- net/core/filter.c | 6 ++---- net/core/sysctl_net_core.c | 6 ++++++ net/socket.c | 9 +++++++++ 6 files changed, 50 insertions(+), 8 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 2934249fba46..5e2a4a391ba9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1392,6 +1392,13 @@ config BPF_SYSCALL Enable the bpf() system call that allows to manipulate eBPF programs and maps via file descriptors. +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid + speculative execution of BPF instructions by the interpreter + config USERFAULTFD bool "Enable userfaultfd() system call" select ANON_INODES diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..51ec2dda7f08 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -767,6 +767,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -1317,6 +1318,14 @@ EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#else +static unsigned int __bpf_prog_ret0(const void *ctx, + const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -1364,9 +1373,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1376,6 +1389,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9e9748089270..f369889e521d 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -6250,9 +6250,8 @@ static struct bpf_prog *generate_filter(int which, int *err) return NULL; } } - /* We don't expect to fail. */ if (*err) { - pr_cont("FAIL to attach err=%d len=%d\n", + pr_cont("FAIL to prog_create err=%d len=%d\n", *err, fprog.len); return NULL; } @@ -6276,6 +6275,10 @@ static struct bpf_prog *generate_filter(int which, int *err) * checks. */ fp = bpf_prog_select_runtime(fp, err); + if (*err) { + pr_cont("FAIL to select_runtime err=%d\n", *err); + return NULL; + } break; } @@ -6461,8 +6464,8 @@ static __init int test_bpf(void) pass_cnt++; continue; } - - return err; + err_cnt++; + continue; } pr_cont("jited:%u ", fp->jited); diff --git a/net/core/filter.c b/net/core/filter.c index 6a85e67fafce..d339ef170df6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1054,11 +1054,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - /* We are guaranteed to never error here with cBPF to eBPF - * transitions, since there's no issue with type compatibility - * checks on program arrays. - */ fp = bpf_prog_select_runtime(fp, &err); + if (err) + goto out_err_free; kfree(old_prog); return fp; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cbc3dde4cfcc..a47ad6cd41c0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -325,7 +325,13 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, +#ifndef CONFIG_BPF_JIT_ALWAYS_ON .proc_handler = proc_dointvec +#else + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, +#endif }, # ifdef CONFIG_HAVE_EBPF_JIT { diff --git a/net/socket.c b/net/socket.c index 05f361faec45..78acd6ce74c7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2619,6 +2619,15 @@ out_fs: core_initcall(sock_init); /* early initcall */ +static int __init jit_init(void) +{ +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + bpf_jit_enable = 1; +#endif + return 0; +} +pure_initcall(jit_init); + #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { -- cgit v1.2.3-58-ga151 From 1e77fc82110ac36febf46c1e2782f504f7d23099 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 9 Jan 2018 19:08:21 +0100 Subject: gpio: Add missing open drain/source handling to gpiod_set_value_cansleep() Since commit f11a04464ae57e8d ("i2c: gpio: Enable working over slow can_sleep GPIOs"), probing the i2c RTC connected to an i2c-gpio bus on r8a7740/armadillo fails with: rtc-s35390a 0-0030: error resetting chip rtc-s35390a: probe of 0-0030 failed with error -5 More debug code reveals: i2c i2c-0: master_xfer[0] R, addr=0x30, len=1 i2c i2c-0: NAK from device addr 0x30 msg #0 s35390a_get_reg: ret = -6 Commit 02e479808b5d62f8 ("gpio: Alter semantics of *raw* operations to actually be raw") moved open drain/source handling from gpiod_set_raw_value_commit() to gpiod_set_value(), but forgot to take into account that gpiod_set_value_cansleep() also needs this handling. The i2c protocol mandates that i2c signals are open drain, hence i2c communication fails. Fix this by adding the missing handling to gpiod_set_value_cansleep(), using a new common helper gpiod_set_value_nocheck(). Fixes: 02e479808b5d62f8 ("gpio: Alter semantics of *raw* operations to actually be raw") Signed-off-by: Geert Uytterhoeven [removed underscore syntax, added kerneldoc] Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 44332b793718..14532d9576e4 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2892,6 +2892,27 @@ void gpiod_set_raw_value(struct gpio_desc *desc, int value) } EXPORT_SYMBOL_GPL(gpiod_set_raw_value); +/** + * gpiod_set_value_nocheck() - set a GPIO line value without checking + * @desc: the descriptor to set the value on + * @value: value to set + * + * This sets the value of a GPIO line backing a descriptor, applying + * different semantic quirks like active low and open drain/source + * handling. + */ +static void gpiod_set_value_nocheck(struct gpio_desc *desc, int value) +{ + if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) + value = !value; + if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) + gpio_set_open_drain_value_commit(desc, value); + else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) + gpio_set_open_source_value_commit(desc, value); + else + gpiod_set_raw_value_commit(desc, value); +} + /** * gpiod_set_value() - assign a gpio's value * @desc: gpio whose value will be assigned @@ -2906,16 +2927,8 @@ EXPORT_SYMBOL_GPL(gpiod_set_raw_value); void gpiod_set_value(struct gpio_desc *desc, int value) { VALIDATE_DESC_VOID(desc); - /* Should be using gpiod_set_value_cansleep() */ WARN_ON(desc->gdev->chip->can_sleep); - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) - value = !value; - if (test_bit(FLAG_OPEN_DRAIN, &desc->flags)) - gpio_set_open_drain_value_commit(desc, value); - else if (test_bit(FLAG_OPEN_SOURCE, &desc->flags)) - gpio_set_open_source_value_commit(desc, value); - else - gpiod_set_raw_value_commit(desc, value); + gpiod_set_value_nocheck(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_value); @@ -3243,9 +3256,7 @@ void gpiod_set_value_cansleep(struct gpio_desc *desc, int value) { might_sleep_if(extra_checks); VALIDATE_DESC_VOID(desc); - if (test_bit(FLAG_ACTIVE_LOW, &desc->flags)) - value = !value; - gpiod_set_raw_value_commit(desc, value); + gpiod_set_value_nocheck(desc, value); } EXPORT_SYMBOL_GPL(gpiod_set_value_cansleep); -- cgit v1.2.3-58-ga151 From 2e83acb970684008baee471427270c029a76ddbd Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 8 Jan 2018 19:02:27 -0200 Subject: sctp: GFP_ATOMIC is not needed in sctp_setsockopt_events So replace it with GFP_USER and also add __GFP_NOWARN. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b4fb6e4886d2..54c046783a89 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2277,7 +2277,7 @@ static int sctp_setsockopt_events(struct sock *sk, char __user *optval, if (asoc && sctp_outq_is_empty(&asoc->outqueue)) { event = sctp_ulpevent_make_sender_dry_event(asoc, - GFP_ATOMIC); + GFP_USER | __GFP_NOWARN); if (!event) return -ENOMEM; -- cgit v1.2.3-58-ga151 From 5960cefab9df76600a1a7d4ff592c59e14616e88 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 8 Jan 2018 19:02:28 -0200 Subject: sctp: add a ceiling to optlen in some sockopts Hangbin Liu reported that some sockopt calls could cause the kernel to log a warning on memory allocation failure if the user supplied a large optlen value. That is because some of them called memdup_user() without a ceiling on optlen, allowing it to try to allocate really large buffers. This patch adds a ceiling by limiting optlen to the maximum allowed that would still make sense for these sockopt. Reported-by: Hangbin Liu Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 54c046783a89..022b94f11fd8 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3498,6 +3498,8 @@ static int sctp_setsockopt_hmac_ident(struct sock *sk, if (optlen < sizeof(struct sctp_hmacalgo)) return -EINVAL; + optlen = min_t(unsigned int, optlen, sizeof(struct sctp_hmacalgo) + + SCTP_AUTH_NUM_HMACS * sizeof(u16)); hmacs = memdup_user(optval, optlen); if (IS_ERR(hmacs)) @@ -3536,6 +3538,11 @@ static int sctp_setsockopt_auth_key(struct sock *sk, if (optlen <= sizeof(struct sctp_authkey)) return -EINVAL; + /* authkey->sca_keylength is u16, so optlen can't be bigger than + * this. + */ + optlen = min_t(unsigned int, optlen, USHRT_MAX + + sizeof(struct sctp_authkey)); authkey = memdup_user(optval, optlen); if (IS_ERR(authkey)) @@ -3893,6 +3900,9 @@ static int sctp_setsockopt_reset_streams(struct sock *sk, if (optlen < sizeof(*params)) return -EINVAL; + /* srs_number_streams is u16, so optlen can't be bigger than this. */ + optlen = min_t(unsigned int, optlen, USHRT_MAX + + sizeof(__u16) * sizeof(*params)); params = memdup_user(optval, optlen); if (IS_ERR(params)) -- cgit v1.2.3-58-ga151 From c76f97c99ae6d26d14c7f0e50e074382bfbc9f98 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Mon, 8 Jan 2018 19:02:29 -0200 Subject: sctp: make use of pre-calculated len Some sockopt handling functions were calculating the length of the buffer to be written to userspace and then calculating it again when actually writing the buffer, which could lead to some write not using an up-to-date length. This patch updates such places to just make use of the len variable. Also, replace some sizeof(type) to sizeof(var). Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/socket.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 022b94f11fd8..9b01e994f661 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5025,7 +5025,7 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv len = sizeof(int); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, &sctp_sk(sk)->autoclose, sizeof(int))) + if (copy_to_user(optval, &sctp_sk(sk)->autoclose, len)) return -EFAULT; return 0; } @@ -5655,6 +5655,9 @@ copy_getaddrs: err = -EFAULT; goto out; } + /* XXX: We should have accounted for sizeof(struct sctp_getaddrs) too, + * but we can't change it anymore. + */ if (put_user(bytes_copied, optlen)) err = -EFAULT; out: @@ -6091,7 +6094,7 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len, params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); - if (copy_from_user(¶ms, optval, sizeof(params))) + if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else return -EINVAL; @@ -6261,7 +6264,9 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, if (len < sizeof(struct sctp_authkeyid)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authkeyid))) + + len = sizeof(struct sctp_authkeyid); + if (copy_from_user(&val, optval, len)) return -EFAULT; asoc = sctp_id2assoc(sk, val.scact_assoc_id); @@ -6273,7 +6278,6 @@ static int sctp_getsockopt_active_key(struct sock *sk, int len, else val.scact_keynumber = ep->active_key_id; - len = sizeof(struct sctp_authkeyid); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) @@ -6299,7 +6303,7 @@ static int sctp_getsockopt_peer_auth_chunks(struct sock *sk, int len, if (len < sizeof(struct sctp_authchunks)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; @@ -6344,7 +6348,7 @@ static int sctp_getsockopt_local_auth_chunks(struct sock *sk, int len, if (len < sizeof(struct sctp_authchunks)) return -EINVAL; - if (copy_from_user(&val, optval, sizeof(struct sctp_authchunks))) + if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; to = p->gauth_chunks; -- cgit v1.2.3-58-ga151 From 11d827a993a969c3c6ec56758ff63a44ba19b466 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Tue, 9 Jan 2018 11:02:33 +0800 Subject: net: gianfar_ptp: move set_fipers() to spinlock protecting area set_fipers() calling should be protected by spinlock in case that any interrupt breaks related registers setting and the function we expect. This patch is to move set_fipers() to spinlock protecting area in ptp_gianfar_adjtime(). Signed-off-by: Yangbo Lu Acked-by: Richard Cochran Reviewed-by: Fabio Estevam Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar_ptp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c b/drivers/net/ethernet/freescale/gianfar_ptp.c index 544114281ea7..9f8d4f8e57e3 100644 --- a/drivers/net/ethernet/freescale/gianfar_ptp.c +++ b/drivers/net/ethernet/freescale/gianfar_ptp.c @@ -319,11 +319,10 @@ static int ptp_gianfar_adjtime(struct ptp_clock_info *ptp, s64 delta) now = tmr_cnt_read(etsects); now += delta; tmr_cnt_write(etsects, now); + set_fipers(etsects); spin_unlock_irqrestore(&etsects->lock, flags); - set_fipers(etsects); - return 0; } -- cgit v1.2.3-58-ga151 From af60d61fa846725566f4a876ae04f891bdff1c7a Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Tue, 9 Jan 2018 09:52:22 +0100 Subject: doc: clarification about setting SO_ZEROCOPY Signed-off-by: Kornilios Kourtis Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- Documentation/networking/msg_zerocopy.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst index 77f6d7e25cfd..291a01264967 100644 --- a/Documentation/networking/msg_zerocopy.rst +++ b/Documentation/networking/msg_zerocopy.rst @@ -72,6 +72,10 @@ this flag, a process must first signal intent by setting a socket option: if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one))) error(1, errno, "setsockopt zerocopy"); +Setting the socket option only works when the socket is in its initial +(TCP_CLOSED) state. Trying to set the option for a socket returned by accept(), +for example, will lead to an EBUSY error. In this case, the option should be set +to the listening socket and it will be inherited by the accepted sockets. Transmission ------------ -- cgit v1.2.3-58-ga151 From b0d55b5bc77755501be9de2c935d106ff8dba9ac Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 9 Jan 2018 19:58:18 +0800 Subject: caif_usb: use strlcpy() instead of strncpy() gcc-8 reports net/caif/caif_usb.c: In function 'cfusbl_device_notify': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' output may be truncated copying 15 bytes from a string of length 15 [-Wstringop-truncation] The compiler require that the input param 'len' of strncpy() should be greater than the length of the src string, so that '\0' is copied as well. We can just use strlcpy() to avoid this warning. Signed-off-by: Xiongfeng Wang Signed-off-by: David S. Miller --- net/caif/caif_usb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index 5cd44f001f64..1a082a946045 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -176,9 +176,7 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what, dev_add_pack(&caif_usb_type); pack_added = true; - strncpy(layer->name, dev->name, - sizeof(layer->name) - 1); - layer->name[sizeof(layer->name) - 1] = 0; + strlcpy(layer->name, dev->name, sizeof(layer->name)); return 0; } -- cgit v1.2.3-58-ga151 From 95f566de0269a0c59fd6a737a147731302136429 Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Tue, 9 Jan 2018 14:43:34 +0200 Subject: of_mdio: avoid MDIO bus removal when a PHY is missing If one of the child devices is missing the of_mdiobus_register_phy() call will return -ENODEV. When a missing device is encountered the registration of the remaining PHYs is stopped and the MDIO bus will fail to register. Propagate all errors except ENODEV to avoid it. Signed-off-by: Madalin Bucur Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index 3481e69738b5..a327be1d264b 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -231,7 +231,12 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) rc = of_mdiobus_register_phy(mdio, child, addr); else rc = of_mdiobus_register_device(mdio, child, addr); - if (rc) + + if (rc == -ENODEV) + dev_err(&mdio->dev, + "MDIO device at address %d is missing.\n", + addr); + else if (rc) goto unregister; } @@ -255,7 +260,7 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) if (of_mdiobus_child_is_phy(child)) { rc = of_mdiobus_register_phy(mdio, child, addr); - if (rc) + if (rc && rc != -ENODEV) goto unregister; } } -- cgit v1.2.3-58-ga151 From 78bbb15f2239bc8e663aa20bbe1987c91a0b75f6 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 9 Jan 2018 13:40:41 -0800 Subject: 8021q: fix a memory leak for VLAN 0 device A vlan device with vid 0 is allow to creat by not able to be fully cleaned up by unregister_vlan_dev() which checks for vlan_id!=0. Also, VLAN 0 is probably not a valid number and it is kinda "reserved" for HW accelerating devices, but it is probably too late to reject it from creation even if makes sense. Instead, just remove the check in unregister_vlan_dev(). Reported-by: Dmitry Vyukov Fixes: ad1afb003939 ("vlan_dev: VLAN 0 should be treated as "no vlan tag" (802.1p packet)") Cc: Vlad Yasevich Cc: Ben Hutchings Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/8021q/vlan.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8dfdd94e430f..bad01b14a4ad 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -111,12 +111,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) vlan_gvrp_uninit_applicant(real_dev); } - /* Take it out of our own structures, but be sure to interlock with - * HW accelerating devices or SW vlan input packet processing if - * VLAN is not 0 (leave it there for 802.1p). - */ - if (vlan_id) - vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); + vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id); /* Get rid of the vlan's reference to real_dev */ dev_put(real_dev); -- cgit v1.2.3-58-ga151 From fc2336505fb49a8b932a0a67a9745c408b79992c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 9 Jan 2018 18:14:28 -0800 Subject: nfp: always unmask aux interrupts at init The link state and exception interrupts may be masked when we probe. The firmware should in theory prevent sending (and automasking) those interrupts if the device is disabled, but if my reading of the FW code is correct there are firmwares out there with race conditions in this area. The interrupt may also be masked if previous driver which used the device was malfunctioning and we didn't load the FW (there is no other good way to comprehensively reset the PF). Note that FW unmasks the data interrupts by itself when vNIC is enabled, such helpful operation is not performed for LSC/EXN interrupts. Always unmask the auxiliary interrupts after request_irq(). On the remove path add missing PCI write flush before free_irq(). Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 1a603fdd9e80..99b0487b6d82 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -568,6 +568,7 @@ nfp_net_aux_irq_request(struct nfp_net *nn, u32 ctrl_offset, return err; } nn_writeb(nn, ctrl_offset, entry->entry); + nfp_net_irq_unmask(nn, entry->entry); return 0; } @@ -582,6 +583,7 @@ static void nfp_net_aux_irq_free(struct nfp_net *nn, u32 ctrl_offset, unsigned int vector_idx) { nn_writeb(nn, ctrl_offset, 0xff); + nn_pci_flush(nn); free_irq(nn->irq_entries[vector_idx].vector, nn); } -- cgit v1.2.3-58-ga151 From 8e033a93b37f37aa9fca71a370a895155320af60 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 10 Jan 2018 11:42:43 +0100 Subject: mlxsw: pci: Wait after reset before accessing HW After performing reset driver polls on HW indication until learning that the reset is done, but immediately after reset the device becomes unresponsive which might lead to completion timeout on the first read. Wait for 100ms before starting the polling. Fixes: 233fa44bd67a ("mlxsw: pci: Implement reset done check") Signed-off-by: Yuval Mintz Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 7 ++++++- drivers/net/ethernet/mellanox/mlxsw/pci_hw.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 23f7d828cf67..6ef20e5cc77d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1643,7 +1643,12 @@ static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci, return 0; } - wmb(); /* reset needs to be written before we read control register */ + /* Reset needs to be written before we read control register, and + * we must wait for the HW to become responsive once again + */ + wmb(); + msleep(MLXSW_PCI_SW_RESET_WAIT_MSECS); + end = jiffies + msecs_to_jiffies(MLXSW_PCI_SW_RESET_TIMEOUT_MSECS); do { u32 val = mlxsw_pci_read32(mlxsw_pci, FW_READY); diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h index a6441208e9d9..fb082ad21b00 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h +++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h @@ -59,6 +59,7 @@ #define MLXSW_PCI_SW_RESET 0xF0010 #define MLXSW_PCI_SW_RESET_RST_BIT BIT(0) #define MLXSW_PCI_SW_RESET_TIMEOUT_MSECS 5000 +#define MLXSW_PCI_SW_RESET_WAIT_MSECS 100 #define MLXSW_PCI_FW_READY 0xA1844 #define MLXSW_PCI_FW_READY_MASK 0xFFFF #define MLXSW_PCI_FW_READY_MAGIC 0x5E -- cgit v1.2.3-58-ga151 From db84924c4fc3be1ef0c965d5ece5f6d785c77c5f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 10 Jan 2018 11:42:44 +0100 Subject: mlxsw: spectrum_qdisc: Don't use variable array in mlxsw_sp_tclass_congestion_enable Resolve the sparse warning: "sparse: Variable length array is used." Use 2 arrays for 2 PRM register accesses. Fixes: 96f17e0776c2 ("mlxsw: spectrum: Support RED qdisc offload") Signed-off-by: Jiri Pirko Reviewed-by: Yuval Mintz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c index c33beac5def0..b5397da94d7f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_qdisc.c @@ -46,7 +46,8 @@ mlxsw_sp_tclass_congestion_enable(struct mlxsw_sp_port *mlxsw_sp_port, int tclass_num, u32 min, u32 max, u32 probability, bool is_ecn) { - char cwtp_cmd[max_t(u8, MLXSW_REG_CWTP_LEN, MLXSW_REG_CWTPM_LEN)]; + char cwtpm_cmd[MLXSW_REG_CWTPM_LEN]; + char cwtp_cmd[MLXSW_REG_CWTP_LEN]; struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; int err; @@ -60,10 +61,10 @@ mlxsw_sp_tclass_congestion_enable(struct mlxsw_sp_port *mlxsw_sp_port, if (err) return err; - mlxsw_reg_cwtpm_pack(cwtp_cmd, mlxsw_sp_port->local_port, tclass_num, + mlxsw_reg_cwtpm_pack(cwtpm_cmd, mlxsw_sp_port->local_port, tclass_num, MLXSW_REG_CWTP_DEFAULT_PROFILE, true, is_ecn); - return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtp_cmd); + return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(cwtpm), cwtpm_cmd); } static int -- cgit v1.2.3-58-ga151 From 862c03ee1deb7e19e0f9931682e0294ecd1fcaf9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 10 Jan 2018 03:45:49 -0800 Subject: ipv6: fix possible mem leaks in ipv6_make_skb() ip6_setup_cork() might return an error, while memory allocations have been done and must be rolled back. Fixes: 6422398c2ab0 ("ipv6: introduce ipv6_make_skb") Signed-off-by: Eric Dumazet Cc: Vlad Yasevich Reported-by: Mike Maloney Acked-by: Mike Maloney Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f7dd51c42314..688ba5f7516b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1735,9 +1735,10 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.opt = NULL; v6_cork.opt = NULL; err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); - if (err) + if (err) { + ip6_cork_release(&cork, &v6_cork); return ERR_PTR(err); - + } if (ipc6->dontfrag < 0) ipc6->dontfrag = inet6_sk(sk)->dontfrag; -- cgit v1.2.3-58-ga151 From ccc12b11c5332c84442ef120dcd631523be75089 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Wed, 10 Jan 2018 13:35:49 +0000 Subject: ipv6: sr: fix TLVs not being copied using setsockopt Function ipv6_push_rthdr4 allows to add an IPv6 Segment Routing Header to a socket through setsockopt, but the current implementation doesn't copy possible TLVs at the end of the SRH received from userspace. Therefore, the execution of the following branch if (sr_has_hmac(sr_phdr)) { ... } will never complete since the len and type fields of a possible HMAC TLV are not copied, hence seg6_get_tlv_hmac will return an error, and the HMAC will not be computed. This commit adds a memcpy in case TLVs have been appended to the SRH. Fixes: a149e7c7ce81 ("ipv6: sr: add support for SRH injection through setsockopt") Acked-by: David Lebrun Signed-off-by: Mathieu Xhonneux Signed-off-by: David S. Miller --- net/ipv6/exthdrs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 83bd75713535..bc68eb661970 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -925,6 +925,15 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; + if (sr_ihdr->hdrlen > hops * 2) { + int tlvs_offset, tlvs_length; + + tlvs_offset = (1 + hops * 2) << 3; + tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3; + memcpy((char *)sr_phdr + tlvs_offset, + (char *)sr_ihdr + tlvs_offset, tlvs_length); + } + #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; -- cgit v1.2.3-58-ga151 From ce4bb04cae8924792ed92f4af2793b77fc986f0e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Jan 2018 18:47:05 -0500 Subject: Fix a leak in socket(2) when we fail to allocate a file descriptor. Got broken by "make sock_alloc_file() do sock_release() on failures" - cleanup after sock_map_fd() failure got pulled all the way into sock_alloc_file(), but it used to serve the case when sock_map_fd() failed *before* getting to sock_alloc_file() as well, and that got lost. Trivial to fix, fortunately. Fixes: 8e1611e23579 (make sock_alloc_file() do sock_release() on failures) Reported-by: Dmitry Vyukov Signed-off-by: Al Viro --- net/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/socket.c b/net/socket.c index 42d8e9c9ccd5..82433a2200ec 100644 --- a/net/socket.c +++ b/net/socket.c @@ -432,8 +432,10 @@ static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); - if (unlikely(fd < 0)) + if (unlikely(fd < 0)) { + sock_release(sock); return fd; + } newfile = sock_alloc_file(sock, flags, NULL); if (likely(!IS_ERR(newfile))) { -- cgit v1.2.3-58-ga151